author    | Yi Kong <yikong@google.com> | 2022-02-25 17:02:53 +0000
committer | Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com> | 2022-02-25 17:02:53 +0000
commit    | edb0ad5bb04b48aab7dd0978f0475edd3550de7c (patch)
tree      | fb979fb4cf4f8052c8cc66b1ec9516d91fcd859b /unsupported/test
parent    | 8fd413e275f78a4c240f1442ce5cf77c73a20a55 (diff)
parent    | bc0f5df265caa21a2120c22453655a7fcc941991 (diff)
Merge changes Iee153445,Iee274471 am: 79df15ea88 am: 10f298fc41 am: 7cb5001398 am: bc0f5df265
Original change: https://android-review.googlesource.com/c/platform/external/eigen/+/1999079
Change-Id: Ife39d10c8b23d3eeb174cd52f462f9d20527ad03
Diffstat (limited to 'unsupported/test')
131 files changed, 15434 insertions, 4773 deletions
diff --git a/unsupported/test/BVH.cpp b/unsupported/test/BVH.cpp
index ff5b3299d..d8c39d556 100644
--- a/unsupported/test/BVH.cpp
+++ b/unsupported/test/BVH.cpp
@@ -192,7 +192,7 @@ struct TreeTest
 };
 
-void test_BVH()
+EIGEN_DECLARE_TEST(BVH)
 {
   for(int i = 0; i < g_repeat; i++) {
 #ifdef EIGEN_TEST_PART_1
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index b5fa1c845..d30fa62bd 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -1,16 +1,7 @@
-# generate split test header file only if it does not yet exist
-# in order to prevent a rebuild everytime cmake is configured
-if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
-  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
-  foreach(i RANGE 1 999)
-    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
-      "#ifdef EIGEN_TEST_PART_${i}\n"
-      "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
-      "#else\n"
-      "#define CALL_SUBTEST_${i}(FUNC)\n"
-      "#endif\n\n"
-    )
-  endforeach()
+# The file split_test_helper.h was generated at first run,
+# it is now included in test/
+if(EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+  file(REMOVE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
 endif()
 
 set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported")
@@ -22,22 +13,27 @@ include_directories(../../test ../../unsupported ../../Eigen
 find_package (Threads)
 
 find_package(GoogleHash)
-if(GOOGLEHASH_FOUND)
+if(GoogleHash_FOUND)
   add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
   include_directories(${GOOGLEHASH_INCLUDES})
   ei_add_property(EIGEN_TESTED_BACKENDS "GoogleHash, ")
-else(GOOGLEHASH_FOUND)
+else()
   ei_add_property(EIGEN_MISSING_BACKENDS "GoogleHash, ")
-endif(GOOGLEHASH_FOUND)
+endif()
+
 find_package(Adolc)
-if(ADOLC_FOUND)
+if(Adolc_FOUND)
   include_directories(${ADOLC_INCLUDES})
   ei_add_property(EIGEN_TESTED_BACKENDS "Adolc, ")
-  ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
-else(ADOLC_FOUND)
+  if(EIGEN_TEST_CXX11)
+    ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
+  else()
+    message(STATUS "Adolc found, but tests require C++11 mode")
+  endif()
+else()
   ei_add_property(EIGEN_MISSING_BACKENDS "Adolc, ")
-endif(ADOLC_FOUND)
+endif()
 
 # this test seems to never have been successful on x87, so is considered to contain a FP-related bug.
 # see thread: "non-linear optimization test summary"
@@ -47,9 +43,7 @@ ei_add_test(NumericalDiff)
 ei_add_test(autodiff_scalar)
 ei_add_test(autodiff)
 
-if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$")
 ei_add_test(BVH)
-endif()
 
 ei_add_test(matrix_exponential)
 ei_add_test(matrix_function)
@@ -61,13 +55,11 @@ ei_add_test(FFT)
 
 ei_add_test(EulerAngles)
 
-find_package(MPFR 2.3.0)
-find_package(GMP)
-if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CXX11)
-  include_directories(${MPFR_INCLUDES} ./mpreal)
+find_package(MPREAL)
+if(MPREAL_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11)
   ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
-  set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
-  ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" )
+  include_directories(${MPREAL_INCLUDES})
+  ei_add_test(mpreal_support "-std=c++11" "${MPREAL_LIBRARIES}" )
 else()
   ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
 endif()
@@ -87,8 +79,8 @@ else()
   ei_add_property(EIGEN_MISSING_BACKENDS "fftw, ")
 endif()
 
-option(EIGEN_TEST_NO_OPENGL "Disable OpenGL support in unit tests" OFF)
-if(NOT EIGEN_TEST_NO_OPENGL)
+option(EIGEN_TEST_OPENGL "Enable OpenGL support in unit tests" OFF)
+if(EIGEN_TEST_OPENGL)
   find_package(OpenGL)
   find_package(GLUT)
   find_package(GLEW)
@@ -108,89 +100,192 @@ ei_add_test(polynomialsolver)
 ei_add_test(polynomialutils)
 ei_add_test(splines)
 ei_add_test(gmres)
+ei_add_test(dgmres)
 ei_add_test(minres)
+ei_add_test(idrs)
 ei_add_test(levenberg_marquardt)
 ei_add_test(kronecker_product)
+ei_add_test(bessel_functions)
 ei_add_test(special_functions)
-
-# TODO: The following test names are prefixed with the cxx11 string, since historically
-# the tests depended on c++11. This isn't the case anymore so we ought to rename them.
-# FIXME: Old versions of MSVC fail to compile this code, so we just disable these tests
-# when using visual studio. We should make the check more strict to enable the tests for
-# newer versions of MSVC.
-if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
-ei_add_test(cxx11_tensor_dimension)
-ei_add_test(cxx11_tensor_map)
-ei_add_test(cxx11_tensor_assign)
-ei_add_test(cxx11_tensor_comparisons)
-ei_add_test(cxx11_tensor_forced_eval)
-ei_add_test(cxx11_tensor_math)
-ei_add_test(cxx11_tensor_const)
-ei_add_test(cxx11_tensor_intdiv)
-ei_add_test(cxx11_tensor_casts)
-ei_add_test(cxx11_tensor_empty)
-ei_add_test(cxx11_tensor_sugar)
-ei_add_test(cxx11_tensor_roundings)
-ei_add_test(cxx11_tensor_layout_swap)
-ei_add_test(cxx11_tensor_io)
-if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
-  # This test requires __uint128_t which is only available on 64bit systems
-  ei_add_test(cxx11_tensor_uint128)
-endif()
-endif()
+ei_add_test(special_packetmath "-DEIGEN_FAST_MATH=1")
 
 if(EIGEN_TEST_CXX11)
   if(EIGEN_TEST_SYCL)
-    ei_add_test_sycl(cxx11_tensor_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_forced_eval_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11")
-    ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11")
-  endif(EIGEN_TEST_SYCL)
-  # It should be safe to always run these tests as there is some fallback code for
-  # older compiler that don't support cxx11.
-  set(CMAKE_CXX_STANDARD 11)
+    set(EIGEN_SYCL ON)
+    # Forward CMake options as preprocessor definitions
+    if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
+      add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
+    endif()
+    if(EIGEN_SYCL_NO_LOCAL_MEM)
+      add_definitions(-DEIGEN_SYCL_NO_LOCAL_MEM=${EIGEN_SYCL_NO_LOCAL_MEM})
+    endif()
+    if(EIGEN_SYCL_LOCAL_MEM)
+      add_definitions(-DEIGEN_SYCL_LOCAL_MEM=${EIGEN_SYCL_LOCAL_MEM})
+    endif()
+    if(EIGEN_SYCL_MAX_GLOBAL_RANGE)
+      add_definitions(-DEIGEN_SYCL_MAX_GLOBAL_RANGE=${EIGEN_SYCL_MAX_GLOBAL_RANGE})
+    endif()
+    if(EIGEN_SYCL_LOCAL_THREAD_DIM0)
+      add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM0=${EIGEN_SYCL_LOCAL_THREAD_DIM0})
+    endif()
+    if(EIGEN_SYCL_LOCAL_THREAD_DIM1)
+      add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM1=${EIGEN_SYCL_LOCAL_THREAD_DIM1})
+    endif()
+    if(EIGEN_SYCL_REG_M)
+      add_definitions(-DEIGEN_SYCL_REG_M=${EIGEN_SYCL_REG_M})
+    endif()
+    if(EIGEN_SYCL_REG_N)
+      add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N})
+    endif()
+    if(EIGEN_SYCL_USE_PROGRAM_CLASS)
+      add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS})
+    endif()
+    if(EIGEN_SYCL_ASYNC_EXECUTION)
+      add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION})
+    endif()
+    if(EIGEN_SYCL_DISABLE_SKINNY)
+      add_definitions(-DEIGEN_SYCL_DISABLE_SKINNY=${EIGEN_SYCL_DISABLE_SKINNY})
+    endif()
+    if(EIGEN_SYCL_DISABLE_DOUBLE_BUFFER)
+      add_definitions(-DEIGEN_SYCL_DISABLE_DOUBLE_BUFFER=${EIGEN_SYCL_DISABLE_DOUBLE_BUFFER})
+    endif()
+    if(EIGEN_SYCL_DISABLE_RANK1)
+      add_definitions(-DEIGEN_SYCL_DISABLE_RANK1=${EIGEN_SYCL_DISABLE_RANK1})
+    endif()
+    if(EIGEN_SYCL_DISABLE_SCALAR)
+      add_definitions(-DEIGEN_SYCL_DISABLE_SCALAR=${EIGEN_SYCL_DISABLE_SCALAR})
+    endif()
+    if(EIGEN_SYCL_DISABLE_GEMV)
+      add_definitions(-DEIGEN_SYCL_DISABLE_GEMV=${EIGEN_SYCL_DISABLE_GEMV})
+    endif()
+    if(EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION)
+      add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION})
+    endif()
+
+    if(EIGEN_SYCL_TRISYCL)
+      # triSYCL now requires c++17.
+      set(CMAKE_CXX_STANDARD 17)
+    else()
+      if(MSVC)
+        # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11
+        # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers.
+        set(CMAKE_CXX_STANDARD 14)
+        list(APPEND COMPUTECPP_USER_FLAGS -DWIN32)
+      else()
+        set(CMAKE_CXX_STANDARD 11)
+        list(APPEND COMPUTECPP_USER_FLAGS -Wall)
+      endif()
+      # The following flags are not supported by Clang and can cause warnings
+      # if used with -Werror so they are removed here.
+      if(COMPUTECPP_USE_COMPILER_DRIVER)
+        set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE})
+        string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+        string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+        string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
+      endif()
+      list(APPEND COMPUTECPP_USER_FLAGS
+          -DEIGEN_NO_ASSERTION_CHECKING=1
+          -no-serial-memop
+          -Xclang
+          -cl-mad-enable)
+    endif()
+
+    ei_add_test(cxx11_tensor_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_image_op_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_math_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_device_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_padding_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_contract_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_striding_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_random_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_generator_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG})
+    ei_add_test(cxx11_tensor_scan_sycl ${STD_CXX_FLAG})
+    set(EIGEN_SYCL OFF)
+  endif()
 
   ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
 
   ei_add_test(cxx11_meta)
-  ei_add_test(cxx11_tensor_simple)
-#  ei_add_test(cxx11_tensor_symmetry)
-  ei_add_test(cxx11_tensor_index_list)
-  ei_add_test(cxx11_tensor_mixed_indices)
+  ei_add_test(cxx11_maxsizevector)
+  ei_add_test(cxx11_tensor_argmax)
+  ei_add_test(cxx11_tensor_assign)
+  ei_add_test(cxx11_tensor_block_access)
+  ei_add_test(cxx11_tensor_block_eval)
+  ei_add_test(cxx11_tensor_block_io)
+  ei_add_test(cxx11_tensor_broadcasting)
+  ei_add_test(cxx11_tensor_casts)
+  ei_add_test(cxx11_tensor_chipping)
+  ei_add_test(cxx11_tensor_comparisons)
+  ei_add_test(cxx11_tensor_concatenation)
+  ei_add_test(cxx11_tensor_const)
   ei_add_test(cxx11_tensor_contraction)
   ei_add_test(cxx11_tensor_convolution)
+  ei_add_test(cxx11_tensor_custom_index)
+  ei_add_test(cxx11_tensor_custom_op)
+  ei_add_test(cxx11_tensor_dimension)
+  ei_add_test(cxx11_tensor_empty)
+  ei_add_test(cxx11_tensor_executor "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_expr)
+  ei_add_test(cxx11_tensor_fft)
   ei_add_test(cxx11_tensor_fixed_size)
-  ei_add_test(cxx11_tensor_of_const_values)
-  ei_add_test(cxx11_tensor_of_complex)
-  ei_add_test(cxx11_tensor_of_strings)
-  ei_add_test(cxx11_tensor_lvalue)
-  ei_add_test(cxx11_tensor_broadcasting)
-  ei_add_test(cxx11_tensor_chipping)
-  ei_add_test(cxx11_tensor_concatenation)
+  ei_add_test(cxx11_tensor_forced_eval)
+  ei_add_test(cxx11_tensor_generator)
+  ei_add_test(cxx11_tensor_ifft)
+  ei_add_test(cxx11_tensor_image_patch)
+  ei_add_test(cxx11_tensor_index_list)
   ei_add_test(cxx11_tensor_inflation)
+  ei_add_test(cxx11_tensor_intdiv)
+  ei_add_test(cxx11_tensor_io)
+  ei_add_test(cxx11_tensor_layout_swap)
+  ei_add_test(cxx11_tensor_lvalue)
+  ei_add_test(cxx11_tensor_map)
+  ei_add_test(cxx11_tensor_math)
+  ei_add_test(cxx11_tensor_mixed_indices)
   ei_add_test(cxx11_tensor_morphing)
+  ei_add_test(cxx11_tensor_move)
+  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_of_complex)
+  ei_add_test(cxx11_tensor_of_const_values)
+  ei_add_test(cxx11_tensor_of_strings)
   ei_add_test(cxx11_tensor_padding)
   ei_add_test(cxx11_tensor_patch)
-  ei_add_test(cxx11_tensor_image_patch)
-  ei_add_test(cxx11_tensor_volume_patch)
+  ei_add_test(cxx11_tensor_random)
   ei_add_test(cxx11_tensor_reduction)
-  ei_add_test(cxx11_tensor_argmax)
+  ei_add_test(cxx11_tensor_ref)
+  ei_add_test(cxx11_tensor_roundings)
+  ei_add_test(cxx11_tensor_scan)
   ei_add_test(cxx11_tensor_shuffling)
+  ei_add_test(cxx11_tensor_simple)
   ei_add_test(cxx11_tensor_striding)
-  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_sugar)
+  ei_add_test(cxx11_tensor_thread_local "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
   ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
-  ei_add_test(cxx11_tensor_ref)
-  ei_add_test(cxx11_tensor_random)
-  ei_add_test(cxx11_tensor_generator)
-  ei_add_test(cxx11_tensor_custom_op)
-  ei_add_test(cxx11_tensor_custom_index)
-  ei_add_test(cxx11_tensor_fft)
-  ei_add_test(cxx11_tensor_ifft)
-  ei_add_test(cxx11_tensor_scan)
+  ei_add_test(cxx11_tensor_trace)
+  ei_add_test(cxx11_tensor_volume_patch)
+#  ei_add_test(cxx11_tensor_symmetry)
+if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8" AND NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+  # This test requires __uint128_t which is only available on 64bit systems
+  ei_add_test(cxx11_tensor_uint128)
+endif()
 endif()
 
@@ -213,7 +308,11 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
     set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
   endif()
   if(EIGEN_TEST_CUDA_CLANG)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+    string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}")
+    foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+      string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${ARCH}")
+    endforeach()
   endif()
 
   set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
@@ -221,37 +320,98 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
     set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
   endif()
 
-  if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
-    set(EIGEN_CUDA_CXX11_FLAG "-std=c++11")
-  else()
-    # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11)
-    set(EIGEN_CUDA_CXX11_FLAG "")
-  endif()
-
-  set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}")
+  set(NVCC_ARCH_FLAGS)
+  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+    string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}")
+  endforeach()
+  set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_RELAXED_CONSTEXPR} -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS}")
   cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
-  ei_add_test(cxx11_tensor_complex_cuda)
-  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
-  ei_add_test(cxx11_tensor_reduction_cuda)
-  ei_add_test(cxx11_tensor_argmax_cuda)
-  ei_add_test(cxx11_tensor_cast_float16_cuda)
-  ei_add_test(cxx11_tensor_scan_cuda)
+  ei_add_test(cxx11_tensor_complex_gpu)
+  ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+  ei_add_test(cxx11_tensor_reduction_gpu)
+  ei_add_test(cxx11_tensor_argmax_gpu)
+  ei_add_test(cxx11_tensor_cast_float16_gpu)
+  ei_add_test(cxx11_tensor_scan_gpu)
+
+  set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH 9999)
+  foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+    if(${ARCH} LESS ${EIGEN_CUDA_OLDEST_COMPUTE_ARCH})
+      set(EIGEN_CUDA_OLDEST_COMPUTE_ARCH ${ARCH})
+    endif()
+  endforeach()
 
   # Contractions require arch 3.0 or higher
-  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
+  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 29)
     ei_add_test(cxx11_tensor_device)
-    ei_add_test(cxx11_tensor_cuda)
-    ei_add_test(cxx11_tensor_contract_cuda)
-    ei_add_test(cxx11_tensor_of_float16_cuda)
+    ei_add_test(cxx11_tensor_gpu)
+    ei_add_test(cxx11_tensor_contract_gpu)
+    ei_add_test(cxx11_tensor_of_float16_gpu)
   endif()
 
   # The random number generation code requires arch 3.5 or greater.
-  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
-    ei_add_test(cxx11_tensor_random_cuda)
+  if (${EIGEN_CUDA_OLDEST_COMPUTE_ARCH} GREATER 34)
+    ei_add_test(cxx11_tensor_random_gpu)
   endif()
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()
+
+# Add HIP specific tests
+if (EIGEN_TEST_HIP)
+
+  set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.")
+
+  if (EXISTS ${HIP_PATH})
+
+    list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake)
+
+    find_package(HIP REQUIRED)
+    if (HIP_FOUND)
+
+      execute_process(COMMAND ${HIP_PATH}/bin/hipconfig --platform OUTPUT_VARIABLE HIP_PLATFORM)
+
+      if ((${HIP_PLATFORM} STREQUAL "hcc") OR (${HIP_PLATFORM} STREQUAL "amd"))
+
+        include_directories(${CMAKE_CURRENT_BINARY_DIR})
+        include_directories(${HIP_PATH}/include)
+
+        set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+        #
+        # complex datatype is not yet supported by HIP
+        # so leaving out those tests for now
+        #
+        # ei_add_test(cxx11_tensor_complex_gpu)
+        # ei_add_test(cxx11_tensor_complex_cwise_ops_gpu)
+        #
+        ei_add_test(cxx11_tensor_reduction_gpu)
+        ei_add_test(cxx11_tensor_argmax_gpu)
+        ei_add_test(cxx11_tensor_cast_float16_gpu)
+        ei_add_test(cxx11_tensor_scan_gpu)
+        ei_add_test(cxx11_tensor_device)
+
+        ei_add_test(cxx11_tensor_gpu)
+        ei_add_test(cxx11_tensor_contract_gpu)
+        ei_add_test(cxx11_tensor_of_float16_gpu)
+        ei_add_test(cxx11_tensor_random_gpu)
+
+        unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+
+      elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia"))
+        message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen")
+      else ()
+        message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}")
+      endif()
+
+    endif()
+
+  else ()
+
+    message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist")
+
+  endif()
+
+endif()
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
index a8cb52864..0955795b6 100644
--- a/unsupported/test/EulerAngles.cpp
+++ b/unsupported/test/EulerAngles.cpp
@@ -13,146 +13,220 @@ using namespace Eigen;
 
-template<typename EulerSystem, typename Scalar>
-void verify_euler_ranged(const Matrix<Scalar,3,1>& ea,
-  bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma)
+// Unfortunately, we need to specialize it in order to work. (We could add it in main.h test framework)
+template <typename Scalar, class System>
+bool verifyIsApprox(const Eigen::EulerAngles<Scalar, System>& a, const Eigen::EulerAngles<Scalar, System>& b)
+{
+  return verifyIsApprox(a.angles(), b.angles());
+}
+
+// Verify that x is in the approxed range [a, b]
+#define VERIFY_APPROXED_RANGE(a, x, b) \
+  do { \
+    VERIFY_IS_APPROX_OR_LESS_THAN(a, x); \
+    VERIFY_IS_APPROX_OR_LESS_THAN(x, b); \
+  } while(0)
+
+const char X = EULER_X;
+const char Y = EULER_Y;
+const char Z = EULER_Z;
+
+template<typename Scalar, class EulerSystem>
+void verify_euler(const EulerAngles<Scalar, EulerSystem>& e)
 {
   typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
   typedef Matrix<Scalar,3,3> Matrix3;
   typedef Matrix<Scalar,3,1> Vector3;
   typedef Quaternion<Scalar> QuaternionType;
   typedef AngleAxis<Scalar> AngleAxisType;
-  using std::abs;
-
-  Scalar alphaRangeStart, alphaRangeEnd;
-  Scalar betaRangeStart, betaRangeEnd;
-  Scalar gammaRangeStart, gammaRangeEnd;
 
-  if (positiveRangeAlpha)
-  {
-    alphaRangeStart = Scalar(0);
-    alphaRangeEnd = Scalar(2 * EIGEN_PI);
-  }
-  else
-  {
-    alphaRangeStart = -Scalar(EIGEN_PI);
-    alphaRangeEnd = Scalar(EIGEN_PI);
-  }
+  const Scalar ONE = Scalar(1);
+  const Scalar HALF_PI = Scalar(EIGEN_PI / 2);
+  const Scalar PI = Scalar(EIGEN_PI);
 
-  if (positiveRangeBeta)
-  {
-    betaRangeStart = Scalar(0);
-    betaRangeEnd = Scalar(2 * EIGEN_PI);
-  }
-  else
-  {
-    betaRangeStart = -Scalar(EIGEN_PI);
-    betaRangeEnd = Scalar(EIGEN_PI);
-  }
+  // It's very important calc the acceptable precision depending on the distance from the pole.
+  const Scalar longitudeRadius = std::abs(
+    EulerSystem::IsTaitBryan ?
    std::cos(e.beta()) :
+    std::sin(e.beta())
+    );
+  Scalar precision = test_precision<Scalar>() / longitudeRadius;
 
-  if (positiveRangeGamma)
+  Scalar betaRangeStart, betaRangeEnd;
+  if (EulerSystem::IsTaitBryan)
   {
-    gammaRangeStart = Scalar(0);
-    gammaRangeEnd = Scalar(2 * EIGEN_PI);
+    betaRangeStart = -HALF_PI;
+    betaRangeEnd = HALF_PI;
   }
   else
   {
-    gammaRangeStart = -Scalar(EIGEN_PI);
-    gammaRangeEnd = Scalar(EIGEN_PI);
+    if (!EulerSystem::IsBetaOpposite)
+    {
+      betaRangeStart = 0;
+      betaRangeEnd = PI;
+    }
+    else
+    {
+      betaRangeStart = -PI;
+      betaRangeEnd = 0;
+    }
   }
 
-  const int i = EulerSystem::AlphaAxisAbs - 1;
-  const int j = EulerSystem::BetaAxisAbs - 1;
-  const int k = EulerSystem::GammaAxisAbs - 1;
+  const Vector3 I_ = EulerAnglesType::AlphaAxisVector();
+  const Vector3 J_ = EulerAnglesType::BetaAxisVector();
+  const Vector3 K_ = EulerAnglesType::GammaAxisVector();
 
-  const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1;
-  const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1;
-  const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1;
-
-  const Vector3 I = EulerAnglesType::AlphaAxisVector();
-  const Vector3 J = EulerAnglesType::BetaAxisVector();
-  const Vector3 K = EulerAnglesType::GammaAxisVector();
-
-  EulerAnglesType e(ea[0], ea[1], ea[2]);
+  // Is approx checks
+  VERIFY(e.isApprox(e));
+  VERIFY_IS_APPROX(e, e);
+  VERIFY_IS_NOT_APPROX(e, EulerAnglesType(e.alpha() + ONE, e.beta() + ONE, e.gamma() + ONE));
+
+  const Matrix3 m(e);
+  VERIFY_IS_APPROX(Scalar(m.determinant()), ONE);
+
+  EulerAnglesType ebis(m);
 
-  Matrix3 m(e);
-  Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  // When no roll(acting like polar representation), we have the best precision.
+  // One of those cases is when the Euler angles are on the pole, and because it's singular case,
+  // the computation returns no roll.
+  if (ebis.beta() == 0)
+    precision = test_precision<Scalar>();
 
   // Check that eabis in range
-  VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd);
-  VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd);
-  VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd);
+  VERIFY_APPROXED_RANGE(-PI, ebis.alpha(), PI);
+  VERIFY_APPROXED_RANGE(betaRangeStart, ebis.beta(), betaRangeEnd);
+  VERIFY_APPROXED_RANGE(-PI, ebis.gamma(), PI);
+
+  const Matrix3 mbis(AngleAxisType(ebis.alpha(), I_) * AngleAxisType(ebis.beta(), J_) * AngleAxisType(ebis.gamma(), K_));
+  VERIFY_IS_APPROX(Scalar(mbis.determinant()), ONE);
+  VERIFY_IS_APPROX(mbis, ebis.toRotationMatrix());
+  /*std::cout << "===================\n" <<
+    "e: " << e << std::endl <<
+    "eabis: " << eabis.transpose() << std::endl <<
+    "m: " << m << std::endl <<
+    "mbis: " << mbis << std::endl <<
+    "X: " << (m * Vector3::UnitX()).transpose() << std::endl <<
+    "X: " << (mbis * Vector3::UnitX()).transpose() << std::endl;*/
+  VERIFY(m.isApprox(mbis, precision));
+
+  // Test if ea and eabis are the same
+  // Need to check both singular and non-singular cases
+  // There are two singular cases.
+  // 1. When I==K and sin(ea(1)) == 0
+  // 2. When I!=K and cos(ea(1)) == 0
+
+  // TODO: Make this test work well, and use range saturation function.
+  /*// If I==K, and ea[1]==0, then there no unique solution.
+  // The remark apply in the case where I!=K, and |ea[1]| is close to +-pi/2.
+  if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) )
+    VERIFY_IS_APPROX(ea, eabis);*/
 
-  Vector3 eabis2 = m.eulerAngles(i, j, k);
+  // Quaternions
+  const QuaternionType q(e);
+  ebis = q;
+  const QuaternionType qbis(ebis);
+  VERIFY(internal::isApprox<Scalar>(std::abs(q.dot(qbis)), ONE, precision));
+  //VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
 
-  // Invert the relevant axes
-  eabis2[0] *= iFactor;
-  eabis2[1] *= jFactor;
-  eabis2[2] *= kFactor;
+  // A suggestion for simple product test when will be supported.
+  /*EulerAnglesType e2(PI/2, PI/2, PI/2);
+  Matrix3 m2(e2);
+  VERIFY_IS_APPROX(e*e2, m*m2);*/
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_vec(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler(EulerAngles<Scalar, EulerSystem<A, B, C> >(ea[0], ea[1], ea[2]));
+}
+
+template<signed char A, signed char B, signed char C, typename Scalar>
+void verify_euler_all_neg(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_vec<+A,+B,+C>(ea);
+  verify_euler_vec<+A,+B,-C>(ea);
+  verify_euler_vec<+A,-B,+C>(ea);
+  verify_euler_vec<+A,-B,-C>(ea);
 
-  // Saturate the angles to the correct range
-  if (positiveRangeAlpha && (eabis2[0] < 0))
-    eabis2[0] += Scalar(2 * EIGEN_PI);
-  if (positiveRangeBeta && (eabis2[1] < 0))
-    eabis2[1] += Scalar(2 * EIGEN_PI);
-  if (positiveRangeGamma && (eabis2[2] < 0))
-    eabis2[2] += Scalar(2 * EIGEN_PI);
+  verify_euler_vec<-A,+B,+C>(ea);
+  verify_euler_vec<-A,+B,-C>(ea);
+  verify_euler_vec<-A,-B,+C>(ea);
+  verify_euler_vec<-A,-B,-C>(ea);
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_all_neg<X,Y,Z>(ea);
+  verify_euler_all_neg<X,Y,X>(ea);
+  verify_euler_all_neg<X,Z,Y>(ea);
+  verify_euler_all_neg<X,Z,X>(ea);
 
-  VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is
+  verify_euler_all_neg<Y,Z,X>(ea);
+  verify_euler_all_neg<Y,Z,Y>(ea);
+  verify_euler_all_neg<Y,X,Z>(ea);
+  verify_euler_all_neg<Y,X,Y>(ea);
 
-  Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K));
-  VERIFY_IS_APPROX(m, mbis);
+  verify_euler_all_neg<Z,X,Y>(ea);
+  verify_euler_all_neg<Z,X,Z>(ea);
+  verify_euler_all_neg<Z,Y,X>(ea);
+  verify_euler_all_neg<Z,Y,Z>(ea);
+}
+
+template<typename Scalar> void check_singular_cases(const Scalar& singularBeta)
+{
+  typedef Matrix<Scalar,3,1> Vector3;
+  const Scalar PI = Scalar(EIGEN_PI);
 
-  // Tests that are only relevant for no possitive range
-  if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma))
+  for (Scalar epsilon = NumTraits<Scalar>::epsilon(); epsilon < 1; epsilon *= Scalar(1.2))
   {
-    /* If I==K, and ea[1]==0, then there no unique solution. */
-    /* The remark apply in the case where I!=K, and |ea[1]| is close to pi/2. */
-    if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) )
-      VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
-
-    // approx_or_less_than does not work for 0
-    VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
+    check_all_var(Vector3(PI/4, singularBeta, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - epsilon, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - Scalar(1.5)*epsilon, PI/3));
+    check_all_var(Vector3(PI/4, singularBeta - 2*epsilon, PI/3));
+    check_all_var(Vector3(PI*Scalar(0.8), singularBeta - epsilon, Scalar(0.9)*PI));
+    check_all_var(Vector3(PI*Scalar(-0.9), singularBeta + epsilon, PI*Scalar(0.3)));
+    check_all_var(Vector3(PI*Scalar(-0.6), singularBeta + Scalar(1.5)*epsilon, PI*Scalar(0.3)));
+    check_all_var(Vector3(PI*Scalar(-0.5), singularBeta + 2*epsilon, PI*Scalar(0.4)));
+    check_all_var(Vector3(PI*Scalar(0.9), singularBeta + epsilon, Scalar(0.8)*PI));
   }
 
-  // Quaternions
-  QuaternionType q(e);
-  eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
-  VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
-}
-
-template<typename EulerSystem, typename Scalar>
-void verify_euler(const Matrix<Scalar,3,1>& ea)
-{
-  verify_euler_ranged<EulerSystem>(ea, false, false, false);
-  verify_euler_ranged<EulerSystem>(ea, false, false, true);
-  verify_euler_ranged<EulerSystem>(ea, false, true, false);
-  verify_euler_ranged<EulerSystem>(ea, false, true, true);
-  verify_euler_ranged<EulerSystem>(ea, true, false, false);
-  verify_euler_ranged<EulerSystem>(ea, true, false, true);
-  verify_euler_ranged<EulerSystem>(ea, true, true, false);
-  verify_euler_ranged<EulerSystem>(ea, true, true, true);
+  // This one for sanity, it had a problem with near pole cases in float scalar.
+  check_all_var(Vector3(PI*Scalar(0.8), singularBeta - Scalar(1E-6), Scalar(0.9)*PI));
 }
 
-template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+template<typename Scalar> void eulerangles_manual()
 {
-  verify_euler<EulerSystemXYZ>(ea);
-  verify_euler<EulerSystemXYX>(ea);
-  verify_euler<EulerSystemXZY>(ea);
-  verify_euler<EulerSystemXZX>(ea);
-
-  verify_euler<EulerSystemYZX>(ea);
-  verify_euler<EulerSystemYZY>(ea);
-  verify_euler<EulerSystemYXZ>(ea);
-  verify_euler<EulerSystemYXY>(ea);
-
-  verify_euler<EulerSystemZXY>(ea);
-  verify_euler<EulerSystemZXZ>(ea);
-  verify_euler<EulerSystemZYX>(ea);
-  verify_euler<EulerSystemZYZ>(ea);
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Matrix<Scalar,Dynamic,1> VectorX;
+  const Vector3 Zero = Vector3::Zero();
+  const Scalar PI = Scalar(EIGEN_PI);
+
+  check_all_var(Zero);
+
+  // singular cases
+  check_singular_cases(PI/2);
+  check_singular_cases(-PI/2);
+
+  check_singular_cases(Scalar(0));
+  check_singular_cases(Scalar(-0));
+
+  check_singular_cases(PI);
+  check_singular_cases(-PI);
+
+  // non-singular cases
+  VectorX alpha = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+  VectorX beta = VectorX::LinSpaced(20, Scalar(-0.49) * PI, Scalar(0.49) * PI);
+  VectorX gamma = VectorX::LinSpaced(20, Scalar(-0.99) * PI, PI);
+  for (int i = 0; i < alpha.size(); ++i) {
+    for (int j = 0; j < beta.size(); ++j) {
+      for (int k = 0; k < gamma.size(); ++k) {
+        check_all_var(Vector3(alpha(i), beta(j), gamma(k)));
+      }
+    }
+  }
 }
 
-template<typename Scalar> void eulerangles()
+template<typename Scalar> void eulerangles_rand()
 {
   typedef Matrix<Scalar,3,3> Matrix3;
   typedef Matrix<Scalar,3,1> Vector3;
@@ -199,10 +273,24 @@ template<typename Scalar> void eulerangles()
   check_all_var(ea);
 }
 
-void test_EulerAngles()
+EIGEN_DECLARE_TEST(EulerAngles)
 {
+  // Simple cast test
+  EulerAnglesXYZd onesEd(1, 1, 1);
+  EulerAnglesXYZf onesEf = onesEd.cast<float>();
+  VERIFY_IS_APPROX(onesEd, onesEf.cast<double>());
+
+  // Simple Construction from Vector3 test
+  VERIFY_IS_APPROX(onesEd, EulerAnglesXYZd(Vector3d::Ones()));
+
+  CALL_SUBTEST_1( eulerangles_manual<float>() );
+  CALL_SUBTEST_2( eulerangles_manual<double>() );
+
   for(int i = 0; i < g_repeat; i++) {
-    CALL_SUBTEST_1( eulerangles<float>() );
-    CALL_SUBTEST_2( eulerangles<double>() );
+    CALL_SUBTEST_3( eulerangles_rand<float>() );
+    CALL_SUBTEST_4( eulerangles_rand<double>() );
   }
+
+  // TODO: Add tests for auto diff
+  // TODO: Add tests for complex numbers
 }
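The reworked EulerAngles test above is essentially a round-trip check: convert angles to a rotation matrix, recover the angles, and compare within a precision that loosens near the beta pole. A standalone sketch of that property, using types and calls that appear in the patch (EulerAnglesXYZd, toRotationMatrix(), angles()); the concrete angle values are made up:

    // Illustrative sketch only, not part of the patch.
    #include <iostream>
    #include <Eigen/Geometry>
    #include <unsupported/Eigen/EulerAngles>

    int main()
    {
      Eigen::EulerAnglesXYZd e(0.3, -0.2, 0.5);   // away from the beta = +-pi/2 pole
      Eigen::Matrix3d m = e.toRotationMatrix();   // angles -> rotation matrix
      Eigen::EulerAnglesXYZd ebis(m);             // matrix -> range-normalized angles
      std::cout << (e.angles() - ebis.angles()).norm() << "\n";  // ~0 here
      return 0;
    }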
diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp
index 8b7528fb7..cfe559ebd 100644
--- a/unsupported/test/FFTW.cpp
+++ b/unsupported/test/FFTW.cpp
@@ -225,7 +225,7 @@ void test_return_by_value(int len)
   VERIFY( (in1-in).norm() < test_precision<float>() );
 }
 
-void test_FFTW()
+EIGEN_DECLARE_TEST(FFTW)
 {
   CALL_SUBTEST( test_return_by_value(32) );
   //CALL_SUBTEST( ( test_complex2d<float,4,8> () ) ); CALL_SUBTEST( ( test_complex2d<double,4,8> () ) );
diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp
index 1d682dd83..c667b7247 100644
--- a/unsupported/test/NonLinearOptimization.cpp
+++ b/unsupported/test/NonLinearOptimization.cpp
@@ -15,6 +15,15 @@
 // tolerance for chekcing number of iterations
 #define LM_EVAL_COUNT_TOL 4/3
 
+#define LM_CHECK_N_ITERS(SOLVER,NFEV,NJEV) { \
+    ++g_test_level; \
+    VERIFY_IS_EQUAL(SOLVER.nfev, NFEV); \
+    VERIFY_IS_EQUAL(SOLVER.njev, NJEV); \
+    --g_test_level; \
+    VERIFY(SOLVER.nfev <= NFEV * LM_EVAL_COUNT_TOL); \
+    VERIFY(SOLVER.njev <= NJEV * LM_EVAL_COUNT_TOL); \
+  }
+
 int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag)
 {
   /*      subroutine fcn for chkder example. */
@@ -180,8 +189,7 @@ void testLmder1()
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
@@ -209,8 +217,7 @@ void testLmder()
   // check return values
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   fnorm = lm.fvec.blueNorm();
@@ -294,8 +301,7 @@ void testHybrj1()
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(solver.nfev, 11);
-  VERIFY_IS_EQUAL(solver.njev, 1);
+  LM_CHECK_N_ITERS(solver, 11, 1);
 
   // check norm
   VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
@@ -329,8 +335,7 @@ void testHybrj()
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(solver.nfev, 11);
-  VERIFY_IS_EQUAL(solver.njev, 1);
+  LM_CHECK_N_ITERS(solver, 11, 1);
 
   // check norm
   VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
@@ -485,8 +490,7 @@ void testLmstr1()
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
@@ -514,8 +518,7 @@ void testLmstr()
   // check return values
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
 
   // check norm
   fnorm = lm.fvec.blueNorm();
@@ -565,7 +568,7 @@ void testLmdif1()
   // do the computation
   lmdif_functor functor;
-  DenseIndex nfev;
+  DenseIndex nfev = -1; // initialize to avoid maybe-uninitialized warning
   info = LevenbergMarquardt<lmdif_functor>::lmdif1(functor, x, &nfev);
 
   // check return value
@@ -686,8 +689,7 @@ void testNistChwirut2(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 10);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 10, 8);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
   // check x
@@ -707,8 +709,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 7);
-  VERIFY_IS_EQUAL(lm.njev, 6);
+  LM_CHECK_N_ITERS(lm, 7, 6);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
   // check x
@@ -766,8 +767,7 @@ void testNistMisra1a(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 19);
-  VERIFY_IS_EQUAL(lm.njev, 15);
+  LM_CHECK_N_ITERS(lm, 19, 15);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
   // check x
@@ -783,8 +783,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 5);
-  VERIFY_IS_EQUAL(lm.njev, 4);
+  LM_CHECK_N_ITERS(lm, 5, 4);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
   // check x
@@ -856,8 +855,7 @@ void testNistHahn1(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 11);
-  VERIFY_IS_EQUAL(lm.njev, 10);
+  LM_CHECK_N_ITERS(lm, 11, 10);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
   // check x
@@ -878,8 +876,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 11);
-  VERIFY_IS_EQUAL(lm.njev, 10);
+  LM_CHECK_N_ITERS(lm, 11, 10);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
   // check x
@@ -942,8 +939,7 @@ void testNistMisra1d(void)
   // check return value
   VERIFY_IS_EQUAL(info, 3);
-  VERIFY_IS_EQUAL(lm.nfev, 9);
-  VERIFY_IS_EQUAL(lm.njev, 7);
+  LM_CHECK_N_ITERS(lm, 9, 7);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
   // check x
@@ -959,8 +955,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 4);
-  VERIFY_IS_EQUAL(lm.njev, 3);
+  LM_CHECK_N_ITERS(lm, 4, 3);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
   // check x
@@ -1020,8 +1015,7 @@ void testNistLanczos1(void)
   // check return value
   VERIFY_IS_EQUAL(info, 2);
-  VERIFY_IS_EQUAL(lm.nfev, 79);
-  VERIFY_IS_EQUAL(lm.njev, 72);
+  LM_CHECK_N_ITERS(lm, 79, 72);
   // check norm^2
   std::cout.precision(30);
   std::cout << lm.fvec.squaredNorm() << "\n";
@@ -1043,8 +1037,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 2);
-  VERIFY_IS_EQUAL(lm.nfev, 9);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 9, 8);
   // check norm^2
   VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
   // check x
@@ -1108,8 +1101,7 @@ void testNistRat42(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 10);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 10, 8);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
   // check x
@@ -1126,8 +1118,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 6);
-  VERIFY_IS_EQUAL(lm.njev, 5);
+  LM_CHECK_N_ITERS(lm, 6, 5);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
   // check x
@@ -1186,8 +1177,7 @@ void testNistMGH10(void)
   // check return value
   VERIFY_IS_EQUAL(info, 2);
-  VERIFY_IS_EQUAL(lm.nfev, 284 );
-  VERIFY_IS_EQUAL(lm.njev, 249 );
+  LM_CHECK_N_ITERS(lm, 284, 249);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
   // check x
@@ -1204,8 +1194,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 3);
-  VERIFY_IS_EQUAL(lm.nfev, 126);
-  VERIFY_IS_EQUAL(lm.njev, 116);
+  LM_CHECK_N_ITERS(lm, 126, 116);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
   // check x
@@ -1265,8 +1254,7 @@ void testNistBoxBOD(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY(lm.nfev < 31); // 31
-  VERIFY(lm.njev < 25); // 25
+  LM_CHECK_N_ITERS(lm, 31, 25);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
   // check x
@@ -1284,9 +1272,8 @@
   info = lm.minimize(x);
 
   // check return value
-  VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 15 );
-  VERIFY_IS_EQUAL(lm.njev, 14 );
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 15, 14);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
   // check x
@@ -1356,12 +1343,7 @@ void testNistMGH17(void)
   // check return value
   VERIFY_IS_EQUAL(info, 2);
-  ++g_test_level;
-  VERIFY_IS_EQUAL(lm.nfev, 602); // 602
-  VERIFY_IS_EQUAL(lm.njev, 545); // 545
-  --g_test_level;
-  VERIFY(lm.nfev < 602 * LM_EVAL_COUNT_TOL);
-  VERIFY(lm.njev < 545 * LM_EVAL_COUNT_TOL);
+  LM_CHECK_N_ITERS(lm, 602, 545);
 
   /*
    * Second try
   */
@@ -1373,8 +1355,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 18);
-  VERIFY_IS_EQUAL(lm.njev, 15);
+  LM_CHECK_N_ITERS(lm, 18, 15);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05);
   // check x
@@ -1438,9 +1419,8 @@ void testNistMGH09(void)
   info = lm.minimize(x);
 
   // check return value
-  VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 490 );
-  VERIFY_IS_EQUAL(lm.njev, 376 );
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 490, 376);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
   // check x
@@ -1459,8 +1439,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 18);
-  VERIFY_IS_EQUAL(lm.njev, 16);
+  LM_CHECK_N_ITERS(lm, 18, 16);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
   // check x
@@ -1525,8 +1504,7 @@ void testNistBennett5(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 758);
-  VERIFY_IS_EQUAL(lm.njev, 744);
+  LM_CHECK_N_ITERS(lm, 758, 744);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
   // check x
@@ -1543,8 +1521,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 203);
-  VERIFY_IS_EQUAL(lm.njev, 192);
+  LM_CHECK_N_ITERS(lm, 203, 192);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
   // check x
@@ -1613,8 +1590,7 @@ void testNistThurber(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 39);
-  VERIFY_IS_EQUAL(lm.njev, 36);
+  LM_CHECK_N_ITERS(lm, 39,36);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
   // check x
@@ -1638,8 +1614,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 29);
-  VERIFY_IS_EQUAL(lm.njev, 28);
+  LM_CHECK_N_ITERS(lm, 29, 28);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
   // check x
@@ -1705,8 +1680,7 @@ void testNistRat43(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 27);
-  VERIFY_IS_EQUAL(lm.njev, 20);
+  LM_CHECK_N_ITERS(lm, 27, 20);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
   // check x
@@ -1727,8 +1701,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 9);
-  VERIFY_IS_EQUAL(lm.njev, 8);
+  LM_CHECK_N_ITERS(lm, 9, 8);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
   // check x
@@ -1790,8 +1763,7 @@ void testNistEckerle4(void)
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 18);
-  VERIFY_IS_EQUAL(lm.njev, 15);
+  LM_CHECK_N_ITERS(lm, 18, 15);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
   // check x
@@ -1808,8 +1780,7 @@
   // check return value
   VERIFY_IS_EQUAL(info, 1);
-  VERIFY_IS_EQUAL(lm.nfev, 7);
-  VERIFY_IS_EQUAL(lm.njev, 6);
+  LM_CHECK_N_ITERS(lm, 7, 6);
   // check norm^2
   VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
   // check x
@@ -1818,7 +1789,7 @@ void testNistEckerle4(void)
   VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
 }
 
-void test_NonLinearOptimization()
+EIGEN_DECLARE_TEST(NonLinearOptimization)
 {
   // Tests using the examples provided by (c)minpack
   CALL_SUBTEST/*_1*/(testChkder());
diff --git a/unsupported/test/NumericalDiff.cpp b/unsupported/test/NumericalDiff.cpp
index 27d888056..6d836413b 100644
--- a/unsupported/test/NumericalDiff.cpp
+++ b/unsupported/test/NumericalDiff.cpp
@@ -24,7 +24,7 @@ struct Functor
   int m_inputs, m_values;
 
   Functor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  Functor(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  Functor(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
 
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -107,7 +107,7 @@ void test_central()
   VERIFY_IS_APPROX(jac, actual_jac);
 }
 
-void test_NumericalDiff()
+EIGEN_DECLARE_TEST(NumericalDiff)
 {
   CALL_SUBTEST(test_forward());
   CALL_SUBTEST(test_central());
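The LM_CHECK_N_ITERS changes above touch nearly every NIST case, but they all follow one calling pattern. A self-contained sketch of that pattern with a made-up two-parameter residual; the residual, names, and return-value check here are illustrative, while the functor layout (inputs()/values()/operator()/df()) and the minimize()/nfev/njev interface follow the test file:

    #include <Eigen/Core>
    #include <unsupported/Eigen/NonLinearOptimization>

    // Hypothetical residual f(x) = (x0 - 3, x1 + 1), minimized at (3, -1).
    struct simple_functor
    {
      typedef double Scalar;
      enum { InputsAtCompileTime = 2, ValuesAtCompileTime = 2 };
      typedef Eigen::VectorXd InputType;
      typedef Eigen::VectorXd ValueType;
      typedef Eigen::MatrixXd JacobianType;

      int inputs() const { return 2; }
      int values() const { return 2; }

      int operator()(const Eigen::VectorXd &x, Eigen::VectorXd &fvec) const
      {
        fvec(0) = x(0) - 3.0;  // residuals to drive to zero
        fvec(1) = x(1) + 1.0;
        return 0;
      }
      int df(const Eigen::VectorXd &, Eigen::MatrixXd &fjac) const
      {
        fjac.setIdentity();    // Jacobian of the residuals (constant here)
        return 0;
      }
    };

    int main()
    {
      Eigen::VectorXd x(2);
      x << 0.0, 0.0;
      simple_functor functor;
      Eigen::LevenbergMarquardt<simple_functor> lm(functor);
      int info = lm.minimize(x);  // the tests above compare info, lm.nfev and lm.njev
      return (info > 0 && lm.fvec.blueNorm() < 1e-10) ? 0 : 1;
    }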
diff --git a/unsupported/test/alignedvector3.cpp b/unsupported/test/alignedvector3.cpp
index 252cb1d3f..f442e416a 100644
--- a/unsupported/test/alignedvector3.cpp
+++ b/unsupported/test/alignedvector3.cpp
@@ -70,13 +70,16 @@ void alignedvector3()
     VERIFY_IS_APPROX(f6,r1-r4);
   }
 
+  FastType f8, f9(0,0,0);
+  VERIFY_IS_APPROX(f9-f1,-f1);
+
   std::stringstream ss1, ss2;
   ss1 << f1;
   ss2 << r1;
   VERIFY(ss1.str()==ss2.str());
 }
 
-void test_alignedvector3()
+EIGEN_DECLARE_TEST(alignedvector3)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST( alignedvector3<float>() );
diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp
index 85743137e..2cea56ba5 100644
--- a/unsupported/test/autodiff.cpp
+++ b/unsupported/test/autodiff.cpp
@@ -44,7 +44,7 @@ struct TestFunc1
   int m_inputs, m_values;
 
   TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
 
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -306,6 +306,8 @@ double bug_1222() {
   return denom.value();
 }
 
+#ifdef EIGEN_TEST_PART_5
+
 double bug_1223() {
   using std::min;
   typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
@@ -326,8 +328,8 @@ double bug_1223() {
 // regression test for some compilation issues with specializations of ScalarBinaryOpTraits
 void bug_1260() {
-  Matrix4d A;
-  Vector4d v;
+  Matrix4d A = Matrix4d::Ones();
+  Vector4d v = Vector4d::Ones();
   A*v;
 }
 
@@ -336,7 +338,7 @@ double bug_1261() {
   typedef AutoDiffScalar<Matrix2d> AD;
   typedef Matrix<AD,2,1> VectorAD;
 
-  VectorAD v;
+  VectorAD v(0.,0.);
   const AD maxVal = v.maxCoeff();
   const AD minVal = v.minCoeff();
   return maxVal.value() + minVal.value();
@@ -344,13 +346,30 @@ double bug_1261() {
 
 double bug_1264() {
   typedef AutoDiffScalar<Vector2d> AD;
-  const AD s;
-  const Matrix<AD, 3, 1> v1;
+  const AD s = 0.;
+  const Matrix<AD, 3, 1> v1(0.,0.,0.);
   const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1;
   return v2(0).value();
 }
 
-void test_autodiff()
+// check with expressions on constants
+double bug_1281() {
+  int n = 2;
+  typedef AutoDiffScalar<VectorXd> AD;
+  const AD c = 1.;
+  AD x0(2,n,0);
+  AD y1 = (AD(c)+AD(c))*x0;
+  y1 = x0 * (AD(c)+AD(c));
+  AD y2 = (-AD(c))+x0;
+  y2 = x0+(-AD(c));
+  AD y3 = (AD(c)*(-AD(c))+AD(c))*x0;
+  y3 = x0 * (AD(c)*(-AD(c))+AD(c));
+  return (y1+y2+y3).value();
+}
+
+#endif
+
+EIGEN_DECLARE_TEST(autodiff)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( test_autodiff_scalar<1>() );
@@ -359,9 +378,10 @@ void test_autodiff()
     CALL_SUBTEST_4( test_autodiff_hessian<1>() );
   }
 
-  bug_1222();
-  bug_1223();
-  bug_1260();
-  bug_1261();
+  CALL_SUBTEST_5( bug_1222() );
+  CALL_SUBTEST_5( bug_1223() );
+  CALL_SUBTEST_5( bug_1260() );
+  CALL_SUBTEST_5( bug_1261() );
+  CALL_SUBTEST_5( bug_1281() );
 }
diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp
index 9cf11280c..e81a7788b 100644
--- a/unsupported/test/autodiff_scalar.cpp
+++ b/unsupported/test/autodiff_scalar.cpp
@@ -81,12 +81,15 @@ void check_limits_specialization()
   typedef std::numeric_limits<AD> A;
   typedef std::numeric_limits<Scalar> B;
 
+  // workaround "unused typedef" warning:
+  VERIFY(!bool(internal::is_same<B, A>::value));
+
 #if EIGEN_HAS_CXX11
   VERIFY(bool(std::is_base_of<B, A>::value));
 #endif
 }
 
-void test_autodiff_scalar()
+EIGEN_DECLARE_TEST(autodiff_scalar)
 {
   for(int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1( check_atan2<float>() );
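Several of the autodiff fixes above (bug_1260, bug_1261, bug_1264, bug_1281) simply give AutoDiffScalar objects explicit initial values, since a default-constructed one carries an uninitialized value and an empty derivative vector. A small sketch of the value/derivative constructor that the new bug_1281 test relies on ("AD x0(2,n,0);"); the concrete numbers are made up:

    // Illustrative sketch only.
    #include <Eigen/Core>
    #include <unsupported/Eigen/AutoDiff>

    int main()
    {
      // AD(value, nbDerivatives, activeIndex): value 2, one derivative slot,
      // seeded so that d/dx = 1 for this variable.
      typedef Eigen::AutoDiffScalar<Eigen::VectorXd> AD;
      AD x(2.0, 1, 0);
      AD y = x * x + 3.0;  // y.value() == 7, y.derivatives()(0) == 2*x == 4
      return (y.value() == 7.0 && y.derivatives()(0) == 4.0) ? 0 : 1;
    }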
+ { + ArrayType x(21); + ArrayType expected(21); + ArrayType res(21); + + x << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, -2.0, 0.0, + 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; + + expected << -0.0875062221833, -0.092036796872, -0.0973496147565, + -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293, + -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249, + 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384, + 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872, + 0.0875062221833; + + CALL_SUBTEST(res = bessel_i1e(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function j0. Reference results obtained with SciPy. + { + ArrayType x(77); + ArrayType expected(77); + ArrayType res(77); + + x << -38., -37., -36., -35., -34., -33., -32., -31., -30., + -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19., + -18., -17., -16., -15., -14., -13., -12., -11., -10., -9., -8., + -7., -6., -5., -4., -3., -2., -1., 0., 1., 2., 3., + 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., + 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., + 37., 38.; + + expected << 0.11433274, 0.01086237, -0.10556738, + -0.12684568, -0.03042119, 0.09727067, 0.13807901, 0.05120815, + -0.08636798, -0.14784876, -0.07315701, 0.07274192, 0.15599932, + 0.09626678, -0.05623027, -0.16241278, -0.12065148, 0.03657907, + 0.16702466, 0.14662944, -0.01335581, -0.16985425, -0.17489907, + -0.01422447, 0.17107348, 0.2069261 , 0.04768931, -0.1711903 , + -0.24593576, -0.09033361, 0.17165081, 0.30007927, 0.15064526, + -0.17759677, -0.39714981, -0.26005195, 0.22389078, 0.76519769, + 1. , 0.76519769, 0.22389078, -0.26005195, -0.39714981, + -0.17759677, 0.15064526, 0.30007927, 0.17165081, -0.09033361, + -0.24593576, -0.1711903 , 0.04768931, 0.2069261 , 0.17107348, + -0.01422447, -0.17489907, -0.16985425, -0.01335581, 0.14662944, + 0.16702466, 0.03657907, -0.12065148, -0.16241278, -0.05623027, + 0.09626678, 0.15599932, 0.07274192, -0.07315701, -0.14784876, + -0.08636798, 0.05120815, 0.13807901, 0.09727067, -0.03042119, + -0.12684568, -0.10556738, 0.01086237, 0.11433274; + + CALL_SUBTEST(res = bessel_j0(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function j1. Reference results obtained with SciPy. + { + ArrayType x(81); + ArrayType expected(81); + ArrayType res(81); + + x << -40., -39., -38., -37., -36., -35., -34., -33., -32., -31., -30., + -29., -28., -27., -26., -25., -24., -23., -22., -21., -20., -19., + -18., -17., -16., -15., -14., -13., -12., -11., -10., -9., -8., + -7., -6., -5., -4., -3., -2., -1., 0., 1., 2., 3., + 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., + 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., + 37., 38., 39., 40.; + + expected << -0.12603832, -0.0640561 , 0.05916189, 0.13058004, 0.08232981, + -0.04399094, -0.13297118, -0.10061965, 0.02658903, 0.13302432, + 0.11875106, -0.0069342 , -0.13055149, -0.13658472, -0.01504573, + 0.12535025, 0.15403807, 0.03951932, -0.11717779, -0.17112027, + -0.06683312, 0.10570143, 0.18799489, 0.09766849, -0.09039718, + -0.20510404, -0.13337515, 0.07031805, 0.2234471 , 0.1767853 , + -0.04347275, -0.24531179, -0.23463635, 0.00468282, 0.27668386, + 0.32757914, 0.06604333, -0.33905896, -0.57672481, -0.44005059, + 0. 
, 0.44005059, 0.57672481, 0.33905896, -0.06604333, + -0.32757914, -0.27668386, -0.00468282, 0.23463635, 0.24531179, + 0.04347275, -0.1767853 , -0.2234471 , -0.07031805, 0.13337515, + 0.20510404, 0.09039718, -0.09766849, -0.18799489, -0.10570143, + 0.06683312, 0.17112027, 0.11717779, -0.03951932, -0.15403807, + -0.12535025, 0.01504573, 0.13658472, 0.13055149, 0.0069342 , + -0.11875106, -0.13302432, -0.02658903, 0.10061965, 0.13297118, + 0.04399094, -0.08232981, -0.13058004, -0.05916189, 0.0640561 , + 0.12603832; + + CALL_SUBTEST(res = bessel_j1(x); + verify_component_wise(res, expected);); + } + // Test Bessel function k0e. Reference results obtained with SciPy. + { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822, + 0.6977616 , 0.60929767, 0.54780756, 0.50186313, 0.4658451 , + 0.43662302, 0.41229555, 0.39163193, 0.3737955 , 0.35819488, + 0.34439865, 0.33208364, 0.32100235, 0.31096159, 0.30180802, + 0.29341821, 0.28569149, 0.27854488, 0.2719092 , 0.26572635, + 0.25994703, 0.25452917, 0.2494366 , 0.24463801, 0.24010616, + 0.23581722, 0.23175022, 0.22788667, 0.22421014, 0.22070602, + 0.21736123, 0.21416406, 0.21110397, 0.20817141, 0.20535778, + 0.20265524, 0.20005668, 0.19755558; + + CALL_SUBTEST(res = bessel_k0e(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function k0. Reference results obtained with SciPy. + { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << 1.54150675, 0.92441907, 4.21024438e-01, 1.13893873e-01, + 3.47395044e-02, 1.11596761e-02, 3.69109833e-03, 1.24399433e-03, + 4.24795742e-04, 1.46470705e-04, 5.08813130e-05, 1.77800623e-05, + 6.24302055e-06, 2.20082540e-06, 7.78454386e-07, 2.76137082e-07, + 9.81953648e-08, 3.49941166e-08, 1.24946640e-08, 4.46875334e-09, + 1.60067129e-09, 5.74123782e-10, 2.06176797e-10, 7.41235161e-11, + 2.66754511e-11, 9.60881878e-12, 3.46416156e-12, 1.24987740e-12, + 4.51286453e-13, 1.63053459e-13, 5.89495073e-14, 2.13247750e-14, + 7.71838266e-15, 2.79505752e-15, 1.01266123e-15, 3.67057597e-16, + 1.33103515e-16, 4.82858338e-17, 1.75232770e-17, 6.36161716e-18, + 2.31029936e-18, 8.39286110e-19; + + CALL_SUBTEST(res = bessel_k0(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function k0e. Reference results obtained with SciPy. 
+ { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << 1.97933385, 1.52410939, 1.14446308, 0.84156822, + 0.6977616 , 0.60929767, 0.54780756, 0.50186313, + 0.4658451 , 0.43662302, 0.41229555, 0.39163193, + 0.3737955 , 0.35819488, 0.34439865, 0.33208364, + 0.32100235, 0.31096159, 0.30180802, 0.29341821, + 0.28569149, 0.27854488, 0.2719092 , 0.26572635, + 0.25994703, 0.25452917, 0.2494366 , 0.24463801, + 0.24010616, 0.23581722, 0.23175022, 0.22788667, + 0.22421014, 0.22070602, 0.21736123, 0.21416406, + 0.21110397, 0.20817141, 0.20535778, 0.20265524, + 0.20005668, 0.19755558; + + CALL_SUBTEST(res = bessel_k0e(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function k1. Reference results obtained with SciPy. + { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << 3.74702597, 1.65644112, 6.01907230e-01, 1.39865882e-01, + 4.01564311e-02, 1.24834989e-02, 4.04461345e-03, 1.34391972e-03, + 4.54182487e-04, 1.55369212e-04, 5.36370164e-05, 1.86487735e-05, + 6.52086067e-06, 2.29075746e-06, 8.07858841e-07, 2.85834365e-07, + 1.01417294e-07, 3.60715712e-08, 1.28570417e-08, 4.59124963e-09, + 1.64226697e-09, 5.88305797e-10, 2.11029922e-10, 7.57898116e-11, + 2.72493059e-11, 9.80699893e-12, 3.53277807e-12, 1.27369078e-12, + 4.59568940e-13, 1.65940011e-13, 5.99574032e-14, 2.16773200e-14, + 7.84189960e-15, 2.83839927e-15, 1.02789171e-15, 3.72416929e-16, + 1.34991783e-16, 4.89519373e-17, 1.77585196e-17, 6.44478588e-18, + 2.33973340e-18, 8.49713195e-19; + + CALL_SUBTEST(res = bessel_k1(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function k1e. Reference results obtained with SciPy. + { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << 4.81127659, 2.73100971, 1.63615349, 1.03347685, + 0.80656348, 0.68157595, 0.60027386, 0.54217591, + 0.49807158, 0.46314909, 0.43462525, 0.41076657, + 0.39043094, 0.37283175, 0.35740757, 0.34374563, + 0.33153489, 0.32053597, 0.31056123, 0.30146131, + 0.29311559, 0.2854255 , 0.27830958, 0.27169987, + 0.26553913, 0.25977879, 0.25437733, 0.249299 , + 0.24451285, 0.23999191, 0.2357126 , 0.23165413, + 0.22779816, 0.22412841, 0.22063036, 0.21729103, + 0.21409878, 0.21104314, 0.20811462, 0.20530466, + 0.20260547, 0.20000997; + + CALL_SUBTEST(res = bessel_k1e(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function y0. Reference results obtained with SciPy. 
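Every block above follows the same recipe: fill an input array x, fill expected with values precomputed in SciPy, apply the Bessel function under test, and verify component-wise. Outside the test harness the same array API can be called directly; here is a minimal sketch (it assumes the unsupported SpecialFunctions module exposes Eigen::bessel_j0 as a free function on arrays, and the reference values are read off the j0 table above, not re-derived):

#include <Eigen/Core>
#include <unsupported/Eigen/SpecialFunctions>
#include <iostream>

int main() {
  Eigen::ArrayXd x(3);
  x << 0.0, 1.0, 2.0;
  // Component-wise Bessel J0; expected ~ 1.0, 0.76519769, 0.22389078.
  Eigen::ArrayXd y = Eigen::bessel_j0(x);
  std::cout << y.transpose() << std::endl;
  return 0;
}

The y0 and y1 tables that follow are exercised the same way.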
+ { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << -0.93157302, -0.44451873, 0.08825696, 0.51037567, 0.37685001, + -0.01694074, -0.30851763, -0.28819468, -0.02594974, 0.22352149, + 0.2499367 , 0.05567117, -0.16884732, -0.22523731, -0.07820786, + 0.12719257, 0.2054643 , 0.095811 , -0.0926372 , -0.18755216, + -0.10951969, 0.0626406 , 0.17020176, 0.1198876 , -0.03598179, + -0.15283403, -0.12724943, 0.01204463, 0.13521498, 0.13183647, + 0.00948116, -0.11729573, -0.13383266, -0.02874248, 0.09913483, + 0.13340405, 0.04579799, -0.08085609, -0.13071488, -0.06066076, + 0.06262353, 0.12593642; + + CALL_SUBTEST(res = bessel_y0(x); + verify_component_wise(res, expected);); + } + + // Test Bessel function y1. Reference results obtained with SciPy. + { + ArrayType x(42); + ArrayType expected(42); + ArrayType res(42); + + x << 0.25, 0.5, 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., + 13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., + 26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., + 39., 40.; + + expected << -2.70410523, -1.47147239, -0.78121282, -0.10703243, + 0.32467442, 0.39792571, 0.14786314, -0.17501034, -0.30266724, + -0.15806046, 0.10431458, 0.24901542, 0.16370554, -0.05709922, + -0.21008141, -0.16664484, 0.02107363, 0.17797517, 0.16720504, + 0.00815513, -0.14956011, -0.16551161, -0.03253926, 0.12340586, + 0.1616692 , 0.05305978, -0.09882996, -0.15579655, -0.07025124, + 0.07552213, 0.14803412, 0.08442557, -0.05337283, -0.13854483, + -0.09578012, 0.03238588, 0.12751273, 0.10445477, -0.01262946, + -0.11514066, -0.11056411, -0.00579351; + + CALL_SUBTEST(res = bessel_y1(x); + verify_component_wise(res, expected);); + } +} + +EIGEN_DECLARE_TEST(bessel_functions) +{ + CALL_SUBTEST_1(array_bessel_functions<ArrayXf>()); + CALL_SUBTEST_2(array_bessel_functions<ArrayXd>()); +} diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp index 3b598bf42..7bf4e965f 100644 --- a/unsupported/test/cxx11_eventcount.cpp +++ b/unsupported/test/cxx11_eventcount.cpp @@ -30,11 +30,11 @@ static void test_basic_eventcount() EventCount ec(waiters); EventCount::Waiter& w = waiters[0]; ec.Notify(false); - ec.Prewait(&w); + ec.Prewait(); ec.Notify(true); ec.CommitWait(&w); - ec.Prewait(&w); - ec.CancelWait(&w); + ec.Prewait(); + ec.CancelWait(); } // Fake bounded counter-based queue. 
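The EventCount hunks above track an upstream API simplification: Prewait() and CancelWait() no longer take the Waiter, which is only handed over at CommitWait() time. Condensed from test_basic_eventcount() above (a sketch of the protocol, not standalone code; ec and waiters are set up as in the test):

  EventCount::Waiter& w = waiters[0];
  ec.Prewait();        // was ec.Prewait(&w): announce intent to wait
  ec.Notify(true);     // a notification arriving after Prewait()...
  ec.CommitWait(&w);   // ...lets CommitWait() complete without blocking
  ec.Prewait();
  ec.CancelWait();     // was ec.CancelWait(&w): abandon the wait instead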
@@ -112,7 +112,7 @@ static void test_stress_eventcount() unsigned idx = rand_reentrant(&rnd) % kQueues; if (queues[idx].Pop()) continue; j--; - ec.Prewait(&w); + ec.Prewait(); bool empty = true; for (int q = 0; q < kQueues; q++) { if (!queues[q].Empty()) { @@ -121,7 +121,7 @@ static void test_stress_eventcount() } } if (!empty) { - ec.CancelWait(&w); + ec.CancelWait(); continue; } ec.CommitWait(&w); @@ -135,7 +135,7 @@ static void test_stress_eventcount() } } -void test_cxx11_eventcount() +EIGEN_DECLARE_TEST(cxx11_eventcount) { CALL_SUBTEST(test_basic_eventcount()); CALL_SUBTEST(test_stress_eventcount()); diff --git a/unsupported/test/cxx11_maxsizevector.cpp b/unsupported/test/cxx11_maxsizevector.cpp new file mode 100644 index 000000000..46b689a8e --- /dev/null +++ b/unsupported/test/cxx11_maxsizevector.cpp @@ -0,0 +1,77 @@ +#include "main.h" + +#include <exception> // std::exception + +#include <unsupported/Eigen/CXX11/Tensor> + +struct Foo +{ + static Index object_count; + static Index object_limit; + EIGEN_ALIGN_TO_BOUNDARY(128) int dummy; + + Foo(int x=0) : dummy(x) + { +#ifdef EIGEN_EXCEPTIONS + // TODO: Is this the correct way to handle this? + if (Foo::object_count > Foo::object_limit) { std::cout << "\nThrow!\n"; throw Foo::Fail(); } +#endif + std::cout << '+'; + ++Foo::object_count; + eigen_assert((internal::UIntPtr(this) & (127)) == 0); + } + Foo(const Foo&) + { + std::cout << 'c'; + ++Foo::object_count; + eigen_assert((internal::UIntPtr(this) & (127)) == 0); + } + + ~Foo() + { + std::cout << '~'; + --Foo::object_count; + } + + class Fail : public std::exception {}; +}; + +Index Foo::object_count = 0; +Index Foo::object_limit = 0; + + + +EIGEN_DECLARE_TEST(cxx11_maxsizevector) +{ + typedef MaxSizeVector<Foo> VectorX; + Foo::object_count = 0; + for(int r = 0; r < g_repeat; r++) { + Index rows = internal::random<Index>(3,30); + Foo::object_limit = internal::random<Index>(0, rows - 2); + std::cout << "object_limit = " << Foo::object_limit << std::endl; + bool exception_raised = false; +#ifdef EIGEN_EXCEPTIONS + try + { +#endif + std::cout << "\nVectorX m(" << rows << ");\n"; + VectorX vect(rows); + for(int i=0; i<rows; ++i) + vect.push_back(Foo()); +#ifdef EIGEN_EXCEPTIONS + VERIFY(false); // not reached if exceptions are enabled + } + catch (const Foo::Fail&) { exception_raised = true; } + VERIFY(exception_raised); +#endif + VERIFY_IS_EQUAL(Index(0), Foo::object_count); + + { + Foo::object_limit = rows+1; + VectorX vect2(rows, Foo()); + VERIFY_IS_EQUAL(Foo::object_count, rows); + } + VERIFY_IS_EQUAL(Index(0), Foo::object_count); + std::cout << '\n'; + } +} diff --git a/unsupported/test/cxx11_meta.cpp b/unsupported/test/cxx11_meta.cpp index 8911c59d8..510e11032 100644 --- a/unsupported/test/cxx11_meta.cpp +++ b/unsupported/test/cxx11_meta.cpp @@ -340,7 +340,7 @@ static void test_array_misc() VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 5>(data).c), 5); } -void test_cxx11_meta() +EIGEN_DECLARE_TEST(cxx11_meta) { CALL_SUBTEST(test_gen_numeric_list()); CALL_SUBTEST(test_concat()); diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp index 5f9bb938b..993ee1789 100644 --- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp +++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp @@ -11,22 +11,23 @@ #define EIGEN_USE_THREADS #include "main.h" #include "Eigen/CXX11/ThreadPool" +#include "Eigen/CXX11/Tensor" static void test_create_destroy_empty_pool() { // Just create and destroy the pool. 
This will wind up and tear down worker // threads. Ensure there are no issues in that logic. for (int i = 0; i < 16; ++i) { - NonBlockingThreadPool tp(i); + ThreadPool tp(i); } } -static void test_parallelism() +static void test_parallelism(bool allow_spinning) { // Test we never-ever fail to match available tasks with idle threads. const int kThreads = 16; // code below expects that this is a multiple of 4 - NonBlockingThreadPool tp(kThreads); + ThreadPool tp(kThreads, allow_spinning); VERIFY_IS_EQUAL(tp.NumThreads(), kThreads); VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1); for (int iter = 0; iter < 100; ++iter) { @@ -100,8 +101,80 @@ static void test_parallelism() } } -void test_cxx11_non_blocking_thread_pool() + +static void test_cancel() +{ + ThreadPool tp(2); + + // Schedule a large number of closures that each sleep for two seconds. This + // will keep the thread pool busy for much longer than the default test timeout. + for (int i = 0; i < 1000; ++i) { + tp.Schedule([]() { + std::this_thread::sleep_for(std::chrono::milliseconds(2000)); + }); + } + + // Cancel the processing of all the closures that are still pending. + tp.Cancel(); +} + +static void test_pool_partitions() { + const int kThreads = 2; + ThreadPool tp(kThreads); + + // Assign each thread to its own partition, so that stealing other work only + // occurs globally when a thread is idle. + std::vector<std::pair<unsigned, unsigned>> steal_partitions(kThreads); + for (int i = 0; i < kThreads; ++i) { + steal_partitions[i] = std::make_pair(i, i + 1); + } + tp.SetStealPartitions(steal_partitions); + + std::atomic<int> running(0); + std::atomic<int> done(0); + std::atomic<int> phase(0); + + // Schedule kThreads tasks and ensure that they are all running. + for (int i = 0; i < kThreads; ++i) { + tp.Schedule([&]() { + const int thread_id = tp.CurrentThreadId(); + VERIFY_GE(thread_id, 0); + VERIFY_LE(thread_id, kThreads - 1); + ++running; + while (phase < 1) { + } + ++done; + }); + } + while (running != kThreads) { + } + // Schedule each closure to only run on thread 'i' and verify that it does. 
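For context, NonBlockingThreadPool is now spelled ThreadPool throughout, the constructor gained the allow_spinning flag exercised by both test_parallelism() calls below, and Cancel() is covered by test_cancel() above. A minimal, self-contained sketch of that public surface (the done counter is illustrative, not taken from the test):

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/ThreadPool>
#include <atomic>

int main() {
  std::atomic<int> done(0);  // declared before the pool so workers join before it dies
  Eigen::ThreadPool tp(4 /*num_threads*/, true /*allow_spinning*/);
  for (int i = 0; i < 100; ++i) {
    tp.Schedule([&done]() { ++done; });  // run the closure on some worker thread
  }
  tp.Cancel();  // best-effort: pending closures that have not started may be dropped
  return 0;     // ~ThreadPool() joins the workers
}

The loop that follows pins each closure to one thread with ScheduleWithHint(fn, i, i + 1), which is meaningful here because SetStealPartitions() above confined stealing to per-thread partitions.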
+ for (int i = 0; i < kThreads; ++i) { + tp.ScheduleWithHint( + [&, i]() { + ++running; + const int thread_id = tp.CurrentThreadId(); + VERIFY_IS_EQUAL(thread_id, i); + while (phase < 2) { + } + ++done; + }, + i, i + 1); + } + running = 0; + phase = 1; + while (running != kThreads) { + } + running = 0; + phase = 2; +} + + +EIGEN_DECLARE_TEST(cxx11_non_blocking_thread_pool) { CALL_SUBTEST(test_create_destroy_empty_pool()); - CALL_SUBTEST(test_parallelism()); + CALL_SUBTEST(test_parallelism(true)); + CALL_SUBTEST(test_parallelism(false)); + CALL_SUBTEST(test_cancel()); + CALL_SUBTEST(test_pool_partitions()); } diff --git a/unsupported/test/cxx11_runqueue.cpp b/unsupported/test/cxx11_runqueue.cpp index 91f690114..8fc5a3074 100644 --- a/unsupported/test/cxx11_runqueue.cpp +++ b/unsupported/test/cxx11_runqueue.cpp @@ -227,7 +227,7 @@ void test_stress_runqueue() VERIFY(total.load() == 0); } -void test_cxx11_runqueue() +EIGEN_DECLARE_TEST(cxx11_runqueue) { CALL_SUBTEST_1(test_basic_runqueue()); CALL_SUBTEST_2(test_empty_runqueue()); diff --git a/unsupported/test/cxx11_tensor_argmax.cpp b/unsupported/test/cxx11_tensor_argmax.cpp index 037767270..4a0c8967b 100644 --- a/unsupported/test/cxx11_tensor_argmax.cpp +++ b/unsupported/test/cxx11_tensor_argmax.cpp @@ -273,7 +273,7 @@ static void test_argmin_dim() } } -void test_cxx11_tensor_argmax() +EIGEN_DECLARE_TEST(cxx11_tensor_argmax) { CALL_SUBTEST(test_simple_index_tuples<RowMajor>()); CALL_SUBTEST(test_simple_index_tuples<ColMajor>()); diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu index 653443dc5..79f4066e9 100644 --- a/unsupported/test/cxx11_tensor_argmax_cuda.cu +++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu @@ -9,19 +9,18 @@ #define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_FUNC cxx11_tensor_cuda + #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> + using Eigen::Tensor; template <int Layout> -void test_cuda_simple_argmax() +void test_gpu_simple_argmax() { Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97)); Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1)); @@ -37,13 +36,13 @@ void test_cuda_simple_argmax() double* d_in; DenseIndex* d_out_max; DenseIndex* d_out_min; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out_max), out_bytes); - cudaMalloc((void**)(&d_out_min), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out_max), out_bytes); + gpuMalloc((void**)(&d_out_min), out_bytes); - cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97)); @@ -53,20 +52,20 @@ void test_cuda_simple_argmax() gpu_out_max.device(gpu_device) = gpu_in.argmax(); gpu_out_min.device(gpu_device) = gpu_in.argmin(); - assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + 
assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1); VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0); - cudaFree(d_in); - cudaFree(d_out_max); - cudaFree(d_out_min); + gpuFree(d_in); + gpuFree(d_out_max); + gpuFree(d_out_min); } template <int DataLayout> -void test_cuda_argmax_dim() +void test_gpu_argmax_dim() { Tensor<float, 4, DataLayout> tensor(2,3,5,7); std::vector<int> dims; @@ -100,12 +99,12 @@ void test_cuda_argmax_dim() float* d_in; DenseIndex* d_out; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); @@ -113,8 +112,8 @@ void test_cuda_argmax_dim() gpu_out.device(gpu_device) = gpu_in.argmax(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(tensor_arg.size(), size_t(2*3*5*7 / tensor.dimension(dim))); @@ -137,25 +136,25 @@ void test_cuda_argmax_dim() } } - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); gpu_out.device(gpu_device) = gpu_in.argmax(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } } template <int DataLayout> -void test_cuda_argmin_dim() +void test_gpu_argmin_dim() { Tensor<float, 4, DataLayout> tensor(2,3,5,7); std::vector<int> dims; @@ -189,12 +188,12 @@ void test_cuda_argmin_dim() float* d_in; DenseIndex* d_out; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); @@ -202,8 +201,8 @@ void 
test_cuda_argmin_dim() gpu_out.device(gpu_device) = gpu_in.argmin(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(tensor_arg.size(), 2*3*5*7 / tensor.dimension(dim)); @@ -226,29 +225,29 @@ void test_cuda_argmin_dim() } } - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); gpu_out.device(gpu_device) = gpu_in.argmin(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } } -void test_cxx11_tensor_cuda() +EIGEN_DECLARE_TEST(cxx11_tensor_argmax_gpu) { - CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>()); - CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>()); - CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>()); - CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>()); - CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>()); - CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>()); + CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>()); + CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>()); + CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>()); + CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>()); + CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>()); + CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>()); } diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp new file mode 100644 index 000000000..7ac71286e --- /dev/null +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -0,0 +1,258 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
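The SYCL port below covers the same argmax/argmin ground as the GPU test above. For orientation, the underlying Tensor API is compact on the host as well; a small self-contained example (the returned indices are flat positions in the tensor's storage order):

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 3> t(2, 3, 4);  // ColMajor by default
  t.setZero();
  t(1, 2, 3) = 1.0f;  // the last coefficient in column-major storage order

  // Rank-0 tensor holding the flat index of the maximum.
  Eigen::Tensor<Eigen::DenseIndex, 0> flat_max = t.argmax();
  // Reduce over dimension 1: one argmax index per (i, k) pair.
  Eigen::Tensor<Eigen::DenseIndex, 2> arg_dim1 = t.argmax(1);

  std::cout << flat_max() << std::endl;     // prints 23 == 2*3*4 - 1
  std::cout << arg_dim1(1, 3) << std::endl; // prints 2: the max sits at j == 2
  return 0;
}

This mirrors what the renamed GPU test checks with out_max and out_min.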
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL +#define EIGEN_HAS_CONSTEXPR 1 + +#include "main.h" + +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +template <typename DataType, int Layout, typename DenseIndex> +static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) { + Tensor<DataType, 3, Layout, DenseIndex> in(Eigen::array<DenseIndex, 3>{{2, 2, 2}}); + Tensor<DenseIndex, 0, Layout, DenseIndex> out_max; + Tensor<DenseIndex, 0, Layout, DenseIndex> out_min; + in.setRandom(); + in *= in.constant(100.0); + in(0, 0, 0) = -1000.0; + in(1, 1, 1) = 1000.0; + + std::size_t in_bytes = in.size() * sizeof(DataType); + std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); + + DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DenseIndex* d_out_max = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + DenseIndex* d_out_min = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, Layout, DenseIndex> > gpu_in(d_in, + Eigen::array<DenseIndex, 3>{{2, 2, 2}}); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_max(d_out_max); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 0, Layout, DenseIndex> > gpu_out_min(d_out_min); + sycl_device.memcpyHostToDevice(d_in, in.data(), in_bytes); + + gpu_out_max.device(sycl_device) = gpu_in.argmax(); + gpu_out_min.device(sycl_device) = gpu_in.argmin(); + + sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes); + sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes); + + VERIFY_IS_EQUAL(out_max(), 2 * 2 * 2 - 1); + VERIFY_IS_EQUAL(out_min(), 0); + + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out_max); + sycl_device.deallocate(d_out_min); +} + +template <typename DataType, int DataLayout, typename DenseIndex> +static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) { + DenseIndex sizeDim0 = 9; + DenseIndex sizeDim1 = 3; + DenseIndex sizeDim2 = 5; + DenseIndex sizeDim3 = 7; + Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3); + + std::vector<DenseIndex> dims; + dims.push_back(sizeDim0); + dims.push_back(sizeDim1); + dims.push_back(sizeDim2); + dims.push_back(sizeDim3); + for (DenseIndex dim = 0; dim < 4; ++dim) { + array<DenseIndex, 3> out_shape; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1]; + + Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape); + + array<DenseIndex, 4> ix; + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) + // = 10.0 + tensor(ix) = (ix[dim] != 0) ? 
-1.0 : 10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(DataType); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in( + d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}}); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape); + + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmax(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()), + size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim))); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + sycl_device.synchronize(); + + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 + tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? -1.0 : 20.0; + } + } + } + } + + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmax(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out); + } +} + +template <typename DataType, int DataLayout, typename DenseIndex> +static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) { + DenseIndex sizeDim0 = 9; + DenseIndex sizeDim1 = 3; + DenseIndex sizeDim2 = 5; + DenseIndex sizeDim3 = 7; + Tensor<DataType, 4, DataLayout, DenseIndex> tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3); + + std::vector<DenseIndex> dims; + dims.push_back(sizeDim0); + dims.push_back(sizeDim1); + dims.push_back(sizeDim2); + dims.push_back(sizeDim3); + for (DenseIndex dim = 0; dim < 4; ++dim) { + array<DenseIndex, 3> out_shape; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1]; + + Tensor<DenseIndex, 3, DataLayout, DenseIndex> tensor_arg(out_shape); + + array<DenseIndex, 4> ix; + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0 + tensor(ix) = (ix[dim] != 0) ? 
1.0 : -10.0; + } + } + } + } + + std::size_t in_bytes = tensor.size() * sizeof(DataType); + std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + + DataType* d_in = static_cast<DataType*>(sycl_device.allocate(in_bytes)); + DenseIndex* d_out = static_cast<DenseIndex*>(sycl_device.allocate(out_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, DenseIndex> > gpu_in( + d_in, Eigen::array<DenseIndex, 4>{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}}); + Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout, DenseIndex> > gpu_out(d_out, out_shape); + + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmin(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + VERIFY_IS_EQUAL(static_cast<size_t>(tensor_arg.size()), + size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim))); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the first index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + } + + sycl_device.synchronize(); + + for (DenseIndex i = 0; i < sizeDim0; ++i) { + for (DenseIndex j = 0; j < sizeDim1; ++j) { + for (DenseIndex k = 0; k < sizeDim2; ++k) { + for (DenseIndex l = 0; l < sizeDim3; ++l) { + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0 + tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 1.0 : -20.0; + } + } + } + } + + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); + gpu_out.device(sycl_device) = gpu_in.argmin(dim); + sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); + + for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { + // Expect max to be in the last index of the reduced dimension + VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); + } + sycl_device.deallocate(d_in); + sycl_device.deallocate(d_out); + } +} + +template <typename DataType, typename Device_Selector> +void sycl_argmax_test_per_device(const Device_Selector& d) { + QueueInterface queueInterface(d); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_sycl_simple_argmax<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_simple_argmax<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_argmax_dim<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_argmax_dim<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_argmin_dim<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_argmin_dim<DataType, RowMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_argmax_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_assign.cpp b/unsupported/test/cxx11_tensor_assign.cpp index 8fe85d83c..ce9d24369 100644 --- a/unsupported/test/cxx11_tensor_assign.cpp +++ b/unsupported/test/cxx11_tensor_assign.cpp @@ -358,7 +358,7 @@ static void test_std_initializers_tensor() { #endif // EIGEN_HAS_VARIADIC_TEMPLATES } -void test_cxx11_tensor_assign() +EIGEN_DECLARE_TEST(cxx11_tensor_assign) { CALL_SUBTEST(test_1d()); CALL_SUBTEST(test_2d()); diff --git a/unsupported/test/cxx11_tensor_block_access.cpp b/unsupported/test/cxx11_tensor_block_access.cpp new file mode 100644 index 000000000..5fb12e0e0 --- /dev/null +++ b/unsupported/test/cxx11_tensor_block_access.cpp @@ -0,0 +1,576 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2018 Andy Davis <andydavis@google.com> +// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <algorithm> +#include <set> + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; +using Eigen::Index; +using Eigen::RowMajor; +using Eigen::ColMajor; +using Eigen::internal::TensorBlockShapeType; + +static TensorOpCost zeroCost() { return {0, 0, 0}; } + +template<typename T> +static const T& choose(int layout, const T& col, const T& row) { + return layout == ColMajor ? col : row; +} + +static TensorBlockShapeType RandomShape() { + return internal::random<bool>() + ? TensorBlockShapeType::kUniformAllDims + : TensorBlockShapeType::kSkewedInnerDims; +} + +template <int NumDims> +static size_t RandomTargetSize(const DSizes<Index, NumDims>& dims) { + return internal::random<size_t>(1, dims.TotalSize()); +} + +template <int NumDims> +static DSizes<Index, NumDims> RandomDims() { + array<Index, NumDims> dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random<int>(1, 20); + } + return DSizes<Index, NumDims>(dims); +} + +template <typename T> +static T* GenerateRandomData(const Index& size) { + T* data = new T[size]; + for (int i = 0; i < size; ++i) { + data[i] = internal::random<T>(); + } + return data; +} + +template <int NumDims> +static void Debug(DSizes<Index, NumDims> dims) { + for (int i = 0; i < NumDims; ++i) { + std::cout << dims[i] << "; "; + } + std::cout << std::endl; +} + +template <int Layout> +static void test_block_mapper_sanity() +{ + typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper; + + DSizes<Index, 2> tensor_dims(100, 100); + + // Test uniform blocks. + TensorBlockMapper uniform_block_mapper( + tensor_dims, {TensorBlockShapeType::kUniformAllDims, 100, zeroCost()}); + + VERIFY_IS_EQUAL(uniform_block_mapper.blockCount(), 100); + VERIFY_IS_EQUAL(uniform_block_mapper.blockTotalSize(), 100); + + // 10x10 blocks + auto uniform_b0 = uniform_block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(uniform_b0.dimensions().at(0), 10); + VERIFY_IS_EQUAL(uniform_b0.dimensions().at(1), 10); + + // Test skewed to inner dims blocks. + TensorBlockMapper skewed_block_mapper( + tensor_dims, {TensorBlockShapeType::kSkewedInnerDims, 100, zeroCost()}); + + VERIFY_IS_EQUAL(skewed_block_mapper.blockCount(), 100); + VERIFY_IS_EQUAL(skewed_block_mapper.blockTotalSize(), 100); + + // 1x100 (100x1) rows/cols depending on a tensor layout. + auto skewed_b0 = skewed_block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(skewed_b0.dimensions().at(0), choose(Layout, 100, 1)); + VERIFY_IS_EQUAL(skewed_b0.dimensions().at(1), choose(Layout, 1, 100)); +} + +// Given a TensorBlock, "visit" every element accessible through it, and keep an +// index in the visited set. Verify that every coeff is accessed only once. 
+template<int NumDims, int Layout> +static void UpdateCoeffSet( + const DSizes<Index, NumDims>& tensor_strides, + const internal::TensorBlockDescriptor<NumDims>& block, + Index first_coeff_index, int dim_index, std::set<Index>* visited_coeffs) { + const DSizes<Index, NumDims>& block_sizes = block.dimensions(); + + for (int i = 0; i < block_sizes[dim_index]; ++i) { + if (tensor_strides[dim_index] == 1) { + typedef std::pair<std::set<Index>::iterator, bool> ReturnType; + ReturnType inserted = visited_coeffs->insert(first_coeff_index + i); + VERIFY_IS_EQUAL(inserted.second, true); + } else { + int next_dim_index = dim_index + choose(Layout, -1, 1); + UpdateCoeffSet<NumDims, Layout>(tensor_strides, block, first_coeff_index, + next_dim_index, visited_coeffs); + first_coeff_index += tensor_strides[dim_index]; + } + } +} + +template <typename T, int NumDims, int Layout> +static void test_block_mapper_maps_every_element() { + typedef internal::TensorBlockMapper<NumDims, Layout> TensorBlockMapper; + + DSizes<Index, NumDims> dims = RandomDims<NumDims>(); + DSizes<Index, NumDims> strides = internal::strides<Layout>(dims); + + // Keep track of elements indices available via block access. + std::set<Index> coeff_set; + + // Try different combinations of block types and sizes. + TensorBlockMapper block_mapper( + dims, {RandomShape(), RandomTargetSize(dims), zeroCost()}); + + for (int i = 0; i < block_mapper.blockCount(); ++i) { + auto block = block_mapper.blockDescriptor(i); + UpdateCoeffSet<NumDims, Layout>(strides, block, block.offset(), + choose(Layout, NumDims - 1, 0), + &coeff_set); + } + + // Verify that every coefficient in the original Tensor is accessible through + // TensorBlock only once. + Index total_coeffs = dims.TotalSize(); + VERIFY_IS_EQUAL(Index(coeff_set.size()), total_coeffs); + VERIFY_IS_EQUAL(*coeff_set.begin(), 0); + VERIFY_IS_EQUAL(*coeff_set.rbegin(), total_coeffs - 1); +} + +template <int Layout, int NumDims> +static Index GetInputIndex(Index output_index, + const array<Index, NumDims>& output_to_input_dim_map, + const array<Index, NumDims>& input_strides, + const array<Index, NumDims>& output_strides) { + int input_index = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[NumDims - 1]]; + } +} + +template <int Layout, int NumDims> +static array<Index, NumDims> ComputeStrides( + const array<Index, NumDims>& sizes) { + array<Index, NumDims> strides; + if (Layout == ColMajor) { + strides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + strides[i] = strides[i - 1] * sizes[i - 1]; + } + } else { + strides[NumDims - 1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * sizes[i + 1]; + } + } + return strides; +} + +template<typename Scalar, typename StorageIndex, int Dim> +class EqualityChecker +{ + const Scalar* input_data; + const DSizes<StorageIndex, Dim> &input_dims, &input_strides, &output_dims, &output_strides; + void check_recursive(const Scalar* input, const Scalar* output, int depth=0) 
const + { + if(depth==Dim) + { + VERIFY_IS_EQUAL(*input, *output); + return; + } + + for(int i=0; i<output_dims[depth]; ++i) + { + check_recursive(input + i % input_dims[depth] * input_strides[depth], output + i*output_strides[depth], depth+1); + } + } +public: + EqualityChecker(const Scalar* input_data_, + const DSizes<StorageIndex, Dim> &input_dims_, const DSizes<StorageIndex, Dim> &input_strides_, + const DSizes<StorageIndex, Dim> &output_dims_, const DSizes<StorageIndex, Dim> &output_strides_) + : input_data(input_data_) + , input_dims(input_dims_), input_strides(input_strides_) + , output_dims(output_dims_), output_strides(output_strides_) + {} + + void operator()(const Scalar* output_data) const + { + check_recursive(input_data, output_data); + } +}; + +template <int Layout> +static void test_uniform_block_shape() +{ + typedef internal::TensorBlockDescriptor<5> TensorBlock; + typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper; + + { + // Test shape 'UniformAllDims' with uniform 'max_coeff count'. + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 5 * 5 * 5 * 5 * 5; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + for (int i = 0; i < 5; ++i) { + VERIFY_IS_EQUAL(5, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills + // partially into first inner-most dimension. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 7 * 5 * 5 * 5 * 5; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(5, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 5 * 5 * 5 * 5 * 6; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(6, block.dimensions()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(5, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills + // fully into first inner-most dimension. 
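To spell out the arithmetic behind these cases: kUniformAllDims first aims for equal block sides, so a budget of 5*5*5*5*5 coefficients on an (11, 5, 6, 17, 7) tensor yields a 5x5x5x5x5 block. Any surplus is granted to the inner-most dimension of the storage order, which is why the ColMajor budget of 7*5*5*5*5 produces a 7x5x5x5x5 block (dimension 0 is inner-most) while the RowMajor budget of 5*5*5*5*6 grows dimension 4 to 6 instead. The next pair of cases pushes this to the point where the inner-most dimension is covered completely (11 for ColMajor, 7 for RowMajor).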
+ if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 5 * 5 * 5 * 5; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(5, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 5 * 5 * 5 * 5 * 7; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(5, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with larger 'max_coeff count' which spills + // fully into first few inner-most dimensions. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(7, 5, 6, 17, 7); + const Index max_coeff_count = 7 * 5 * 6 * 7 * 5; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[0]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(6, block.dimensions()[2]); + VERIFY_IS_EQUAL(7, block.dimensions()[3]); + VERIFY_IS_EQUAL(5, block.dimensions()[4]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(7, 5, 6, 9, 7); + const Index max_coeff_count = 5 * 5 * 5 * 6 * 7; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY_IS_EQUAL(6, block.dimensions()[3]); + VERIFY_IS_EQUAL(5, block.dimensions()[2]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(5, block.dimensions()[0]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'UniformAllDims' with full allocation to all dims. 
+ if (Layout == ColMajor) { + DSizes<Index, 5> dims(7, 5, 6, 17, 7); + const Index max_coeff_count = 7 * 5 * 6 * 17 * 7; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[0]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(6, block.dimensions()[2]); + VERIFY_IS_EQUAL(17, block.dimensions()[3]); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(7, 5, 6, 9, 7); + const Index max_coeff_count = 7 * 5 * 6 * 9 * 7; + TensorBlockMapper block_mapper(dims, {TensorBlockShapeType::kUniformAllDims, + max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY_IS_EQUAL(9, block.dimensions()[3]); + VERIFY_IS_EQUAL(6, block.dimensions()[2]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(7, block.dimensions()[0]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } +} + +template <int Layout> +static void test_skewed_inner_dim_block_shape() +{ + typedef internal::TensorBlockDescriptor<5> TensorBlock; + typedef internal::TensorBlockMapper<5, Layout> TensorBlockMapper; + + // Test shape 'SkewedInnerDims' with partial allocation to inner-most dim. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 10 * 1 * 1 * 1 * 1; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(10, block.dimensions()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 1 * 1 * 1 * 1 * 6; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(6, block.dimensions()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to inner-most dim. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 1 * 1 * 1 * 1; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + for (int i = 1; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 1 * 1 * 1 * 1 * 7; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + for (int i = 3; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to inner-most dim, + // and partial allocation to second inner-dim. 
+ if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 3 * 1 * 1 * 1; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + VERIFY_IS_EQUAL(3, block.dimensions()[1]); + for (int i = 2; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 1 * 1 * 1 * 15 * 7; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY_IS_EQUAL(15, block.dimensions()[3]); + for (int i = 2; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to inner-most dim, + // and partial allocation to third inner-dim. + if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 5 * 5 * 1 * 1; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(5, block.dimensions()[2]); + for (int i = 3; i < 5; ++i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 1 * 1 * 5 * 17 * 7; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY_IS_EQUAL(17, block.dimensions()[3]); + VERIFY_IS_EQUAL(5, block.dimensions()[2]); + for (int i = 1; i >= 0; --i) { + VERIFY_IS_EQUAL(1, block.dimensions()[i]); + } + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } + + // Test shape 'SkewedInnerDims' with full allocation to all dims. 
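The skewed cases apply the complementary rule: kSkewedInnerDims spends the whole budget on the inner-most dimension first and only then grows the next one outward. Hence the ColMajor budget of 11*3 yields an 11x3x1x1x1 block, the RowMajor budget of 15*7 yields 1x1x1x15x7, and with budget 11*5*5 the third inner dimension starts to fill. Once the budget reaches the full tensor size (11*5*6*17*7), the block degenerates to the whole tensor, which is exactly what the final ColMajor/RowMajor pair below verifies.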
+ if (Layout == ColMajor) { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 5 * 6 * 17 * 7; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(6, block.dimensions()[2]); + VERIFY_IS_EQUAL(17, block.dimensions()[3]); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } else { + DSizes<Index, 5> dims(11, 5, 6, 17, 7); + const Index max_coeff_count = 11 * 5 * 6 * 17 * 7; + TensorBlockMapper block_mapper( + dims, + {TensorBlockShapeType::kSkewedInnerDims, max_coeff_count, zeroCost()}); + TensorBlock block = block_mapper.blockDescriptor(0); + VERIFY_IS_EQUAL(7, block.dimensions()[4]); + VERIFY_IS_EQUAL(17, block.dimensions()[3]); + VERIFY_IS_EQUAL(6, block.dimensions()[2]); + VERIFY_IS_EQUAL(5, block.dimensions()[1]); + VERIFY_IS_EQUAL(11, block.dimensions()[0]); + VERIFY(block.dimensions().TotalSize() <= max_coeff_count); + } +} + +template <int Layout> +static void test_empty_dims(const internal::TensorBlockShapeType block_shape) +{ + // Test blocking of tensors with zero dimensions: + // - we must not crash on asserts and divisions by zero + // - we must not return block with zero dimensions + // (recipe for overflows/underflows, divisions by zero and NaNs later) + // - total block count must be zero + { + typedef internal::TensorBlockMapper<1, Layout> TensorBlockMapper; + + DSizes<Index, 1> dims(0); + for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) { + TensorBlockMapper block_mapper( + dims, {block_shape, max_coeff_count, zeroCost()}); + VERIFY_IS_EQUAL(block_mapper.blockCount(), 0); + VERIFY(block_mapper.blockTotalSize() >= 1); + } + } + + { + typedef internal::TensorBlockMapper<2, Layout> TensorBlockMapper; + + for (int dim1 = 0; dim1 < 3; ++dim1) { + for (int dim2 = 0; dim2 < 3; ++dim2) { + DSizes<Index, 2> dims(dim1, dim2); + for (size_t max_coeff_count = 0; max_coeff_count < 2; ++max_coeff_count) { + TensorBlockMapper block_mapper( + dims, {block_shape, max_coeff_count, zeroCost()}); + if (dim1 * dim2 == 0) { + VERIFY_IS_EQUAL(block_mapper.blockCount(), 0); + } + VERIFY(block_mapper.blockTotalSize() >= 1); + } + } + } + } +} + +#define TEST_LAYOUTS(NAME) \ + CALL_SUBTEST(NAME<ColMajor>()); \ + CALL_SUBTEST(NAME<RowMajor>()) + +#define TEST_LAYOUTS_AND_DIMS(TYPE, NAME) \ + CALL_SUBTEST((NAME<TYPE, 1, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 1, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 2, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 2, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 3, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 3, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 4, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 4, RowMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 5, ColMajor>())); \ + CALL_SUBTEST((NAME<TYPE, 5, RowMajor>())) + +#define TEST_LAYOUTS_WITH_ARG(NAME, ARG) \ + CALL_SUBTEST(NAME<ColMajor>(ARG)); \ + CALL_SUBTEST(NAME<RowMajor>(ARG)) + +EIGEN_DECLARE_TEST(cxx11_tensor_block_access) { + TEST_LAYOUTS(test_block_mapper_sanity); + TEST_LAYOUTS_AND_DIMS(float, test_block_mapper_maps_every_element); + TEST_LAYOUTS(test_uniform_block_shape); + TEST_LAYOUTS(test_skewed_inner_dim_block_shape); + TEST_LAYOUTS_WITH_ARG(test_empty_dims, TensorBlockShapeType::kUniformAllDims); + TEST_LAYOUTS_WITH_ARG(test_empty_dims, 
TensorBlockShapeType::kSkewedInnerDims); +} + +#undef TEST_LAYOUTS +#undef TEST_LAYOUTS_WITH_ARG diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp new file mode 100644 index 000000000..b2e26ebb7 --- /dev/null +++ b/unsupported/test/cxx11_tensor_block_eval.cpp @@ -0,0 +1,858 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +// clang-format off +#include "main.h" +#include <Eigen/CXX11/Tensor> +// clang-format on + +using Eigen::internal::TensorBlockDescriptor; +using Eigen::internal::TensorExecutor; + +// -------------------------------------------------------------------------- // +// Utility functions to generate random tensors, blocks, and evaluate them. + +template <int NumDims> +static DSizes<Index, NumDims> RandomDims(Index min, Index max) { + DSizes<Index, NumDims> dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random<Index>(min, max); + } + return DSizes<Index, NumDims>(dims); +} + +// Block offsets and extents allow constructing a TensorSlicingOp corresponding +// to a TensorBlockDescriptor. +template <int NumDims> +struct TensorBlockParams { + DSizes<Index, NumDims> offsets; + DSizes<Index, NumDims> sizes; + TensorBlockDescriptor<NumDims, Index> desc; +}; + +template <int Layout, int NumDims> +static TensorBlockParams<NumDims> RandomBlock(DSizes<Index, NumDims> dims, + Index min, Index max) { + // Choose random offsets and sizes along all tensor dimensions. + DSizes<Index, NumDims> offsets(RandomDims<NumDims>(min, max)); + DSizes<Index, NumDims> sizes(RandomDims<NumDims>(min, max)); + + // Make sure that offset + size do not overflow dims. + for (int i = 0; i < NumDims; ++i) { + offsets[i] = numext::mini(dims[i] - 1, offsets[i]); + sizes[i] = numext::mini(sizes[i], dims[i] - offsets[i]); + } + + Index offset = 0; + DSizes<Index, NumDims> strides = Eigen::internal::strides<Layout>(dims); + for (int i = 0; i < NumDims; ++i) { + offset += strides[i] * offsets[i]; + } + + return {offsets, sizes, TensorBlockDescriptor<NumDims, Index>(offset, sizes)}; +} + +// Generate a block with block sizes skewed towards inner dimensions. This type of +// block is required for evaluating broadcast expressions. +template <int Layout, int NumDims> +static TensorBlockParams<NumDims> SkewedInnerBlock( + DSizes<Index, NumDims> dims) { + using BlockMapper = internal::TensorBlockMapper<NumDims, Layout, Index>; + BlockMapper block_mapper(dims, + {internal::TensorBlockShapeType::kSkewedInnerDims, + internal::random<size_t>(1, dims.TotalSize()), + {0, 0, 0}}); + + Index total_blocks = block_mapper.blockCount(); + Index block_index = internal::random<Index>(0, total_blocks - 1); + auto block = block_mapper.blockDescriptor(block_index); + DSizes<Index, NumDims> sizes = block.dimensions(); + + auto strides = internal::strides<Layout>(dims); + DSizes<Index, NumDims> offsets; + + // Compute offsets for the first block coefficient. 
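What follows is plain mixed-radix delinearization: starting from the outer-most dimension of the storage order, divide the flat offset by that dimension's stride and keep the remainder. For example, with ColMajor dims (2, 3, 4) the strides are (1, 2, 6); a flat offset of 23 gives 23 / 6 = 3 (remainder 5), then 5 / 2 = 2 (remainder 1), then 1, i.e. the coordinates (1, 2, 3). The branch below does exactly this for both storage orders.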
+ Index index = block.offset(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = index / strides[i]; + index -= idx * strides[i]; + offsets[i] = idx; + } + if (NumDims > 0) offsets[0] = index; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = index / strides[i]; + index -= idx * strides[i]; + offsets[i] = idx; + } + if (NumDims > 0) offsets[NumDims - 1] = index; + } + + return {offsets, sizes, block}; +} + +template <int NumDims> +static TensorBlockParams<NumDims> FixedSizeBlock(DSizes<Index, NumDims> dims) { + DSizes<Index, NumDims> offsets; + for (int i = 0; i < NumDims; ++i) offsets[i] = 0; + + return {offsets, dims, TensorBlockDescriptor<NumDims, Index>(0, dims)}; +} + +inline Eigen::IndexList<Index, Eigen::type2index<1>> NByOne(Index n) { + Eigen::IndexList<Index, Eigen::type2index<1>> ret; + ret.set(0, n); + return ret; +} +inline Eigen::IndexList<Eigen::type2index<1>, Index> OneByM(Index m) { + Eigen::IndexList<Eigen::type2index<1>, Index> ret; + ret.set(1, m); + return ret; +} + +// -------------------------------------------------------------------------- // +// Verify that block expression evaluation produces the same result as a +// TensorSliceOp (reading a tensor block is the same as taking a tensor slice). + +template <typename T, int NumDims, int Layout, typename Expression, + typename GenBlockParams> +static void VerifyBlockEvaluator(Expression expr, GenBlockParams gen_block) { + using Device = DefaultDevice; + auto d = Device(); + + // Scratch memory allocator for block evaluation. + typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch; + TensorBlockScratch scratch(d); + + // TensorEvaluator is needed to produce tensor blocks of the expression. + auto eval = TensorEvaluator<const decltype(expr), Device>(expr, d); + eval.evalSubExprsIfNeeded(nullptr); + + // Choose random offsets, sizes and a TensorBlockDescriptor. + TensorBlockParams<NumDims> block_params = gen_block(); + + // Evaluate TensorBlock expression into a tensor. + Tensor<T, NumDims, Layout> block(block_params.desc.dimensions()); + + // Dimensions for the potential destination buffer. + DSizes<Index, NumDims> dst_dims; + if (internal::random<bool>()) { + dst_dims = block_params.desc.dimensions(); + } else { + for (int i = 0; i < NumDims; ++i) { + Index extent = internal::random<Index>(0, 5); + dst_dims[i] = block_params.desc.dimension(i) + extent; + } + } + + // Maybe use this tensor as a block desc destination. + Tensor<T, NumDims, Layout> dst(dst_dims); + dst.setZero(); + if (internal::random<bool>()) { + block_params.desc.template AddDestinationBuffer<Layout>( + dst.data(), internal::strides<Layout>(dst.dimensions())); + } + + const bool root_of_expr = internal::random<bool>(); + auto tensor_block = eval.block(block_params.desc, scratch, root_of_expr); + + if (tensor_block.kind() == internal::TensorBlockKind::kMaterializedInOutput) { + // Copy data from destination buffer. + if (dimensions_match(dst.dimensions(), block.dimensions())) { + block = dst; + } else { + DSizes<Index, NumDims> offsets; + for (int i = 0; i < NumDims; ++i) offsets[i] = 0; + block = dst.slice(offsets, block.dimensions()); + } + + } else { + // Assign to block from expression. + auto b_expr = tensor_block.expr(); + + // We explicitly disable vectorization and tiling to run a plain coefficient- + // wise assignment loop, which is simple enough to be trusted as correct. 
+ using BlockAssign = TensorAssignOp<decltype(block), const decltype(b_expr)>; + using BlockExecutor = TensorExecutor<const BlockAssign, Device, false, + internal::TiledEvaluation::Off>; + BlockExecutor::run(BlockAssign(block, b_expr), d); + } + + // Cleanup temporary buffers owned by a tensor block. + tensor_block.cleanup(); + + // Compute a Tensor slice corresponding to a Tensor block. + Tensor<T, NumDims, Layout> slice(block_params.desc.dimensions()); + auto s_expr = expr.slice(block_params.offsets, block_params.sizes); + + // Explicitly use coefficient assignment to evaluate slice expression. + using SliceAssign = TensorAssignOp<decltype(slice), const decltype(s_expr)>; + using SliceExecutor = TensorExecutor<const SliceAssign, Device, false, + internal::TiledEvaluation::Off>; + SliceExecutor::run(SliceAssign(slice, s_expr), d); + + // Tensor block and tensor slice must be the same. + for (Index i = 0; i < block.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(block.coeff(i), slice.coeff(i)); + } +} + +// -------------------------------------------------------------------------- // + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_block() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + // Identity tensor expression transformation. + VerifyBlockEvaluator<T, NumDims, Layout>( + input, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_unary_expr_block() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.abs(), [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_binary_expr_block() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> lhs(dims), rhs(dims); + lhs.setRandom(); + rhs.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + lhs * rhs, [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_binary_with_unary_expr_block() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> lhs(dims), rhs(dims); + lhs.setRandom(); + rhs.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + (lhs.square() + rhs.square()).sqrt(), + [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_broadcast() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + DSizes<Index, NumDims> bcast = RandomDims<NumDims>(1, 5); + + DSizes<Index, NumDims> bcasted_dims; + for (int i = 0; i < NumDims; ++i) bcasted_dims[i] = dims[i] * bcast[i]; + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.broadcast(bcast), + [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.broadcast(bcast), + [&bcasted_dims]() { return RandomBlock<Layout>(bcasted_dims, 5, 10); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.broadcast(bcast), + [&bcasted_dims]() { return FixedSizeBlock(bcasted_dims); }); + + // Check that desc.destination() memory is not shared between two broadcast + // materializations. 
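+  // If both sides of the product were materialized into the same destination
+  // buffer, the second materialization would overwrite the first operand's
+  // data before the product is computed.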
+ VerifyBlockEvaluator<T, NumDims, Layout>( + input.broadcast(bcast) * input.abs().broadcast(bcast), + [&bcasted_dims]() { return SkewedInnerBlock<Layout>(bcasted_dims); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_reshape() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 10); + + DSizes<Index, NumDims> shuffled = dims; + std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed)); + + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.reshape(shuffled), + [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.reshape(shuffled), + [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_cast() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.template cast<int>().template cast<T>(), + [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_select() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> lhs(dims); + Tensor<T, NumDims, Layout> rhs(dims); + Tensor<bool, NumDims, Layout> cond(dims); + lhs.setRandom(); + rhs.setRandom(); + cond.setRandom(); + + VerifyBlockEvaluator<T, NumDims, Layout>(cond.select(lhs, rhs), [&dims]() { + return RandomBlock<Layout>(dims, 1, 20); + }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_padding() { + const int inner_dim = Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1; + + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + DSizes<Index, NumDims> pad_before = RandomDims<NumDims>(0, 4); + DSizes<Index, NumDims> pad_after = RandomDims<NumDims>(0, 4); + array<std::pair<Index, Index>, NumDims> paddings; + for (int i = 0; i < NumDims; ++i) { + paddings[i] = std::make_pair(pad_before[i], pad_after[i]); + } + + // Test squeezing reads from inner dim. + if (internal::random<bool>()) { + pad_before[inner_dim] = 0; + pad_after[inner_dim] = 0; + paddings[inner_dim] = std::make_pair(0, 0); + } + + DSizes<Index, NumDims> padded_dims; + for (int i = 0; i < NumDims; ++i) { + padded_dims[i] = dims[i] + pad_before[i] + pad_after[i]; + } + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.pad(paddings), + [&padded_dims]() { return FixedSizeBlock(padded_dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.pad(paddings), + [&padded_dims]() { return RandomBlock<Layout>(padded_dims, 1, 10); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.pad(paddings), + [&padded_dims]() { return SkewedInnerBlock<Layout>(padded_dims); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_chipping() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + Index chip_dim = internal::random<int>(0, NumDims - 1); + Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2); + + DSizes<Index, NumDims - 1> chipped_dims; + for (Index i = 0; i < chip_dim; ++i) { + chipped_dims[i] = dims[i]; + } + for (Index i = chip_dim + 1; i < NumDims; ++i) { + chipped_dims[i - 1] = dims[i]; + } + + // Block buffer forwarding. 
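+  // (Chipping a plain tensor should let the evaluator expose the underlying
+  // input buffer as a view instead of materializing a copy.)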
+ VerifyBlockEvaluator<T, NumDims - 1, Layout>( + input.chip(chip_offset, chip_dim), + [&chipped_dims]() { return FixedSizeBlock(chipped_dims); }); + + VerifyBlockEvaluator<T, NumDims - 1, Layout>( + input.chip(chip_offset, chip_dim), + [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); }); + + // Block expression assignment. + VerifyBlockEvaluator<T, NumDims - 1, Layout>( + input.abs().chip(chip_offset, chip_dim), + [&chipped_dims]() { return FixedSizeBlock(chipped_dims); }); + + VerifyBlockEvaluator<T, NumDims - 1, Layout>( + input.abs().chip(chip_offset, chip_dim), + [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); }); +} + + +template<typename T, int NumDims> +struct SimpleTensorGenerator { + T operator()(const array<Index, NumDims>& coords) const { + T result = static_cast<T>(0); + for (int i = 0; i < NumDims; ++i) { + result += static_cast<T>((i + 1) * coords[i]); + } + return result; + } +}; + +// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC. +template<int NumDims> +struct SimpleTensorGenerator<bool, NumDims> { + bool operator()(const array<Index, NumDims>& coords) const { + bool result = false; + for (int i = 0; i < NumDims; ++i) { + result ^= coords[i]; + } + return result; + } +}; + + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_generator() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + auto generator = SimpleTensorGenerator<T, NumDims>(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.generate(generator), [&dims]() { return FixedSizeBlock(dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.generate(generator), + [&dims]() { return RandomBlock<Layout>(dims, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_reverse() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + // Randomly reverse dimensions. + Eigen::DSizes<bool, NumDims> reverse; + for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>(); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.reverse(reverse), [&dims]() { return FixedSizeBlock(dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>(input.reverse(reverse), [&dims]() { + return RandomBlock<Layout>(dims, 1, 10); + }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_slice() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + // Pick a random slice of an input tensor. + DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10); + DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10); + + // Make sure that slice start + size do not overflow tensor dims. 
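+  // E.g. with dims[i] = 12, slice_start[i] = 10 and slice_size[i] = 7, the
+  // start is kept and the size is clamped to 12 - 10 = 2.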
+ for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.slice(slice_start, slice_size), + [&slice_size]() { return FixedSizeBlock(slice_size); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.slice(slice_start, slice_size), + [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); }); +} + +template <typename T, int NumDims, int Layout> +static void test_eval_tensor_shuffle() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + DSizes<Index, NumDims> shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; + + do { + DSizes<Index, NumDims> shuffled_dims; + for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]]; + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.shuffle(shuffle), + [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); }); + + VerifyBlockEvaluator<T, NumDims, Layout>( + input.shuffle(shuffle), [&shuffled_dims]() { + return RandomBlock<Layout>(shuffled_dims, 1, 5); + }); + + break; + + } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims)); +} + +template <typename T, int Layout> +static void test_eval_tensor_reshape_with_bcast() { + Index dim = internal::random<Index>(1, 100); + + Tensor<T, 2, Layout> lhs(1, dim); + Tensor<T, 2, Layout> rhs(dim, 1); + lhs.setRandom(); + rhs.setRandom(); + + auto reshapeLhs = NByOne(dim); + auto reshapeRhs = OneByM(dim); + + auto bcastLhs = OneByM(dim); + auto bcastRhs = NByOne(dim); + + DSizes<Index, 2> dims(dim, dim); + + VerifyBlockEvaluator<T, 2, Layout>( + lhs.reshape(reshapeLhs).broadcast(bcastLhs) * + rhs.reshape(reshapeRhs).broadcast(bcastRhs), + [dims]() { return SkewedInnerBlock<Layout, 2>(dims); }); +} + +template <typename T, int Layout> +static void test_eval_tensor_forced_eval() { + Index dim = internal::random<Index>(1, 100); + + Tensor<T, 2, Layout> lhs(dim, 1); + Tensor<T, 2, Layout> rhs(1, dim); + lhs.setRandom(); + rhs.setRandom(); + + auto bcastLhs = OneByM(dim); + auto bcastRhs = NByOne(dim); + + DSizes<Index, 2> dims(dim, dim); + + VerifyBlockEvaluator<T, 2, Layout>( + (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims), + [dims]() { return SkewedInnerBlock<Layout, 2>(dims); }); + + VerifyBlockEvaluator<T, 2, Layout>( + (lhs.broadcast(bcastLhs) * rhs.broadcast(bcastRhs)).eval().reshape(dims), + [dims]() { return RandomBlock<Layout, 2>(dims, 1, 50); }); +} + +template <typename T, int Layout> +static void test_eval_tensor_chipping_of_bcast() { + if (Layout != static_cast<int>(RowMajor)) return; + + Index dim0 = internal::random<Index>(1, 10); + Index dim1 = internal::random<Index>(1, 10); + Index dim2 = internal::random<Index>(1, 10); + + Tensor<T, 3, Layout> input(1, dim1, dim2); + input.setRandom(); + + Eigen::array<Index, 3> bcast = {{dim0, 1, 1}}; + DSizes<Index, 2> chipped_dims(dim0, dim2); + + VerifyBlockEvaluator<T, 2, Layout>( + input.broadcast(bcast).chip(0, 1), + [chipped_dims]() { return FixedSizeBlock(chipped_dims); }); + + VerifyBlockEvaluator<T, 2, Layout>( + input.broadcast(bcast).chip(0, 1), + [chipped_dims]() { return SkewedInnerBlock<Layout, 2>(chipped_dims); }); + + VerifyBlockEvaluator<T, 2, Layout>( + input.broadcast(bcast).chip(0, 1), + [chipped_dims]() { return RandomBlock<Layout, 2>(chipped_dims, 1, 5); }); +} + +// 
-------------------------------------------------------------------------- //
+// Verify that assigning a block to a Tensor expression produces the same
+// result as an assignment to a TensorSliceOp (writing a block is identical to
+// assigning one tensor to a slice of another tensor).
+
+template <typename T, int NumDims, int Layout, int NumExprDims = NumDims,
+          typename Expression, typename GenBlockParams>
+static void VerifyBlockAssignment(Tensor<T, NumDims, Layout>& tensor,
+                                  Expression expr, GenBlockParams gen_block) {
+  using Device = DefaultDevice;
+  auto d = Device();
+
+  // We use a tensor evaluator as the target for block and slice assignments.
+  auto eval = TensorEvaluator<decltype(expr), Device>(expr, d);
+
+  // Generate a random block, or choose a block that fits the full expression.
+  TensorBlockParams<NumExprDims> block_params = gen_block();
+
+  // Generate random data of the selected block size.
+  Tensor<T, NumExprDims, Layout> block(block_params.desc.dimensions());
+  block.setRandom();
+
+  // ************************************************************************ //
+  // (1) Assignment from a block.
+
+  // Construct a materialized block from the randomly generated block tensor.
+  internal::TensorMaterializedBlock<T, NumExprDims, Layout> blk(
+      internal::TensorBlockKind::kView, block.data(), block.dimensions());
+
+  // Reset all underlying tensor values to zero.
+  tensor.setZero();
+
+  // Use the evaluator to write the block into the tensor.
+  eval.writeBlock(block_params.desc, blk);
+
+  // Make a copy of the result after assignment.
+  Tensor<T, NumDims, Layout> block_assigned = tensor;
+
+  // ************************************************************************ //
+  // (2) Assignment to a slice.
+
+  // Reset all underlying tensor values to zero.
+  tensor.setZero();
+
+  // Assign the block to a slice of the original expression.
+  auto s_expr = expr.slice(block_params.offsets, block_params.sizes);
+
+  // Explicitly use coefficient assignment to evaluate the slice expression.
+  using SliceAssign = TensorAssignOp<decltype(s_expr), const decltype(block)>;
+  using SliceExecutor = TensorExecutor<const SliceAssign, Device, false,
+                                       internal::TiledEvaluation::Off>;
+  SliceExecutor::run(SliceAssign(s_expr, block), d);
+
+  // Make a copy of the result after assignment.
+ Tensor<T, NumDims, Layout> slice_assigned = tensor; + + for (Index i = 0; i < tensor.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(block_assigned.coeff(i), slice_assigned.coeff(i)); + } +} + +// -------------------------------------------------------------------------- // + +template <typename T, int NumDims, int Layout> +static void test_assign_to_tensor() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> tensor(dims); + + TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims); + + VerifyBlockAssignment<T, NumDims, Layout>( + tensor, map, [&dims]() { return RandomBlock<Layout>(dims, 10, 20); }); + VerifyBlockAssignment<T, NumDims, Layout>( + tensor, map, [&dims]() { return FixedSizeBlock(dims); }); +} + +template <typename T, int NumDims, int Layout> +static void test_assign_to_tensor_reshape() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> tensor(dims); + + TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims); + + DSizes<Index, NumDims> shuffled = dims; + std::shuffle(&shuffled[0], &shuffled[NumDims - 1], std::mt19937(g_seed)); + + VerifyBlockAssignment<T, NumDims, Layout>( + tensor, map.reshape(shuffled), + [&shuffled]() { return RandomBlock<Layout>(shuffled, 1, 10); }); + + VerifyBlockAssignment<T, NumDims, Layout>( + tensor, map.reshape(shuffled), + [&shuffled]() { return SkewedInnerBlock<Layout>(shuffled); }); + + VerifyBlockAssignment<T, NumDims, Layout>( + tensor, map.reshape(shuffled), + [&shuffled]() { return FixedSizeBlock(shuffled); }); +} + +template <typename T, int NumDims, int Layout> +static void test_assign_to_tensor_chipping() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> tensor(dims); + + Index chip_dim = internal::random<int>(0, NumDims - 1); + Index chip_offset = internal::random<Index>(0, dims[chip_dim] - 2); + + DSizes<Index, NumDims - 1> chipped_dims; + for (Index i = 0; i < chip_dim; ++i) { + chipped_dims[i] = dims[i]; + } + for (Index i = chip_dim + 1; i < NumDims; ++i) { + chipped_dims[i - 1] = dims[i]; + } + + TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims); + + VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>( + tensor, map.chip(chip_offset, chip_dim), + [&chipped_dims]() { return RandomBlock<Layout>(chipped_dims, 1, 10); }); + + VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>( + tensor, map.chip(chip_offset, chip_dim), + [&chipped_dims]() { return SkewedInnerBlock<Layout>(chipped_dims); }); + + VerifyBlockAssignment<T, NumDims, Layout, NumDims - 1>( + tensor, map.chip(chip_offset, chip_dim), + [&chipped_dims]() { return FixedSizeBlock(chipped_dims); }); +} + +template <typename T, int NumDims, int Layout> +static void test_assign_to_tensor_slice() { + DSizes<Index, NumDims> dims = RandomDims<NumDims>(10, 20); + Tensor<T, NumDims, Layout> tensor(dims); + + // Pick a random slice of tensor. + DSizes<Index, NumDims> slice_start = RandomDims<NumDims>(5, 10); + DSizes<Index, NumDims> slice_size = RandomDims<NumDims>(5, 10); + + // Make sure that slice start + size do not overflow tensor dims. 
+  for (int i = 0; i < NumDims; ++i) {
+    slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]);
+    slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]);
+  }
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return RandomBlock<Layout>(slice_size, 1, 10); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return SkewedInnerBlock<Layout>(slice_size); });
+
+  VerifyBlockAssignment<T, NumDims, Layout>(
+      tensor, map.slice(slice_start, slice_size),
+      [&slice_size]() { return FixedSizeBlock(slice_size); });
+}
+
+template <typename T, int NumDims, int Layout>
+static void test_assign_to_tensor_shuffle() {
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(5, 15);
+  Tensor<T, NumDims, Layout> tensor(dims);
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  TensorMap<Tensor<T, NumDims, Layout>> map(tensor.data(), dims);
+
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[i] = dims[shuffle[i]];
+
+    VerifyBlockAssignment<T, NumDims, Layout>(
+        tensor, map.shuffle(shuffle),
+        [&shuffled_dims]() { return FixedSizeBlock(shuffled_dims); });
+
+    VerifyBlockAssignment<T, NumDims, Layout>(
+        tensor, map.shuffle(shuffle), [&shuffled_dims]() {
+          return RandomBlock<Layout>(shuffled_dims, 1, 5);
+        });
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+// -------------------------------------------------------------------------- //
+
+#define CALL_SUBTEST_PART(PART) \
+  CALL_SUBTEST_##PART
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(PART, NAME) \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 3, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<int, 5, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 3, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_DIMS_LAYOUTS(PART, NAME) \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 1, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 2, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 3, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 4, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, 5, ColMajor>()))
+
+#define CALL_SUBTESTS_LAYOUTS_TYPES(PART, NAME) \
+  CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>())); \
+  CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>()))
+
+EIGEN_DECLARE_TEST(cxx11_tensor_block_eval) {
+  // clang-format off
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(1, test_eval_tensor_binary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(1, test_eval_tensor_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS(2, test_eval_tensor_binary_with_unary_expr_block);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_broadcast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(2, test_eval_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_cast);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_select);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(3, test_eval_tensor_padding);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_generator);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(4, test_eval_tensor_reverse);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(5, test_eval_tensor_shuffle);
+
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_reshape_with_bcast);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_forced_eval);
+  CALL_SUBTESTS_LAYOUTS_TYPES(6, test_eval_tensor_chipping_of_bcast);
+
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_reshape);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(7, test_assign_to_tensor_chipping);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_slice);
+  CALL_SUBTESTS_DIMS_LAYOUTS_TYPES(8, test_assign_to_tensor_shuffle);
+
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8
+
+  // clang-format on
+}
diff --git a/unsupported/test/cxx11_tensor_block_io.cpp b/unsupported/test/cxx11_tensor_block_io.cpp
new file mode 100644
index 000000000..52f7dde9b
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_block_io.cpp
@@ -0,0 +1,445 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// clang-format off
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+// clang-format on
+
+// -------------------------------------------------------------------------- //
+// A set of tests for TensorBlockIO: copying data between tensor blocks.
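+//
+// As exercised below, TensorBlockIO::Copy takes a destination described by
+// (dimensions, strides, data, offset) and a source described by
+// (strides, data, offset), plus an optional dst-to-src dimension map; where
+// the innermost dimensions stay contiguous it is expected to squeeze them
+// into single linear copies (see the squeeze/do-not-squeeze tests below).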
+ +template <int NumDims> +static DSizes<Index, NumDims> RandomDims(Index min, Index max) { + DSizes<Index, NumDims> dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random<Index>(min, max); + } + return DSizes<Index, NumDims>(dims); +} + +static internal::TensorBlockShapeType RandomBlockShape() { + return internal::random<bool>() + ? internal::TensorBlockShapeType::kUniformAllDims + : internal::TensorBlockShapeType::kSkewedInnerDims; +} + +template <int NumDims> +static size_t RandomTargetBlockSize(const DSizes<Index, NumDims>& dims) { + return internal::random<size_t>(1, dims.TotalSize()); +} + +template <int Layout, int NumDims> +static Index GetInputIndex(Index output_index, + const array<Index, NumDims>& output_to_input_dim_map, + const array<Index, NumDims>& input_strides, + const array<Index, NumDims>& output_strides) { + int input_index = 0; + if (Layout == ColMajor) { + for (int i = NumDims - 1; i > 0; --i) { + const Index idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[0]]; + } else { + for (int i = 0; i < NumDims - 1; ++i) { + const Index idx = output_index / output_strides[i]; + input_index += idx * input_strides[output_to_input_dim_map[i]]; + output_index -= idx * output_strides[i]; + } + return input_index + + output_index * input_strides[output_to_input_dim_map[NumDims - 1]]; + } +} + +template <typename T, int NumDims, int Layout> +static void test_block_io_copy_data_from_source_to_target() { + using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Generate a random input Tensor. + DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30); + Tensor<T, NumDims, Layout> input(dims); + input.setRandom(); + + // Write data to an output Tensor. + Tensor<T, NumDims, Layout> output(dims); + + // Construct a tensor block mapper. + using TensorBlockMapper = + internal::TensorBlockMapper<NumDims, Layout, Index>; + TensorBlockMapper block_mapper( + dims, {RandomBlockShape(), RandomTargetBlockSize(dims), {0, 0, 0}}); + + // We will copy data from input to output through this buffer. + Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions()); + + // Precompute strides for TensorBlockIO::Copy. + auto input_strides = internal::strides<Layout>(dims); + auto output_strides = internal::strides<Layout>(dims); + + const T* input_data = input.data(); + T* output_data = output.data(); + T* block_data = block.data(); + + for (int i = 0; i < block_mapper.blockCount(); ++i) { + auto desc = block_mapper.blockDescriptor(i); + + auto blk_dims = desc.dimensions(); + auto blk_strides = internal::strides<Layout>(blk_dims); + + { + // Read from input into a block buffer. + IODst dst(blk_dims, blk_strides, block_data, 0); + IOSrc src(input_strides, input_data, desc.offset()); + + TensorBlockIO::Copy(dst, src); + } + + { + // Write from block buffer to output. + IODst dst(blk_dims, output_strides, output_data, desc.offset()); + IOSrc src(blk_strides, block_data, 0); + + TensorBlockIO::Copy(dst, src); + } + } + + for (int i = 0; i < dims.TotalSize(); ++i) { + VERIFY_IS_EQUAL(input_data[i], output_data[i]); + } +} + +template <typename T, int NumDims, int Layout> +static void test_block_io_copy_using_reordered_dimensions() { + // Generate a random input Tensor. 
+  DSizes<Index, NumDims> dims = RandomDims<NumDims>(1, 30);
+  Tensor<T, NumDims, Layout> input(dims);
+  input.setRandom();
+
+  // Create a random dimension re-ordering/shuffle.
+  std::vector<int> shuffle;
+
+  for (int i = 0; i < NumDims; ++i) shuffle.push_back(i);
+  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937(g_seed));
+
+  DSizes<Index, NumDims> output_tensor_dims;
+  DSizes<Index, NumDims> input_to_output_dim_map;
+  DSizes<Index, NumDims> output_to_input_dim_map;
+  for (Index i = 0; i < NumDims; ++i) {
+    output_tensor_dims[shuffle[i]] = dims[i];
+    input_to_output_dim_map[i] = shuffle[i];
+    output_to_input_dim_map[shuffle[i]] = i;
+  }
+
+  // Write data to an output Tensor.
+  Tensor<T, NumDims, Layout> output(output_tensor_dims);
+
+  // Construct a tensor block mapper.
+  // NOTE: Tensor block mapper works with shuffled dimensions.
+  using TensorBlockMapper =
+      internal::TensorBlockMapper<NumDims, Layout, Index>;
+  TensorBlockMapper block_mapper(output_tensor_dims,
+                                 {RandomBlockShape(),
+                                  RandomTargetBlockSize(output_tensor_dims),
+                                  {0, 0, 0}});
+
+  // We will copy data from input to output through this buffer.
+  Tensor<T, NumDims, Layout> block(block_mapper.blockDimensions());
+
+  // Precompute strides for TensorBlockIO::Copy.
+  auto input_strides = internal::strides<Layout>(dims);
+  auto output_strides = internal::strides<Layout>(output_tensor_dims);
+
+  const T* input_data = input.data();
+  T* output_data = output.data();
+  T* block_data = block.data();
+
+  for (Index i = 0; i < block_mapper.blockCount(); ++i) {
+    auto desc = block_mapper.blockDescriptor(i);
+
+    const Index first_coeff_index = GetInputIndex<Layout, NumDims>(
+        desc.offset(), output_to_input_dim_map, input_strides,
+        output_strides);
+
+    // NOTE: Block dimensions are in the same order as output dimensions.
+
+    using TensorBlockIO = internal::TensorBlockIO<T, Index, NumDims, Layout>;
+    using IODst = typename TensorBlockIO::Dst;
+    using IOSrc = typename TensorBlockIO::Src;
+
+    auto blk_dims = desc.dimensions();
+    auto blk_strides = internal::strides<Layout>(blk_dims);
+
+    {
+      // Read from input into a block buffer.
+      IODst dst(blk_dims, blk_strides, block_data, 0);
+      IOSrc src(input_strides, input_data, first_coeff_index);
+
+      // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+      DSizes<int, NumDims> dim_map;
+      for (int j = 0; j < NumDims; ++j)
+        dim_map[j] = static_cast<int>(output_to_input_dim_map[j]);
+      TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+    }
+
+    {
+      // We need to convert block dimensions from output to input order.
+      auto dst_dims = blk_dims;
+      for (int out_dim = 0; out_dim < NumDims; ++out_dim) {
+        dst_dims[output_to_input_dim_map[out_dim]] = blk_dims[out_dim];
+      }
+
+      // Write from block buffer to output.
+      IODst dst(dst_dims, input_strides, output_data, first_coeff_index);
+      IOSrc src(blk_strides, block_data, 0);
+
+      // TODO(ezhulenev): Remove when fully switched to TensorBlock.
+      DSizes<int, NumDims> dim_map;
+      for (int j = 0; j < NumDims; ++j)
+        dim_map[j] = static_cast<int>(input_to_output_dim_map[j]);
+      TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/dim_map);
+    }
+  }
+
+  for (Index i = 0; i < dims.TotalSize(); ++i) {
+    VERIFY_IS_EQUAL(input_data[i], output_data[i]);
+  }
+}
+
+// This is a special case of reading data with reordering, when the dimensions
+// before and after reordering are the same. Squeezing reads along the inner
+// dimensions is illegal in this case, because the innermost dimension is
+// reordered.
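+// (In the test below, block dims 0 and 2 are swapped, so the innermost
+// dimension of the block maps to the outermost dimension of the tensor for
+// either layout, and inner reads cannot be coalesced into contiguous runs.)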
+template <int Layout> +static void test_block_io_copy_using_reordered_dimensions_do_not_squeeze() { + DSizes<Index, 3> tensor_dims(7, 9, 7); + DSizes<Index, 3> block_dims = tensor_dims; + + DSizes<int, 3> block_to_tensor_dim; + block_to_tensor_dim[0] = 2; + block_to_tensor_dim[1] = 1; + block_to_tensor_dim[2] = 0; + + auto tensor_strides = internal::strides<Layout>(tensor_dims); + auto block_strides = internal::strides<Layout>(block_dims); + + Tensor<float, 3, Layout> block(block_dims); + Tensor<float, 3, Layout> tensor(tensor_dims); + tensor.setRandom(); + + float* tensor_data = tensor.data(); + float* block_data = block.data(); + + using TensorBlockIO = internal::TensorBlockIO<float, Index, 3, Layout>; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Read from a tensor into a block. + IODst dst(block_dims, block_strides, block_data, 0); + IOSrc src(tensor_strides, tensor_data, 0); + + TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim); + + TensorMap<Tensor<float, 3, Layout> > block_tensor(block_data, block_dims); + TensorMap<Tensor<float, 3, Layout> > tensor_tensor(tensor_data, tensor_dims); + + for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) { + for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) { + for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) { + float block_value = block_tensor(d2, d1, d0); + float tensor_value = tensor_tensor(d0, d1, d2); + VERIFY_IS_EQUAL(block_value, tensor_value); + } + } + } +} + +// This is the special case for reading data with reordering, when dimensions +// before/after reordering are the same. Squeezing reads in this case is allowed +// because we reorder outer dimensions. +template <int Layout> +static void test_block_io_copy_using_reordered_dimensions_squeeze() { + DSizes<Index, 4> tensor_dims(7, 5, 9, 9); + DSizes<Index, 4> block_dims = tensor_dims; + + DSizes<int, 4> block_to_tensor_dim; + block_to_tensor_dim[0] = 0; + block_to_tensor_dim[1] = 1; + block_to_tensor_dim[2] = 3; + block_to_tensor_dim[3] = 2; + + auto tensor_strides = internal::strides<Layout>(tensor_dims); + auto block_strides = internal::strides<Layout>(block_dims); + + Tensor<float, 4, Layout> block(block_dims); + Tensor<float, 4, Layout> tensor(tensor_dims); + tensor.setRandom(); + + float* tensor_data = tensor.data(); + float* block_data = block.data(); + + using TensorBlockIO = internal::TensorBlockIO<float, Index, 4, Layout>; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Read from a tensor into a block. 
+ IODst dst(block_dims, block_strides, block_data, 0); + IOSrc src(tensor_strides, tensor_data, 0); + + TensorBlockIO::Copy(dst, src, /*dst_to_src_dim_map=*/block_to_tensor_dim); + + TensorMap<Tensor<float, 4, Layout> > block_tensor(block_data, block_dims); + TensorMap<Tensor<float, 4, Layout> > tensor_tensor(tensor_data, tensor_dims); + + for (Index d0 = 0; d0 < tensor_dims[0]; ++d0) { + for (Index d1 = 0; d1 < tensor_dims[1]; ++d1) { + for (Index d2 = 0; d2 < tensor_dims[2]; ++d2) { + for (Index d3 = 0; d3 < tensor_dims[3]; ++d3) { + float block_value = block_tensor(d0, d1, d3, d2); + float tensor_value = tensor_tensor(d0, d1, d2, d3); + VERIFY_IS_EQUAL(block_value, tensor_value); + } + } + } + } +} + +template <int Layout> +static void test_block_io_zero_stride() { + DSizes<Index, 5> rnd_dims = RandomDims<5>(1, 30); + + DSizes<Index, 5> input_tensor_dims = rnd_dims; + input_tensor_dims[0] = 1; + input_tensor_dims[2] = 1; + input_tensor_dims[4] = 1; + + Tensor<float, 5, Layout> input(input_tensor_dims); + input.setRandom(); + + DSizes<Index, 5> output_tensor_dims = rnd_dims; + + auto input_tensor_strides = internal::strides<Layout>(input_tensor_dims); + auto output_tensor_strides = internal::strides<Layout>(output_tensor_dims); + + auto input_tensor_strides_with_zeros = input_tensor_strides; + input_tensor_strides_with_zeros[0] = 0; + input_tensor_strides_with_zeros[2] = 0; + input_tensor_strides_with_zeros[4] = 0; + + Tensor<float, 5, Layout> output(output_tensor_dims); + output.setRandom(); + + using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Write data from input to output with broadcasting in dims [0, 2, 4]. + IODst dst(output_tensor_dims, output_tensor_strides, output.data(), 0); + IOSrc src(input_tensor_strides_with_zeros, input.data(), 0); + TensorBlockIO::Copy(dst, src); + + for (int i = 0; i < output_tensor_dims[0]; ++i) { + for (int j = 0; j < output_tensor_dims[1]; ++j) { + for (int k = 0; k < output_tensor_dims[2]; ++k) { + for (int l = 0; l < output_tensor_dims[3]; ++l) { + for (int m = 0; m < output_tensor_dims[4]; ++m) { + float input_value = input(0, j, 0, l, 0); + float output_value = output(i, j, k, l, m); + VERIFY_IS_EQUAL(input_value, output_value); + } + } + } + } + } +} + +template <int Layout> +static void test_block_io_squeeze_ones() { + using TensorBlockIO = internal::TensorBlockIO<float, Index, 5, Layout>; + using IODst = typename TensorBlockIO::Dst; + using IOSrc = typename TensorBlockIO::Src; + + // Total size > 1. + { + DSizes<Index, 5> block_sizes(1, 2, 1, 2, 1); + auto strides = internal::strides<Layout>(block_sizes); + + // Create a random input tensor. + Tensor<float, 5> input(block_sizes); + input.setRandom(); + + Tensor<float, 5> output(block_sizes); + + IODst dst(block_sizes, strides, output.data(), 0); + IOSrc src(strides, input.data()); + TensorBlockIO::Copy(dst, src); + + for (Index i = 0; i < block_sizes.TotalSize(); ++i) { + VERIFY_IS_EQUAL(output.data()[i], input.data()[i]); + } + } + + // Total size == 1. + { + DSizes<Index, 5> block_sizes(1, 1, 1, 1, 1); + auto strides = internal::strides<Layout>(block_sizes); + + // Create a random input tensor. 
+ Tensor<float, 5> input(block_sizes); + input.setRandom(); + + Tensor<float, 5> output(block_sizes); + + IODst dst(block_sizes, strides, output.data(), 0); + IOSrc src(strides, input.data()); + TensorBlockIO::Copy(dst, src); + + for (Index i = 0; i < block_sizes.TotalSize(); ++i) { + VERIFY_IS_EQUAL(output.data()[i], input.data()[i]); + } + } +} + +#define CALL_SUBTESTS(NAME) \ + CALL_SUBTEST((NAME<float, 1, RowMajor>())); \ + CALL_SUBTEST((NAME<float, 2, RowMajor>())); \ + CALL_SUBTEST((NAME<float, 4, RowMajor>())); \ + CALL_SUBTEST((NAME<float, 5, RowMajor>())); \ + CALL_SUBTEST((NAME<float, 1, ColMajor>())); \ + CALL_SUBTEST((NAME<float, 2, ColMajor>())); \ + CALL_SUBTEST((NAME<float, 4, ColMajor>())); \ + CALL_SUBTEST((NAME<float, 5, ColMajor>())); \ + CALL_SUBTEST((NAME<bool, 1, RowMajor>())); \ + CALL_SUBTEST((NAME<bool, 2, RowMajor>())); \ + CALL_SUBTEST((NAME<bool, 4, RowMajor>())); \ + CALL_SUBTEST((NAME<bool, 5, RowMajor>())); \ + CALL_SUBTEST((NAME<bool, 1, ColMajor>())); \ + CALL_SUBTEST((NAME<bool, 2, ColMajor>())); \ + CALL_SUBTEST((NAME<bool, 4, ColMajor>())); \ + CALL_SUBTEST((NAME<bool, 5, ColMajor>())) + +EIGEN_DECLARE_TEST(cxx11_tensor_block_io) { + // clang-format off + CALL_SUBTESTS(test_block_io_copy_data_from_source_to_target); + CALL_SUBTESTS(test_block_io_copy_using_reordered_dimensions); + + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<RowMajor>()); + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_do_not_squeeze<ColMajor>()); + + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<RowMajor>()); + CALL_SUBTEST(test_block_io_copy_using_reordered_dimensions_squeeze<ColMajor>()); + + CALL_SUBTEST(test_block_io_zero_stride<RowMajor>()); + CALL_SUBTEST(test_block_io_zero_stride<ColMajor>()); + + CALL_SUBTEST(test_block_io_squeeze_ones<RowMajor>()); + CALL_SUBTEST(test_block_io_squeeze_ones<ColMajor>()); + // clang-format on +} diff --git a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp index 7201bfe37..20f84b8e0 100644 --- a/unsupported/test/cxx11_tensor_broadcast_sycl.cpp +++ b/unsupported/test/cxx11_tensor_broadcast_sycl.cpp @@ -13,8 +13,8 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_broadcast_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -25,50 +25,120 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){ +template <typename DataType, int DataLayout, typename IndexType> +static void test_broadcast_sycl_fixed(const Eigen::SyclDevice &sycl_device){ // BROADCAST test: - array<int, 4> in_range = {{2, 3, 5, 7}}; - array<int, 4> broadcasts = {{2, 3, 1, 4}}; - array<int, 4> out_range; // = in_range * broadcasts + IndexType inDim1=2; + IndexType inDim2=3; + IndexType inDim3=5; + IndexType inDim4=7; + IndexType bDim1=2; + IndexType bDim2=3; + IndexType bDim3=1; + IndexType bDim4=4; + array<IndexType, 4> in_range = {{inDim1, inDim2, inDim3, inDim4}}; + array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}}; + array<IndexType, 4> out_range; // = in_range * broadcasts for (size_t i = 0; i < out_range.size(); ++i) out_range[i] = in_range[i] * broadcasts[i]; - Tensor<float, 4> input(in_range); - Tensor<float, 4> out(out_range); + Tensor<DataType, 4, DataLayout, IndexType> input(in_range); + Tensor<DataType, 4, DataLayout, 
IndexType> out(out_range); for (size_t i = 0; i < in_range.size(); ++i) VERIFY_IS_EQUAL(out.dimension(i), out_range[i]); - for (int i = 0; i < input.size(); ++i) - input(i) = static_cast<float>(i); + for (IndexType i = 0; i < input.size(); ++i) + input(i) = static_cast<DataType>(i); - float * gpu_in_data = static_cast<float*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(float))); - float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float))); + DataType * gpu_in_data = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); - TensorMap<Tensor<float, 4>> gpu_in(gpu_in_data, in_range); - TensorMap<Tensor<float, 4>> gpu_out(gpu_out_data, out_range); - sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(float)); + TensorMap<TensorFixedSize<DataType, Sizes<2, 3, 5, 7>, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range); + sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(DataType)); gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts); - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 9; ++j) { - for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 28; ++l) { + for (IndexType i = 0; i < inDim1*bDim1; ++i) { + for (IndexType j = 0; j < inDim2*bDim2; ++j) { + for (IndexType k = 0; k < inDim3*bDim3; ++k) { + for (IndexType l = 0; l < inDim4*bDim4; ++l) { VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l)); } } } } + printf("Broadcast Test with fixed size Passed\n"); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){ + + // BROADCAST test: + IndexType inDim1=2; + IndexType inDim2=3; + IndexType inDim3=5; + IndexType inDim4=7; + IndexType bDim1=2; + IndexType bDim2=3; + IndexType bDim3=1; + IndexType bDim4=4; + array<IndexType, 4> in_range = {{inDim1, inDim2, inDim3, inDim4}}; + array<IndexType, 4> broadcasts = {{bDim1, bDim2, bDim3, bDim4}}; + array<IndexType, 4> out_range; // = in_range * broadcasts + for (size_t i = 0; i < out_range.size(); ++i) + out_range[i] = in_range[i] * broadcasts[i]; + + Tensor<DataType, 4, DataLayout, IndexType> input(in_range); + Tensor<DataType, 4, DataLayout, IndexType> out(out_range); + + for (size_t i = 0; i < in_range.size(); ++i) + VERIFY_IS_EQUAL(out.dimension(i), out_range[i]); + + + for (IndexType i = 0; i < input.size(); ++i) + input(i) = static_cast<DataType>(i); + + DataType * gpu_in_data = static_cast<DataType*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_in(gpu_in_data, in_range); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_out(gpu_out_data, out_range); + sycl_device.memcpyHostToDevice(gpu_in_data, 
input.data(),(input.dimensions().TotalSize())*sizeof(DataType)); + gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + for (IndexType i = 0; i < inDim1*bDim1; ++i) { + for (IndexType j = 0; j < inDim2*bDim2; ++j) { + for (IndexType k = 0; k < inDim3*bDim3; ++k) { + for (IndexType l = 0; l < inDim4*bDim4; ++l) { + VERIFY_IS_APPROX(input(i%inDim1,j%inDim2,k%inDim3,l%inDim4), out(i,j,k,l)); + } + } + } + } printf("Broadcast Test Passed\n"); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } -void test_cxx11_tensor_broadcast_sycl() { - cl::sycl::gpu_selector s; - Eigen::SyclDevice sycl_device(s); - CALL_SUBTEST(test_broadcast_sycl(sycl_device)); +template<typename DataType> void sycl_broadcast_test_per_device(const cl::sycl::device& d){ + std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl; + QueueInterface queueInterface(d); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_broadcast_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_broadcast_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_broadcast_sycl_fixed<DataType, RowMajor, int64_t>(sycl_device); + test_broadcast_sycl_fixed<DataType, ColMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_broadcast_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_broadcast_test_per_device<float>(device)); + } } diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index 5c0ea5889..d3dab891f 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -91,7 +91,16 @@ static void test_vectorized_broadcasting() } } +#if EIGEN_HAS_VARIADIC_TEMPLATES tensor.resize(11,3,5); +#else + array<Index, 3> new_dims; + new_dims[0] = 11; + new_dims[1] = 3; + new_dims[2] = 5; + tensor.resize(new_dims); +#endif + tensor.setRandom(); broadcast = tensor.broadcast(broadcasts); @@ -115,7 +124,7 @@ static void test_static_broadcasting() Tensor<float, 3, DataLayout> tensor(8,3,5); tensor.setRandom(); -#if EIGEN_HAS_CONSTEXPR +#if defined(EIGEN_HAS_INDEX_LIST) Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts; #else Eigen::array<int, 3> broadcasts; @@ -139,7 +148,16 @@ static void test_static_broadcasting() } } +#if EIGEN_HAS_VARIADIC_TEMPLATES tensor.resize(11,3,5); +#else + array<Index, 3> new_dims; + new_dims[0] = 11; + new_dims[1] = 3; + new_dims[2] = 5; + tensor.resize(new_dims); +#endif + tensor.setRandom(); broadcast = tensor.broadcast(broadcasts); @@ -180,8 +198,119 @@ static void test_fixed_size_broadcasting() #endif } +template <int DataLayout> +static void test_simple_broadcasting_one_by_n() +{ + Tensor<float, 4, DataLayout> tensor(1,13,5,7); + tensor.setRandom(); + array<ptrdiff_t, 4> broadcasts; + broadcasts[0] = 9; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 1; + Tensor<float, 4, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 9); + VERIFY_IS_EQUAL(broadcast.dimension(1), 13); + VERIFY_IS_EQUAL(broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(broadcast.dimension(3), 7); + + for (int i = 0; i < 9; ++i) { + for (int j = 0; j < 13; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i%1,j%13,k%5,l%7), broadcast(i,j,k,l)); + } + 
} + } + } +} + +template <int DataLayout> +static void test_simple_broadcasting_n_by_one() +{ + Tensor<float, 4, DataLayout> tensor(7,3,5,1); + tensor.setRandom(); + array<ptrdiff_t, 4> broadcasts; + broadcasts[0] = 1; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 19; + Tensor<float, 4, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 7); + VERIFY_IS_EQUAL(broadcast.dimension(1), 3); + VERIFY_IS_EQUAL(broadcast.dimension(2), 5); + VERIFY_IS_EQUAL(broadcast.dimension(3), 19); + + for (int i = 0; i < 7; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + for (int l = 0; l < 19; ++l) { + VERIFY_IS_EQUAL(tensor(i%7,j%3,k%5,l%1), broadcast(i,j,k,l)); + } + } + } + } +} + +template <int DataLayout> +static void test_simple_broadcasting_one_by_n_by_one_1d() +{ + Tensor<float, 3, DataLayout> tensor(1,7,1); + tensor.setRandom(); + array<ptrdiff_t, 3> broadcasts; + broadcasts[0] = 5; + broadcasts[1] = 1; + broadcasts[2] = 13; + Tensor<float, 3, DataLayout> broadcasted; + broadcasted = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcasted.dimension(0), 5); + VERIFY_IS_EQUAL(broadcasted.dimension(1), 7); + VERIFY_IS_EQUAL(broadcasted.dimension(2), 13); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 13; ++k) { + VERIFY_IS_EQUAL(tensor(0,j%7,0), broadcasted(i,j,k)); + } + } + } +} + +template <int DataLayout> +static void test_simple_broadcasting_one_by_n_by_one_2d() +{ + Tensor<float, 4, DataLayout> tensor(1,7,13,1); + tensor.setRandom(); + array<ptrdiff_t, 4> broadcasts; + broadcasts[0] = 5; + broadcasts[1] = 1; + broadcasts[2] = 1; + broadcasts[3] = 19; + Tensor<float, 4, DataLayout> broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), 5); + VERIFY_IS_EQUAL(broadcast.dimension(1), 7); + VERIFY_IS_EQUAL(broadcast.dimension(2), 13); + VERIFY_IS_EQUAL(broadcast.dimension(3), 19); + + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < 7; ++j) { + for (int k = 0; k < 13; ++k) { + for (int l = 0; l < 19; ++l) { + VERIFY_IS_EQUAL(tensor(0,j%7,k%13,0), broadcast(i,j,k,l)); + } + } + } + } +} -void test_cxx11_tensor_broadcasting() +EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting) { CALL_SUBTEST(test_simple_broadcasting<ColMajor>()); CALL_SUBTEST(test_simple_broadcasting<RowMajor>()); @@ -191,4 +320,12 @@ void test_cxx11_tensor_broadcasting() CALL_SUBTEST(test_static_broadcasting<RowMajor>()); CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>()); CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n<RowMajor>()); + CALL_SUBTEST(test_simple_broadcasting_n_by_one<RowMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n<ColMajor>()); + CALL_SUBTEST(test_simple_broadcasting_n_by_one<ColMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<ColMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<ColMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d<RowMajor>()); + CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp new file mode 100644 index 000000000..72cb62fd5 --- /dev/null +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -0,0 +1,354 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +// Functions used to compare the TensorMap implementation on the device with +// the equivalent on the host +namespace cl { +namespace sycl { +template <typename T> T abs(T x) { return cl::sycl::fabs(x); } +template <typename T> T square(T x) { return x * x; } +template <typename T> T cube(T x) { return x * x * x; } +template <typename T> T inverse(T x) { return T(1) / x; } +template <typename T> T cwiseMax(T x, T y) { return cl::sycl::max(x, y); } +template <typename T> T cwiseMin(T x, T y) { return cl::sycl::min(x, y); } +} +} + +struct EqualAssignement { + template <typename Lhs, typename Rhs> + void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; } +}; + +struct PlusEqualAssignement { + template <typename Lhs, typename Rhs> + void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; } +}; + +template <typename DataType, int DataLayout, + typename Assignement, typename Operator> +void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + Operator op; + Assignement asgn; + { + /* Assignement(out, Operator(in)) */ + Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + in = in.random() + DataType(0.01); + out = out.random() + DataType(0.01); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data = static_cast<DataType *>( + sycl_device.allocate(in.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data, in.data(), + (in.size()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), + (out.size()) * sizeof(DataType)); + auto device_expr = gpu_out.device(sycl_device); + asgn(device_expr, op(gpu)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + DataType ver = reference(i); + asgn(ver, op(in(i))); + VERIFY_IS_APPROX(out(i), ver); + } + sycl_device.deallocate(gpu_data); + sycl_device.deallocate(gpu_data_out); + } + { + /* Assignement(out, Operator(out)) */ + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + out = out.random() + DataType(0.01); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), + (out.size()) * sizeof(DataType)); + auto device_expr = gpu_out.device(sycl_device); + 
asgn(device_expr, op(gpu_out)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + DataType ver = reference(i); + asgn(ver, op(reference(i))); + VERIFY_IS_APPROX(out(i), ver); + } + sycl_device.deallocate(gpu_data_out); + } +} + +#define DECLARE_UNARY_STRUCT(FUNC) \ + struct op_##FUNC { \ + template <typename T> \ + auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \ + return cl::sycl::FUNC(x); \ + } \ + template <typename T> \ + auto operator()(const TensorMap<T>& x) -> decltype(x.FUNC()) { \ + return x.FUNC(); \ + } \ + }; + +DECLARE_UNARY_STRUCT(abs) +DECLARE_UNARY_STRUCT(sqrt) +DECLARE_UNARY_STRUCT(rsqrt) +DECLARE_UNARY_STRUCT(square) +DECLARE_UNARY_STRUCT(cube) +DECLARE_UNARY_STRUCT(inverse) +DECLARE_UNARY_STRUCT(tanh) +DECLARE_UNARY_STRUCT(exp) +DECLARE_UNARY_STRUCT(expm1) +DECLARE_UNARY_STRUCT(log) +DECLARE_UNARY_STRUCT(ceil) +DECLARE_UNARY_STRUCT(floor) +DECLARE_UNARY_STRUCT(round) +DECLARE_UNARY_STRUCT(log1p) +DECLARE_UNARY_STRUCT(sign) +DECLARE_UNARY_STRUCT(isnan) +DECLARE_UNARY_STRUCT(isfinite) +DECLARE_UNARY_STRUCT(isinf) + +template <typename DataType, int DataLayout, typename Assignement> +void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { +#define RUN_UNARY_TEST(FUNC) \ + test_unary_builtins_for_scalar<DataType, DataLayout, Assignement, \ + op_##FUNC>(sycl_device, tensor_range) + RUN_UNARY_TEST(abs); + RUN_UNARY_TEST(sqrt); + RUN_UNARY_TEST(rsqrt); + RUN_UNARY_TEST(square); + RUN_UNARY_TEST(cube); + RUN_UNARY_TEST(inverse); + RUN_UNARY_TEST(tanh); + RUN_UNARY_TEST(exp); + RUN_UNARY_TEST(expm1); + RUN_UNARY_TEST(log); + RUN_UNARY_TEST(ceil); + RUN_UNARY_TEST(floor); + RUN_UNARY_TEST(round); + RUN_UNARY_TEST(log1p); + RUN_UNARY_TEST(sign); +} + +template <typename DataType, int DataLayout, typename Operator> +void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + /* out = op(in) */ + Operator op; + Tensor<DataType, 3, DataLayout, int64_t> in(tensor_range); + Tensor<bool, 3, DataLayout, int64_t> out(tensor_range); + in = in.random() + DataType(0.01); + DataType *gpu_data = static_cast<DataType *>( + sycl_device.allocate(in.size() * sizeof(DataType))); + bool *gpu_data_out = + static_cast<bool *>(sycl_device.allocate(out.size() * sizeof(bool))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu(gpu_data, tensor_range); + TensorMap<Tensor<bool, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data, in.data(), + (in.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(bool)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_EQUAL(out(i), op(in(i))); + } + sycl_device.deallocate(gpu_data); + sycl_device.deallocate(gpu_data_out); +} + +template <typename DataType, int DataLayout> +void test_unary_builtins(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + test_unary_builtins_for_assignement<DataType, DataLayout, + PlusEqualAssignement>(sycl_device, tensor_range); + test_unary_builtins_for_assignement<DataType, DataLayout, + EqualAssignement>(sycl_device, tensor_range); + test_unary_builtins_return_bool<DataType, DataLayout, + op_isnan>(sycl_device, tensor_range); + test_unary_builtins_return_bool<DataType, DataLayout, + op_isfinite>(sycl_device, 
tensor_range); + test_unary_builtins_return_bool<DataType, DataLayout, + op_isinf>(sycl_device, tensor_range); +} + +template <typename DataType> +static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) { + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + + test_unary_builtins<DataType, RowMajor>(sycl_device, tensor_range); + test_unary_builtins<DataType, ColMajor>(sycl_device, tensor_range); +} + +template <typename DataType, int DataLayout, typename Operator> +void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + /* out = op(in_1, in_2) */ + Operator op; + Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> in_2(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + in_1 = in_1.random() + DataType(0.01); + in_2 = in_2.random() + DataType(0.01); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data_1 = static_cast<DataType *>( + sycl_device.allocate(in_1.size() * sizeof(DataType))); + DataType *gpu_data_2 = static_cast<DataType *>( + sycl_device.allocate(in_2.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_2(gpu_data_2, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), + (in_1.size()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), + (in_2.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu_1, gpu_2); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i))); + } + sycl_device.deallocate(gpu_data_1); + sycl_device.deallocate(gpu_data_2); + sycl_device.deallocate(gpu_data_out); +} + +template <typename DataType, int DataLayout, typename Operator> +void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + /* out = op(in_1, 2) */ + Operator op; + const DataType arg2(2); + Tensor<DataType, 3, DataLayout, int64_t> in_1(tensor_range); + Tensor<DataType, 3, DataLayout, int64_t> out(tensor_range); + in_1 = in_1.random(); + Tensor<DataType, 3, DataLayout, int64_t> reference(out); + DataType *gpu_data_1 = static_cast<DataType *>( + sycl_device.allocate(in_1.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast<DataType *>( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_1(gpu_data_1, tensor_range); + TensorMap<Tensor<DataType, 3, DataLayout, int64_t>> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), + (in_1.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu_1, arg2); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_APPROX(out(i), op(in_1(i), arg2)); + } + sycl_device.deallocate(gpu_data_1); + sycl_device.deallocate(gpu_data_out); +} + +#define DECLARE_BINARY_STRUCT(FUNC) \ + struct op_##FUNC { \ + template 
<typename T1, typename T2> \ + auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) { \ + return cl::sycl::FUNC(x, y); \ + } \ + template <typename T1, typename T2> \ + auto operator()(const TensorMap<T1>& x, const TensorMap<T2>& y) -> decltype(x.FUNC(y)) { \ + return x.FUNC(y); \ + } \ + }; + +DECLARE_BINARY_STRUCT(cwiseMax) +DECLARE_BINARY_STRUCT(cwiseMin) + +#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \ + struct op_##NAME { \ + template <typename T1, typename T2> \ + auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \ + return x OPERATOR y; \ + } \ + }; + +DECLARE_BINARY_STRUCT_OP(plus, +) +DECLARE_BINARY_STRUCT_OP(minus, -) +DECLARE_BINARY_STRUCT_OP(times, *) +DECLARE_BINARY_STRUCT_OP(divide, /) +DECLARE_BINARY_STRUCT_OP(modulo, %) + +template <typename DataType, int DataLayout> +void test_binary_builtins(const Eigen::SyclDevice& sycl_device, + const array<int64_t, 3>& tensor_range) { + test_binary_builtins_func<DataType, DataLayout, + op_cwiseMax>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_cwiseMin>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_plus>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_minus>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_times>(sycl_device, tensor_range); + test_binary_builtins_func<DataType, DataLayout, + op_divide>(sycl_device, tensor_range); +} + +template <typename DataType> +static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + test_binary_builtins<DataType, RowMajor>(sycl_device, tensor_range); + test_binary_builtins<DataType, ColMajor>(sycl_device, tensor_range); +} + +template <typename DataType> +static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array<int64_t, 3> tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + test_binary_builtins_fixed_arg2<DataType, RowMajor, + op_modulo>(sycl_device, tensor_range); + test_binary_builtins_fixed_arg2<DataType, ColMajor, + op_modulo>(sycl_device, tensor_range); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + QueueInterface queueInterface(device); + Eigen::SyclDevice sycl_device(&queueInterface); + CALL_SUBTEST_1(test_builtin_unary_sycl<float>(sycl_device)); + CALL_SUBTEST_2(test_floating_builtin_binary_sycl<float>(sycl_device)); + CALL_SUBTEST_3(test_integer_builtin_binary_sycl<int>(sycl_device)); + } +} diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu index 88c233994..97923d15f 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu @@ -9,20 +9,17 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> using Eigen::Tensor; -void test_cuda_conversion() { - Eigen::CudaStreamDevice stream; +void test_gpu_conversion() { + Eigen::GpuStreamDevice stream; 
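+ // GpuStreamDevice is the backend-neutral replacement for CudaStreamDevice,
+ // letting the same test drive either the CUDA or the HIP runtime.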
Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -75,8 +72,8 @@ void test_fallback_conversion() { } -void test_cxx11_tensor_cast_float16_cuda() +EIGEN_DECLARE_TEST(cxx11_tensor_cast_float16_gpu) { - CALL_SUBTEST(test_cuda_conversion()); + CALL_SUBTEST(test_gpu_conversion()); CALL_SUBTEST(test_fallback_conversion()); } diff --git a/unsupported/test/cxx11_tensor_casts.cpp b/unsupported/test/cxx11_tensor_casts.cpp index 3c6d0d2ff..45456f3ef 100644 --- a/unsupported/test/cxx11_tensor_casts.cpp +++ b/unsupported/test/cxx11_tensor_casts.cpp @@ -8,6 +8,7 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #include "main.h" +#include "random_without_cast_overflow.h" #include <Eigen/CXX11/Tensor> @@ -104,12 +105,82 @@ static void test_small_to_big_type_cast() } } +template <typename FromType, typename ToType> +static void test_type_cast() { + Tensor<FromType, 2> ftensor(100, 200); + // Generate random values for a valid cast. + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 200; ++j) { + ftensor(i, j) = internal::random_without_cast_overflow<FromType,ToType>::value(); + } + } + + Tensor<ToType, 2> ttensor(100, 200); + ttensor = ftensor.template cast<ToType>(); + + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 200; ++j) { + const ToType ref = internal::cast<FromType,ToType>(ftensor(i, j)); + VERIFY_IS_APPROX(ttensor(i, j), ref); + } + } +} + +template<typename Scalar, typename EnableIf = void> +struct test_cast_runner { + static void run() { + test_type_cast<Scalar, bool>(); + test_type_cast<Scalar, int8_t>(); + test_type_cast<Scalar, int16_t>(); + test_type_cast<Scalar, int32_t>(); + test_type_cast<Scalar, int64_t>(); + test_type_cast<Scalar, uint8_t>(); + test_type_cast<Scalar, uint16_t>(); + test_type_cast<Scalar, uint32_t>(); + test_type_cast<Scalar, uint64_t>(); + test_type_cast<Scalar, half>(); + test_type_cast<Scalar, bfloat16>(); + test_type_cast<Scalar, float>(); + test_type_cast<Scalar, double>(); + test_type_cast<Scalar, std::complex<float>>(); + test_type_cast<Scalar, std::complex<double>>(); + } +}; + +// Only certain types allow cast from std::complex<>. 
+template<typename Scalar> +struct test_cast_runner<Scalar, typename internal::enable_if<NumTraits<Scalar>::IsComplex>::type> { + static void run() { + test_type_cast<Scalar, half>(); + test_type_cast<Scalar, bfloat16>(); + test_type_cast<Scalar, std::complex<float>>(); + test_type_cast<Scalar, std::complex<double>>(); + } +}; + -void test_cxx11_tensor_casts() +EIGEN_DECLARE_TEST(cxx11_tensor_casts) { - CALL_SUBTEST(test_simple_cast()); - CALL_SUBTEST(test_vectorized_cast()); - CALL_SUBTEST(test_float_to_int_cast()); - CALL_SUBTEST(test_big_to_small_type_cast()); - CALL_SUBTEST(test_small_to_big_type_cast()); + CALL_SUBTEST(test_simple_cast()); + CALL_SUBTEST(test_vectorized_cast()); + CALL_SUBTEST(test_float_to_int_cast()); + CALL_SUBTEST(test_big_to_small_type_cast()); + CALL_SUBTEST(test_small_to_big_type_cast()); + + CALL_SUBTEST(test_cast_runner<bool>::run()); + CALL_SUBTEST(test_cast_runner<int8_t>::run()); + CALL_SUBTEST(test_cast_runner<int16_t>::run()); + CALL_SUBTEST(test_cast_runner<int32_t>::run()); + CALL_SUBTEST(test_cast_runner<int64_t>::run()); + CALL_SUBTEST(test_cast_runner<uint8_t>::run()); + CALL_SUBTEST(test_cast_runner<uint16_t>::run()); + CALL_SUBTEST(test_cast_runner<uint32_t>::run()); + CALL_SUBTEST(test_cast_runner<uint64_t>::run()); + CALL_SUBTEST(test_cast_runner<half>::run()); + CALL_SUBTEST(test_cast_runner<bfloat16>::run()); + CALL_SUBTEST(test_cast_runner<float>::run()); + CALL_SUBTEST(test_cast_runner<double>::run()); + CALL_SUBTEST(test_cast_runner<std::complex<float>>::run()); + CALL_SUBTEST(test_cast_runner<std::complex<double>>::run()); + } diff --git a/unsupported/test/cxx11_tensor_chipping.cpp b/unsupported/test/cxx11_tensor_chipping.cpp index 1832dec8b..922274462 100644 --- a/unsupported/test/cxx11_tensor_chipping.cpp +++ b/unsupported/test/cxx11_tensor_chipping.cpp @@ -43,7 +43,7 @@ static void test_simple_chip() VERIFY_IS_EQUAL(chip2.dimension(2), 7); VERIFY_IS_EQUAL(chip2.dimension(3), 11); for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (int j = 0; j < 5; ++j) { for (int k = 0; k < 7; ++k) { for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); @@ -75,7 +75,7 @@ static void test_simple_chip() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); } } @@ -126,7 +126,7 @@ static void test_dynamic_chip() VERIFY_IS_EQUAL(chip2.dimension(2), 7); VERIFY_IS_EQUAL(chip2.dimension(3), 11); for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { + for (int j = 0; j < 5; ++j) { for (int k = 0; k < 7; ++k) { for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l)); @@ -158,7 +158,7 @@ static void test_dynamic_chip() for (int i = 0; i < 2; ++i) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 5; ++k) { - for (int l = 0; l < 7; ++l) { + for (int l = 0; l < 11; ++l) { VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l)); } } @@ -410,7 +410,7 @@ static void test_chip_raw_data_row_major() VERIFY_IS_EQUAL(chip4.data(), static_cast<float*>(0)); } -void test_cxx11_tensor_chipping() +EIGEN_DECLARE_TEST(cxx11_tensor_chipping) { CALL_SUBTEST(test_simple_chip<ColMajor>()); CALL_SUBTEST(test_simple_chip<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp new file mode 100644 index 000000000..1e7093104 --- /dev/null +++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp @@ 
-0,0 +1,623 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_static_chip_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); + + tensor.setRandom(); + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(1l); + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2); + VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim2; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange); + const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType); + DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange); + + gpu_chip2.device(sycl_device)=gpu_tensor.template chip<1l>(1l); + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + 
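// chip<1>(1) pins the second dimension at index 1, so every element of the
+ // 4-D chip maps back to tensor(i, 1l, j, k, l) in the original 5-D tensor.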
VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange); + const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType); + DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange); + + gpu_chip3.device(sycl_device)=gpu_tensor.template chip<2l>(2l); + sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize); + + VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l)); + } + } + } + } + + array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange); + const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType); + DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange); + + gpu_chip4.device(sycl_device)=gpu_tensor.template chip<3l>(5l); + sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize); + + VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l)); + } + } + } + } + + + array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange); + const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType); + DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange); + + gpu_chip5.device(sycl_device)=gpu_tensor.template chip<4l>(7l); + sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize); + + VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l)); + } + } + } + } + + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_chip2); + sycl_device.deallocate(gpu_data_chip3); + sycl_device.deallocate(gpu_data_chip4); + sycl_device.deallocate(gpu_data_chip5); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_dynamic_chip_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType 
sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); + + tensor.setRandom(); + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_chip1.device(sycl_device)=gpu_tensor.chip(1l,0l); + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + VERIFY_IS_EQUAL(chip1.dimension(0), sizeDim2); + VERIFY_IS_EQUAL(chip1.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip1.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip1.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim2; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1l,i,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip2TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip2(chip2TensorRange); + const size_t chip2TensorBuffSize =chip2.size()*sizeof(DataType); + DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange); + + gpu_chip2.device(sycl_device)=gpu_tensor.chip(1l,1l); + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + VERIFY_IS_EQUAL(chip2.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip2.dimension(1), sizeDim3); + VERIFY_IS_EQUAL(chip2.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip2.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim3; ++j) { + for (IndexType k = 0; k < sizeDim4; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1l,j,k,l)); + } + } + } + } + + array<IndexType, 4> chip3TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip3(chip3TensorRange); + const size_t chip3TensorBuffSize =chip3.size()*sizeof(DataType); + DataType* gpu_data_chip3 = static_cast<DataType*>(sycl_device.allocate(chip3TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip3(gpu_data_chip3, chip3TensorRange); + + gpu_chip3.device(sycl_device)=gpu_tensor.chip(2l,2l); + sycl_device.memcpyDeviceToHost(chip3.data(), gpu_data_chip3, chip3TensorBuffSize); + + VERIFY_IS_EQUAL(chip3.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip3.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip3.dimension(2), sizeDim4); + VERIFY_IS_EQUAL(chip3.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim4; 
++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2l,k,l)); + } + } + } + } + + array<IndexType, 4> chip4TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> chip4(chip4TensorRange); + const size_t chip4TensorBuffSize =chip4.size()*sizeof(DataType); + DataType* gpu_data_chip4 = static_cast<DataType*>(sycl_device.allocate(chip4TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip4(gpu_data_chip4, chip4TensorRange); + + gpu_chip4.device(sycl_device)=gpu_tensor.chip(5l,3l); + sycl_device.memcpyDeviceToHost(chip4.data(), gpu_data_chip4, chip4TensorBuffSize); + + VERIFY_IS_EQUAL(chip4.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip4.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip4.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip4.dimension(3), sizeDim5); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim5; ++l) { + VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5l,l)); + } + } + } + } + + + array<IndexType, 4> chip5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> chip5(chip5TensorRange); + const size_t chip5TensorBuffSize =chip5.size()*sizeof(DataType); + DataType* gpu_data_chip5 = static_cast<DataType*>(sycl_device.allocate(chip5TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip5(gpu_data_chip5, chip5TensorRange); + + gpu_chip5.device(sycl_device)=gpu_tensor.chip(7l,4l); + sycl_device.memcpyDeviceToHost(chip5.data(), gpu_data_chip5, chip5TensorBuffSize); + + VERIFY_IS_EQUAL(chip5.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(chip5.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(chip5.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(chip5.dimension(3), sizeDim4); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7l)); + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_chip2); + sycl_device.deallocate(gpu_data_chip3); + sycl_device.deallocate(gpu_data_chip4); + sycl_device.deallocate(gpu_data_chip5); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_chip_in_expr(const Eigen::SyclDevice& sycl_device) { + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> chip1TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + + Tensor<DataType, 4, DataLayout,IndexType> chip1(chip1TensorRange); + Tensor<DataType, 4, DataLayout,IndexType> tensor1(chip1TensorRange); + tensor.setRandom(); + tensor1.setRandom(); + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t chip1TensorBuffSize =chip1.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_chip1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + DataType* gpu_data_tensor1 = static_cast<DataType*>(sycl_device.allocate(chip1TensorBuffSize)); + + 
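// TensorMap wraps each raw device allocation as a tensor expression without
+ // copying, so the chip-plus-add expressions below run directly on the device.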
TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_chip1(gpu_data_chip1, chip1TensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor1(gpu_data_tensor1, chip1TensorRange); + + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + sycl_device.memcpyHostToDevice(gpu_data_tensor1, tensor1.data(), chip1TensorBuffSize); + gpu_chip1.device(sycl_device)=gpu_tensor.template chip<0l>(0l) + gpu_tensor1; + sycl_device.memcpyDeviceToHost(chip1.data(), gpu_data_chip1, chip1TensorBuffSize); + + for (int i = 0; i < sizeDim2; ++i) { + for (int j = 0; j < sizeDim3; ++j) { + for (int k = 0; k < sizeDim4; ++k) { + for (int l = 0; l < sizeDim5; ++l) { + float expected = tensor(0l,i,j,k,l) + tensor1(i,j,k,l); + VERIFY_IS_EQUAL(chip1(i,j,k,l), expected); + } + } + } + } + + array<IndexType, 3> chip2TensorRange = {{sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 3, DataLayout,IndexType> tensor2(chip2TensorRange); + Tensor<DataType, 3, DataLayout,IndexType> chip2(chip2TensorRange); + tensor2.setRandom(); + const size_t chip2TensorBuffSize =tensor2.size()*sizeof(DataType); + DataType* gpu_data_tensor2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + DataType* gpu_data_chip2 = static_cast<DataType*>(sycl_device.allocate(chip2TensorBuffSize)); + TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_tensor2(gpu_data_tensor2, chip2TensorRange); + TensorMap<Tensor<DataType, 3, DataLayout,IndexType>> gpu_chip2(gpu_data_chip2, chip2TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor2, tensor2.data(), chip2TensorBuffSize); + gpu_chip2.device(sycl_device)=gpu_tensor.template chip<0l>(0l).template chip<1l>(2l) + gpu_tensor2; + sycl_device.memcpyDeviceToHost(chip2.data(), gpu_data_chip2, chip2TensorBuffSize); + + for (int i = 0; i < sizeDim2; ++i) { + for (int j = 0; j < sizeDim4; ++j) { + for (int k = 0; k < sizeDim5; ++k) { + float expected = tensor(0l,i,2l,j,k) + tensor2(i,j,k); + VERIFY_IS_EQUAL(chip2(i,j,k), expected); + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_tensor1); + sycl_device.deallocate(gpu_data_chip1); + sycl_device.deallocate(gpu_data_tensor2); + sycl_device.deallocate(gpu_data_chip2); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + array<IndexType, 4> input2TensorRange = {{sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + + Tensor<DataType, 5, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 5, DataLayout,IndexType> input1(tensorRange); + Tensor<DataType, 4, DataLayout,IndexType> input2(input2TensorRange); + input1.setRandom(); + input2.setRandom(); + + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + const size_t input2TensorBuffSize =input2.size()*sizeof(DataType); + std::cout << tensorBuffSize << " , "<< input2TensorBuffSize << std::endl; + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_input1 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_input2 = static_cast<DataType*>(sycl_device.allocate(input2TensorBuffSize)); + + 
TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input1(gpu_data_input1, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input2(gpu_data_input2, input2TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input1, input1.data(), tensorBuffSize); + gpu_tensor.device(sycl_device)=gpu_input1; + sycl_device.memcpyHostToDevice(gpu_data_input2, input2.data(), input2TensorBuffSize); + gpu_tensor.template chip<0l>(1l).device(sycl_device)=gpu_input2; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k < sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (i != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input2(j,k,l,m)); + } + } + } + } + } + } + + gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input3TensorRange = {{sizeDim1, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> input3(input3TensorRange); + input3.setRandom(); + + const size_t input3TensorBuffSize =input3.size()*sizeof(DataType); + DataType* gpu_data_input3 = static_cast<DataType*>(sycl_device.allocate(input3TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input3(gpu_data_input3, input3TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input3, input3.data(), input3TensorBuffSize); + gpu_tensor.template chip<1l>(1l).device(sycl_device)=gpu_input3; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (j != 1) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input3(i,k,l,m)); + } + } + } + } + } + } + + gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input4TensorRange = {{sizeDim1, sizeDim2, sizeDim4, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> input4(input4TensorRange); + input4.setRandom(); + + const size_t input4TensorBuffSize =input4.size()*sizeof(DataType); + DataType* gpu_data_input4 = static_cast<DataType*>(sycl_device.allocate(input4TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input4(gpu_data_input4, input4TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input4, input4.data(), input4TensorBuffSize); + gpu_tensor.template chip<2l>(3l).device(sycl_device)=gpu_input4; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (k != 3) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input4(i,j,l,m)); + } + } + } + } + } + } + + gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input5TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim5}}; + Tensor<DataType, 4, DataLayout,IndexType> input5(input5TensorRange); + input5.setRandom(); + + const size_t input5TensorBuffSize =input5.size()*sizeof(DataType); + DataType* gpu_data_input5 = 
static_cast<DataType*>(sycl_device.allocate(input5TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input5(gpu_data_input5, input5TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input5, input5.data(), input5TensorBuffSize); + gpu_tensor.template chip<3l>(4l).device(sycl_device)=gpu_input5; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (l != 4) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input5(i,j,k,m)); + } + } + } + } + } + } + gpu_tensor.device(sycl_device)=gpu_input1; + array<IndexType, 4> input6TensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> input6(input6TensorRange); + input6.setRandom(); + + const size_t input6TensorBuffSize =input6.size()*sizeof(DataType); + DataType* gpu_data_input6 = static_cast<DataType*>(sycl_device.allocate(input6TensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_input6(gpu_data_input6, input6TensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input6, input6.data(), input6TensorBuffSize); + gpu_tensor.template chip<4l>(5l).device(sycl_device)=gpu_input6; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (m != 5) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input6(i,j,k,l)); + } + } + } + } + } + } + + + gpu_tensor.device(sycl_device)=gpu_input1; + Tensor<DataType, 5, DataLayout,IndexType> input7(tensorRange); + input7.setRandom(); + + DataType* gpu_data_input7 = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_input7(gpu_data_input7, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_input7, input7.data(), tensorBuffSize); + gpu_tensor.chip(0l,0l).device(sycl_device)=gpu_input7.chip(0l,0l); + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data_tensor, tensorBuffSize); + + for (int i = 0; i < sizeDim1; ++i) { + for (int j = 0; j < sizeDim2; ++j) { + for (int k = 0; k <sizeDim3; ++k) { + for (int l = 0; l < sizeDim4; ++l) { + for (int m = 0; m < sizeDim5; ++m) { + if (i != 0) { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input1(i,j,k,l,m)); + } else { + VERIFY_IS_EQUAL(tensor(i,j,k,l,m), input7(i,j,k,l,m)); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_input1); + sycl_device.deallocate(gpu_data_input2); + sycl_device.deallocate(gpu_data_input3); + sycl_device.deallocate(gpu_data_input4); + sycl_device.deallocate(gpu_data_input5); + sycl_device.deallocate(gpu_data_input6); + sycl_device.deallocate(gpu_data_input7); + +} + +template<typename DataType, typename dev_Selector> void sycl_chipping_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + /* test_static_chip_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_static_chip_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_dynamic_chip_sycl<DataType, RowMajor, int64_t>(sycl_device); + 
test_dynamic_chip_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_chip_in_expr<DataType, RowMajor, int64_t>(sycl_device); + test_chip_in_expr<DataType, ColMajor, int64_t>(sycl_device);*/ + test_chip_as_lvalue_sycl<DataType, RowMajor, int64_t>(sycl_device); + // test_chip_as_lvalue_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_chipping_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_comparisons.cpp b/unsupported/test/cxx11_tensor_comparisons.cpp index b1ff8aecb..1a18e07cc 100644 --- a/unsupported/test/cxx11_tensor_comparisons.cpp +++ b/unsupported/test/cxx11_tensor_comparisons.cpp @@ -77,7 +77,7 @@ static void test_equality() } -void test_cxx11_tensor_comparisons() +EIGEN_DECLARE_TEST(cxx11_tensor_comparisons) { CALL_SUBTEST(test_orderings()); CALL_SUBTEST(test_equality()); diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu index 2baf5eaad..99447b21d 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu @@ -8,12 +8,9 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops + #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> @@ -31,7 +28,7 @@ void test_cuda_complex_cwise_ops() { cudaMalloc((void**)(&d_in2), complex_bytes); cudaMalloc((void**)(&d_out), complex_bytes); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1( @@ -51,11 +48,13 @@ void test_cuda_complex_cwise_ops() { Add = 0, Sub, Mul, - Div + Div, + Neg, + NbOps }; Tensor<std::complex<T>, 1, 0, int> actual(kNumItems); - for (int op = Add; op <= Div; op++) { + for (int op = Add; op < NbOps; op++) { std::complex<T> expected; switch (static_cast<CwiseOp>(op)) { case Add: @@ -74,6 +73,12 @@ void test_cuda_complex_cwise_ops() { gpu_out.device(gpu_device) = gpu_in1 / gpu_in2; expected = a / b; break; + case Neg: + gpu_out.device(gpu_device) = -gpu_in1; + expected = -a; + break; + case NbOps: + break; } assert(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); @@ -90,7 +95,7 @@ void test_cuda_complex_cwise_ops() { } -void test_cxx11_tensor_complex_cwise_ops() +EIGEN_DECLARE_TEST(test_cxx11_tensor_complex_cwise_ops) { CALL_SUBTEST(test_cuda_complex_cwise_ops<float>()); CALL_SUBTEST(test_cuda_complex_cwise_ops<double>()); diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_gpu.cu index d4e111f5d..f8b8ae704 100644 --- a/unsupported/test/cxx11_tensor_complex_cuda.cu +++ b/unsupported/test/cxx11_tensor_complex_gpu.cu @@ -8,12 +8,9 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_FUNC cxx11_tensor_complex + #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> @@ -37,7 +34,7 @@ void test_cuda_nullary() { cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1( @@ -73,7 +70,7 @@ void test_cuda_nullary() { static void test_cuda_sum_reductions() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); const int num_rows = internal::random<int>(1024, 5*1024); @@ -107,10 +104,45 @@ static void test_cuda_sum_reductions() { gpu_device.deallocate(gpu_out_ptr); } +static void test_cuda_mean_reductions() { + + Eigen::GpuStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + const int num_rows = internal::random<int>(1024, 5*1024); + const int num_cols = internal::random<int>(1024, 5*1024); + + Tensor<std::complex<float>, 2> in(num_rows, num_cols); + in.setRandom(); + + Tensor<std::complex<float>, 0> full_redux; + full_redux = in.mean(); + + std::size_t in_bytes = in.size() * sizeof(std::complex<float>); + std::size_t out_bytes = full_redux.size() * sizeof(std::complex<float>); + std::complex<float>* gpu_in_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(in_bytes)); + std::complex<float>* gpu_out_ptr = static_cast<std::complex<float>*>(gpu_device.allocate(out_bytes)); + gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); + + TensorMap<Tensor<std::complex<float>, 2> > in_gpu(gpu_in_ptr, num_rows, num_cols); + TensorMap<Tensor<std::complex<float>, 0> > out_gpu(gpu_out_ptr); + + out_gpu.device(gpu_device) = in_gpu.mean(); + + Tensor<std::complex<float>, 0> full_redux_gpu; + gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); + gpu_device.synchronize(); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux(), full_redux_gpu()); + + gpu_device.deallocate(gpu_in_ptr); + gpu_device.deallocate(gpu_out_ptr); +} static void test_cuda_product_reductions() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); const int num_rows = internal::random<int>(1024, 5*1024); @@ -145,9 +177,10 @@ static void test_cuda_product_reductions() { } -void test_cxx11_tensor_complex() +EIGEN_DECLARE_TEST(test_cxx11_tensor_complex) { CALL_SUBTEST(test_cuda_nullary()); CALL_SUBTEST(test_cuda_sum_reductions()); + CALL_SUBTEST(test_cuda_mean_reductions()); CALL_SUBTEST(test_cuda_product_reductions()); } diff --git a/unsupported/test/cxx11_tensor_concatenation.cpp b/unsupported/test/cxx11_tensor_concatenation.cpp index 03ef12e63..bb9418d33 100644 --- a/unsupported/test/cxx11_tensor_concatenation.cpp +++ b/unsupported/test/cxx11_tensor_concatenation.cpp @@ -50,7 +50,13 @@ static void test_static_dimension_failure() .reshape(Tensor<int, 3>::Dimensions(2, 3, 1)) .concatenate(right, 0); Tensor<int, 2, DataLayout> alternative = left - .concatenate(right.reshape(Tensor<int, 2>::Dimensions{{{2, 3}}}), 0); + // Clang compiler break with {{{}}} with an ambiguous error on copy constructor + // the variadic DSize constructor added for #ifndef EIGEN_EMULATE_CXX11_META_H. 
+ // Solution: + // either the code should change to + // Tensor<int, 2>::Dimensions{{2, 3}} + // or Tensor<int, 2>::Dimensions{Tensor<int, 2>::Dimensions{{2, 3}}} + .concatenate(right.reshape(Tensor<int, 2>::Dimensions(2, 3)), 0); } template<int DataLayout> @@ -123,7 +129,7 @@ static void test_concatenation_as_lvalue() } -void test_cxx11_tensor_concatenation() +EIGEN_DECLARE_TEST(cxx11_tensor_concatenation) { CALL_SUBTEST(test_dimension_failures<ColMajor>()); CALL_SUBTEST(test_dimension_failures<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_concatenation_sycl.cpp b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp new file mode 100644 index 000000000..765991b35 --- /dev/null +++ b/unsupported/test/cxx11_tensor_concatenation_sycl.cpp @@ -0,0 +1,180 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template<typename DataType, int DataLayout, typename IndexType> +static void test_simple_concatenation(const Eigen::SyclDevice& sycl_device) +{ + IndexType leftDim1 = 2; + IndexType leftDim2 = 3; + IndexType leftDim3 = 1; + Eigen::array<IndexType, 3> leftRange = {{leftDim1, leftDim2, leftDim3}}; + IndexType rightDim1 = 2; + IndexType rightDim2 = 3; + IndexType rightDim3 = 1; + Eigen::array<IndexType, 3> rightRange = {{rightDim1, rightDim2, rightDim3}}; + + //IndexType concatDim1 = 3; +// IndexType concatDim2 = 3; +// IndexType concatDim3 = 1; + //Eigen::array<IndexType, 3> concatRange = {{concatDim1, concatDim2, concatDim3}}; + + Tensor<DataType, 3, DataLayout, IndexType> left(leftRange); + Tensor<DataType, 3, DataLayout, IndexType> right(rightRange); + left.setRandom(); + right.setRandom(); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange); + sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); + /// + Tensor<DataType, 3, DataLayout, IndexType> concatenation1(leftDim1+rightDim1, leftDim2, leftDim3); + DataType * gpu_out_data1 = static_cast<DataType*>(sycl_device.allocate(concatenation1.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out1(gpu_out_data1, concatenation1.dimensions()); + + //concatenation = left.concatenate(right, 0); + gpu_out1.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 0); + sycl_device.memcpyDeviceToHost(concatenation1.data(), 
gpu_out_data1,(concatenation1.dimensions().TotalSize())*sizeof(DataType)); + + VERIFY_IS_EQUAL(concatenation1.dimension(0), 4); + VERIFY_IS_EQUAL(concatenation1.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation1.dimension(2), 1); + for (IndexType j = 0; j < 3; ++j) { + for (IndexType i = 0; i < 2; ++i) { + VERIFY_IS_EQUAL(concatenation1(i, j, 0), left(i, j, 0)); + } + for (IndexType i = 2; i < 4; ++i) { + VERIFY_IS_EQUAL(concatenation1(i, j, 0), right(i - 2, j, 0)); + } + } + + sycl_device.deallocate(gpu_out_data1); + Tensor<DataType, 3, DataLayout, IndexType> concatenation2(leftDim1, leftDim2 +rightDim2, leftDim3); + DataType * gpu_out_data2 = static_cast<DataType*>(sycl_device.allocate(concatenation2.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out2(gpu_out_data2, concatenation2.dimensions()); + gpu_out2.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 1); + sycl_device.memcpyDeviceToHost(concatenation2.data(), gpu_out_data2,(concatenation2.dimensions().TotalSize())*sizeof(DataType)); + + //concatenation = left.concatenate(right, 1); + VERIFY_IS_EQUAL(concatenation2.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation2.dimension(1), 6); + VERIFY_IS_EQUAL(concatenation2.dimension(2), 1); + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation2(i, j, 0), left(i, j, 0)); + } + for (IndexType j = 3; j < 6; ++j) { + VERIFY_IS_EQUAL(concatenation2(i, j, 0), right(i, j - 3, 0)); + } + } + sycl_device.deallocate(gpu_out_data2); + Tensor<DataType, 3, DataLayout, IndexType> concatenation3(leftDim1, leftDim2, leftDim3+rightDim3); + DataType * gpu_out_data3 = static_cast<DataType*>(sycl_device.allocate(concatenation3.dimensions().TotalSize()*sizeof(DataType))); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out3(gpu_out_data3, concatenation3.dimensions()); + gpu_out3.device(sycl_device) =gpu_in1.concatenate(gpu_in2, 2); + sycl_device.memcpyDeviceToHost(concatenation3.data(), gpu_out_data3,(concatenation3.dimensions().TotalSize())*sizeof(DataType)); + + //concatenation = left.concatenate(right, 2); + VERIFY_IS_EQUAL(concatenation3.dimension(0), 2); + VERIFY_IS_EQUAL(concatenation3.dimension(1), 3); + VERIFY_IS_EQUAL(concatenation3.dimension(2), 2); + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(concatenation3(i, j, 0), left(i, j, 0)); + VERIFY_IS_EQUAL(concatenation3(i, j, 1), right(i, j, 0)); + } + } + sycl_device.deallocate(gpu_out_data3); + sycl_device.deallocate(gpu_in1_data); + sycl_device.deallocate(gpu_in2_data); +} +template<typename DataType, int DataLayout, typename IndexType> +static void test_concatenation_as_lvalue(const Eigen::SyclDevice& sycl_device) +{ + + IndexType leftDim1 = 2; + IndexType leftDim2 = 3; + Eigen::array<IndexType, 2> leftRange = {{leftDim1, leftDim2}}; + + IndexType rightDim1 = 2; + IndexType rightDim2 = 3; + Eigen::array<IndexType, 2> rightRange = {{rightDim1, rightDim2}}; + + IndexType concatDim1 = 4; + IndexType concatDim2 = 3; + Eigen::array<IndexType, 2> resRange = {{concatDim1, concatDim2}}; + + Tensor<DataType, 2, DataLayout, IndexType> left(leftRange); + Tensor<DataType, 2, DataLayout, IndexType> right(rightRange); + Tensor<DataType, 2, DataLayout, IndexType> result(resRange); + + left.setRandom(); + right.setRandom(); + result.setRandom(); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(left.dimensions().TotalSize()*sizeof(DataType))); + 
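// The concatenation view owns no storage of its own; separate device buffers
+ // back the two inputs and the 4x3 result used in the lvalue test below.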
DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(right.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); + + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in1(gpu_in1_data, leftRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_in2(gpu_in2_data, rightRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(gpu_out_data, resRange); + + sycl_device.memcpyHostToDevice(gpu_in1_data, left.data(),(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, right.data(),(right.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_out_data, result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); + +// t1.concatenate(t2, 0) = result; + gpu_in1.concatenate(gpu_in2, 0).device(sycl_device) =gpu_out; + sycl_device.memcpyDeviceToHost(left.data(), gpu_in1_data,(left.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(right.data(), gpu_in2_data,(right.dimensions().TotalSize())*sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + VERIFY_IS_EQUAL(left(i, j), result(i, j)); + VERIFY_IS_EQUAL(right(i, j), result(i+2, j)); + } + } + sycl_device.deallocate(gpu_in1_data); + sycl_device.deallocate(gpu_in2_data); + sycl_device.deallocate(gpu_out_data); +} + + +template <typename DataType, typename Dev_selector> void tensorConcat_perDevice(Dev_selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_concatenation<DataType, RowMajor, int64_t>(sycl_device); + test_simple_concatenation<DataType, ColMajor, int64_t>(sycl_device); + test_concatenation_as_lvalue<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_concatenation_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorConcat_perDevice<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_const.cpp b/unsupported/test/cxx11_tensor_const.cpp index ad9c9da39..9d806ee3c 100644 --- a/unsupported/test/cxx11_tensor_const.cpp +++ b/unsupported/test/cxx11_tensor_const.cpp @@ -55,7 +55,7 @@ static void test_assign_of_const_tensor() } -void test_cxx11_tensor_const() +EIGEN_DECLARE_TEST(cxx11_tensor_const) { CALL_SUBTEST(test_simple_assign()); CALL_SUBTEST(test_assign_of_const_tensor()); diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu index dd68430ce..575bdc1f9 100644 --- a/unsupported/test/cxx11_tensor_contract_cuda.cu +++ b/unsupported/test/cxx11_tensor_contract_gpu.cu @@ -10,21 +10,20 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_cuda + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> + using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; template<int DataLayout> -void test_cuda_contraction(int m_size, int k_size, int n_size) +void test_gpu_contraction(int m_size, int k_size, int n_size) { std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these 
dimensions, the output has 300 * 140 elements, which is @@ -47,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) float* d_t_right; float* d_t_result; - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_left), t_left_bytes); + gpuMalloc((void**)(&d_t_right), t_right_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > @@ -68,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); t_result = t_left.contract(t_right, dims); - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); for (DenseIndex i = 0; i < t_result.size(); i++) { if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { continue; @@ -81,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) assert(false); } - cudaFree((void*)d_t_left); - cudaFree((void*)d_t_right); - cudaFree((void*)d_t_result); + gpuFree((void*)d_t_left); + gpuFree((void*)d_t_right); + gpuFree((void*)d_t_result); } @@ -111,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size) float* d_t_right; float* d_t_result; - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_left), t_left_bytes); + gpuMalloc((void**)(&d_t_right), t_right_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > @@ -131,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size) gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); t_result = t_left.contract(t_right, dims); - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); if (fabs(t_result() - t_result_gpu()) > 1e-4f && !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { std::cout << "mismatch detected: " << t_result() @@ -139,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size) assert(false); } - cudaFree((void*)d_t_left); - cudaFree((void*)d_t_right); - cudaFree((void*)d_t_result); + gpuFree((void*)d_t_left); + gpuFree((void*)d_t_right); + gpuFree((void*)d_t_result); } template<int DataLayout> -void test_cuda_contraction_m() { +void test_gpu_contraction_m() { for (int k = 32; k < 256; k++) { - test_cuda_contraction<ColMajor>(k, 
128, 128); - test_cuda_contraction<RowMajor>(k, 128, 128); + test_gpu_contraction<ColMajor>(k, 128, 128); + test_gpu_contraction<RowMajor>(k, 128, 128); } } template<int DataLayout> -void test_cuda_contraction_k() { +void test_gpu_contraction_k() { for (int k = 32; k < 256; k++) { - test_cuda_contraction<ColMajor>(128, k, 128); - test_cuda_contraction<RowMajor>(128, k, 128); + test_gpu_contraction<ColMajor>(128, k, 128); + test_gpu_contraction<RowMajor>(128, k, 128); } } template<int DataLayout> -void test_cuda_contraction_n() { +void test_gpu_contraction_n() { for (int k = 32; k < 256; k++) { - test_cuda_contraction<ColMajor>(128, 128, k); - test_cuda_contraction<RowMajor>(128, 128, k); + test_gpu_contraction<ColMajor>(128, 128, k); + test_gpu_contraction<RowMajor>(128, 128, k); } } template<int DataLayout> -void test_cuda_contraction_sizes() { +void test_gpu_contraction_sizes() { int m_sizes[] = { 31, 39, 63, 64, 65, 127, 129, 255, 257 , 511, 512, 513, 1023, 1024, 1025}; @@ -188,29 +187,32 @@ void test_cuda_contraction_sizes() { for (int i = 0; i < 15; i++) { for (int j = 0; j < 15; j++) { for (int k = 0; k < 17; k++) { - test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); + test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); } } } } -void test_cxx11_tensor_cuda() +EIGEN_DECLARE_TEST(cxx11_tensor_contract_gpu) { - CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128)); - CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128)); CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128)); CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128)); - CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>()); - CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>()); + CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>()); + CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>()); - CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>()); - CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>()); + CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>()); + CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>()); - CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>()); - CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>()); + CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>()); + CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>()); - CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>()); - CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>()); +#if !defined(EIGEN_USE_HIP) +// disable these subtests for HIP + CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>()); + CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>()); +#endif } diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp new file mode 100644 index 000000000..fbcc29358 --- /dev/null +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -0,0 +1,1026 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
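Editorial note: the new cxx11_tensor_contract_sycl.cpp below repeats one verification pattern throughout: allocate device memory, wrap it in a TensorMap, run the contraction on the device, copy the result back, and compare element-wise against the same contraction evaluated on the host. A minimal sketch of that round trip, assuming an already constructed Eigen::SyclDevice named sycl_device; the names are illustrative and not part of the patch: typedef Eigen::Tensor<float, 2, Eigen::RowMajor, int64_t> T2; typedef T2::DimensionPair DimPair; T2 lhs(64, 32), rhs(32, 48), ref(64, 48), out(64, 48); lhs.setRandom(); rhs.setRandom(); // Device-side buffers sized to the host tensors. float *d_lhs = static_cast<float *>(sycl_device.allocate(lhs.size() * sizeof(float))); float *d_rhs = static_cast<float *>(sycl_device.allocate(rhs.size() * sizeof(float))); float *d_out = static_cast<float *>(sycl_device.allocate(out.size() * sizeof(float))); Eigen::TensorMap<T2> gpu_lhs(d_lhs, 64, 32), gpu_rhs(d_rhs, 32, 48), gpu_out(d_out, 64, 48); sycl_device.memcpyHostToDevice(d_lhs, lhs.data(), lhs.size() * sizeof(float)); sycl_device.memcpyHostToDevice(d_rhs, rhs.data(), rhs.size() * sizeof(float)); // Pair dim 1 of lhs with dim 0 of rhs (a plain matrix product) and run it on the device. Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; gpu_out.device(sycl_device) = gpu_lhs.contract(gpu_rhs, dims); sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float)); // Host reference; any element-wise mismatch fails the test. ref = lhs.contract(rhs, dims); for (int64_t i = 0; i < ref.size(); i++) VERIFY_IS_APPROX(out.data()[i], ref.data()[i]); sycl_device.deallocate(d_lhs); sycl_device.deallocate(d_rhs); sycl_device.deallocate(d_out);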
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include <algorithm> +#include <chrono> +#include <ctime> +#include <iostream> + +#include "main.h" + +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void static test_sycl_contraction(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); + Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(m_size, n_size); + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}}; + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, result_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_m(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, k, 128, + 128); + } +} + 
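A reading aid for the pairings used above and below: DimPair(a, b) contracts dimension a of the left operand against dimension b of the right one, so DimPair(1, 0) on rank-2 tensors is an ordinary matrix product, and the lhs/rhs/both "transposed" variants later in this file only change which dimensions are paired. A host-only illustration of that equivalence (a sketch, not part of the patch): typedef Eigen::Tensor<float, 2>::DimensionPair DimPair; Eigen::Tensor<float, 2> a(2, 3), b(3, 4); a.setRandom(); b.setRandom(); // Sum over dim 1 of a and dim 0 of b: a classic (2x3)*(3x4) product. Eigen::array<DimPair, 1> pairs = {{DimPair(1, 0)}}; Eigen::Tensor<float, 2> c = a.contract(b, pairs); // shape 2 x 4 // The equivalent triple loop, useful for checking a pairing by hand. for (int i = 0; i < 2; ++i) { for (int j = 0; j < 4; ++j) { float acc = 0.0f; for (int k = 0; k < 3; ++k) acc += a(i, k) * b(k, j); VERIFY_IS_APPROX(c(i, j), acc); } }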
+template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_k(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, k, + 128); + } +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_n(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>(sycl_device, 128, + 128, k); + } +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_sycl_contraction_sizes(const Device &sycl_device) { + IndexType m_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, + 257, 511, 512, 513, 1023, 1024, 1025}; + + IndexType n_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, + 257, 511, 512, 513, 1023, 1024, 1025}; + + IndexType k_sizes[] = {31, 39, 63, 64, 65, 95, 96, 127, 129, + 255, 257, 511, 512, 513, 1023, 1024, 1025}; + + for (IndexType i = 0; i < 15; i++) { + for (IndexType j = 0; j < 15; j++) { + for (IndexType k = 0; k < 17; k++) { + test_sycl_contraction<DataLayout, DataType, IndexType>( + sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); + } + } + } +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); + Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); + Tensor<DataType, 2, DataLayout, IndexType> t_result(m_size, n_size); + + Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}}; + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> result_dims = {{m_size, n_size}}; + + t_left.setRandom(); + t_right.setRandom(); + + // Allocate buffers twice as big to check for invalid reads and writes + auto padded_left_size = 2 * t_left.size(); + auto padded_right_size = 2 * t_right.size(); + auto padded_result_size = 2 * t_result.size(); + + std::size_t t_left_bytes = padded_left_size * sizeof(DataType); + std::size_t t_right_bytes = padded_right_size * sizeof(DataType); + std::size_t t_result_bytes = padded_result_size * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + // The TensorMaps are still the same size as the Tensors + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, result_dims); + + // Write NaNs after the actual buffer so that invalid reads propagate NaNs + // everywhere in the result + DataType nan = std::numeric_limits<DataType>::quiet_NaN(); + auto host_left_data = new DataType[padded_left_size]; + std::copy_n(t_left.data(), t_left.size(), host_left_data); + std::fill_n(host_left_data + t_left.size(), t_left.size(), nan); + auto
host_right_data = new DataType[padded_right_size]; + std::copy_n(t_right.data(), t_right.size(), host_right_data); + std::fill_n(host_right_data + t_right.size(), t_right.size(), nan); + auto host_result_data = new DataType[padded_result_size]; + std::fill_n(host_result_data, padded_result_size, nan); + + sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes); + sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - host_result_data[i]))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), host_result_data[i], + error_threshold)) { + continue; + } + if (std::isnan(host_result_data[i])) { + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", invalid read detected at IndexType " << i << ": " + << t_result(i) << " vs " << host_result_data[i] << std::endl; + } else { + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " + << t_result(i) << " vs " << host_result_data[i] << std::endl; + } + VERIFY_IS_APPROX(host_result_data[i], t_result(i)); + } + // Make sure that the rest of the result is still nans + for (IndexType i = t_result.size(); i < padded_result_size; i++) { + if (std::isnan(host_result_data[i])) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", invalid write detected at IndexType " << i << ": " + << host_result_data[i] << std::endl; + VERIFY_IS_APPROX(host_result_data[i], t_result(i)); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); + + delete[] host_left_data; + delete[] host_right_data; + delete[] host_result_data; +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size, + IndexType n_size) { + // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << + // ")" << std::endl; + // with these dimensions, the output has 300 * 140 elements, which is + // more than 30 * 1024, which is the number of threads in blocks on + // a 15 SM GK110 GPU + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Tensor<DataType, 2, DataLayout, IndexType> t_left(m_size, k_size); + Tensor<DataType, 2, DataLayout, IndexType> t_right(k_size, n_size); + Tensor<DataType, 0, DataLayout, IndexType> t_result; + Tensor<DataType, 0, DataLayout, IndexType> t_result_gpu; + Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}}; + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + 
static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 0, DataLayout, IndexType>> + gpu_t_result(d_t_result); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result() - t_result_gpu()))) > error_threshold && + !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) { + std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size + << " : mismatch detected: " << t_result() << " vs " + << t_result_gpu() << std::endl; + VERIFY_IS_APPROX(t_result_gpu(), t_result()); + } + + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void contraction_batch(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size, IndexType m_batch, + IndexType start, IndexType limit) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + typedef Eigen::array<IndexType, 3> TensorDim; + typedef Eigen::Tensor<DataType, 3, DataLayout, IndexType> TensorType; + TensorDim left_dims = {{m_batch, k_size, m_size}}; + TensorDim right_dims = {{m_batch, n_size, k_size}}; + TensorDim res_dims = {{m_batch, m_size, n_size}}; + Eigen::array<DimPair, 1> contract_pairs = {{DimPair(0, 1)}}; + + TensorType t_left(left_dims); + TensorType t_right(right_dims); + TensorType t_result_gpu(res_dims); + TensorType t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<TensorType> gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<TensorType> gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<TensorType> gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + for (int i = start; i < limit; ++i) { + auto x = gpu_t_left.template chip<0>(i); + auto y = gpu_t_right.template chip<0>(i); + auto z = gpu_t_result.template chip<0>(i); + z.device(sycl_device) = x.contract(y, contract_pairs); + } + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + for (int i = start; i < limit; ++i) { + auto x = t_left.template chip<0>(i); + auto y = t_right.template chip<0>(i); + auto z = 
t_result.template chip<0>(i); + z = x.contract(y, contract_pairs); + } + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array<IndexType, 2> left_dims = {{m_size, k_size}}; + Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}}; + Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; + Eigen::array<DimPair, 1> dims = {{DimPair(1, 1)}}; + + Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType j = 0; j < m_size; j++) { + for (IndexType i = 0; i < n_size; i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType m: " << j << " n: " << i + << " CPU : " << t_result(j, i) + << " vs SYCL:" << t_result_gpu(j, i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i)); + } + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void 
contraction_lhs_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}}; + Eigen::array<IndexType, 2> right_dims = {{k_size, n_size}}; + Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; + Eigen::array<DimPair, 1> dims = {{DimPair(0, 0)}}; + + Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <int DataLayout, typename DataType, typename IndexType, + typename Device> +void contraction_both_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor<DataType, 1, DataLayout, IndexType>::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array<IndexType, 2> left_dims = {{k_size, m_size}}; + Eigen::array<IndexType, 2> right_dims = {{n_size, k_size}}; + Eigen::array<IndexType, 2> res_dims = {{m_size, n_size}}; + Eigen::array<DimPair, 1> dims = {{DimPair(0, 1)}}; + + Tensor<DataType, 2, DataLayout, IndexType> t_left(left_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_right(right_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result_gpu(res_dims); + Tensor<DataType, 2, DataLayout, IndexType> t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * 
sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast<DataType *>(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast<DataType *>(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast<DataType *>(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType>> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); + } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template <typename Dev> +void inline tensorOutofBound(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Test out of bound for Tensor-Tensor + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024, + 1024); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024, + 4096); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 4096, 1024, + 2048); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048, + 1024); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 2048, 1024, + 784); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 10, 1024, + 10); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 513, 4096, + 513); + test_no_out_of_bounds<RowMajor, DataType, IndexType>(sycl_device, 783, 1024, + 783); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 784, 2048, + 784); + test_no_out_of_bounds<ColMajor, DataType, IndexType>(sycl_device, 11, 1024, + 11); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor out of bound tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction<ColMajor, 
DataType, IndexType>(sycl_device, 128, 128, + 128); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 128, 128, + 128); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor_m(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_m<ColMajor, DataType, IndexType>(sycl_device); + test_sycl_contraction_m<RowMajor, DataType, IndexType>(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor_n(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_n<ColMajor, DataType, IndexType>(sycl_device); + test_sycl_contraction_n<RowMajor, DataType, IndexType>(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor_k(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + test_sycl_contraction_k<ColMajor, DataType, IndexType>(sycl_device); + test_sycl_contraction_k<RowMajor, DataType, IndexType>(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorTensor_sizes(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_sizes<ColMajor, DataType, IndexType>(sycl_device); + test_sycl_contraction_sizes<RowMajor, DataType, IndexType>(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} +template <typename Dev> +void inline vectorVector(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + 
std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // VECTOR-VECTOR + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1, + 1025); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1025, 1, + 1025); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1024, 1, + 1024); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1, + 1024); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1, + 1023); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1, + 1023); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "contracted tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline vectorTensor(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Vector-Tensor + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1025, + 1025); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1025, + 1025); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1024, + 1024); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1024, + 1024); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 1023, + 1023); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 1023, + 1023); + + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4097, + 4097); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4097, + 4097); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4096, + 4096); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4096, + 4096); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 4095, + 4095); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1, 4095, + 4095); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1, 802816, + 32); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorVector(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Matrix-Vector + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1025, 1025, + 1); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1125, 1025, + 1); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1224, 1024, + 1); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1024, 1024, + 1); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 1023, 1023, + 1); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 1023, 1023, + 1); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4097, 4197, + 1); + 
test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4097, 4097, + 1); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4096, 4096, + 1); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4096, 8196, + 1); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 4095, 4095, + 1); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 4095, 4095, + 1); +// If GEMV is disabled, a single kernel computes the whole contraction, so +// the accumulated floating-point rounding error exceeds the precision +// threshold for float and the test fails. With GEMV enabled, several kernels +// are created and the accumulation (together with its rounding error) is +// split up among them. +#ifndef EIGEN_SYCL_DISABLE_GEMV + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 32, 802032, + 1); +#endif + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensorScalar(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // SCALAR Contraction + test_scalar<ColMajor, DataType, IndexType>(sycl_device, 127, 127, 127); + test_scalar<RowMajor, DataType, IndexType>(sycl_device, 127, 127, 127); + test_scalar<ColMajor, DataType, IndexType>(sycl_device, 128, 128, 128); + test_scalar<RowMajor, DataType, IndexType>(sycl_device, 128, 128, 128); + test_scalar<ColMajor, DataType, IndexType>(sycl_device, 129, 129, 129); + test_scalar<RowMajor, DataType, IndexType>(sycl_device, 129, 129, 129); + + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline skinnyTensor_row(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 4, 16); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 257, 131073, + 257); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 256, 131072, + 256); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 16, 131073, + 16); + test_sycl_contraction<RowMajor, DataType, IndexType>(sycl_device, 17, 131072, + 17); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline skinnyTensor_col(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction<ColMajor, DataType,
IndexType>(sycl_device, 16, 4, 16); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 257, 131073, + 257); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 256, 131072, + 256); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 16, 131073, + 16); + test_sycl_contraction<ColMajor, DataType, IndexType>(sycl_device, 17, 131072, + 17); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensor_contraction_batch_per_device(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_batch<RowMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4, + 0, 4); + contraction_batch<ColMajor, DataType, IndexType>(sycl_device, 64, 75, 30, 4, + 0, 4); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensor_contraction_lhs_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 8, 4, + 8); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8, + 32); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16, + 64); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 784, + 2048, 1024); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024, + 10, 1024); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096, + 1024, 1024); + contraction_lhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048, + 4096, 1024); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensor_contraction_rhs_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 16, 4, + 16); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5, + 17); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8, + 32); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, 16, + 64); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 10, + 1024, 1024); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 1024, + 1024, 4096); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 4096, + 1024, 
2048); + contraction_rhs_transposed<RowMajor, DataType, IndexType>(sycl_device, 2048, + 1024, 784); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template <typename Dev> +void inline tensor_contraction_both_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point<std::chrono::system_clock> start, end; + start = std::chrono::system_clock::now(); + + contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 17, 5, + 17); + contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 32, 8, + 32); + contraction_both_transposed<RowMajor, DataType, IndexType>(sycl_device, 64, + 16, 64); + end = std::chrono::system_clock::now(); + std::chrono::duration<double> elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) { + for (const auto &device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info<cl::sycl::info::device::name>() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1(tensorOutofBound(sycl_device)); + CALL_SUBTEST_2(tensorTensor(sycl_device)); + CALL_SUBTEST_2(tensorTensor_m(sycl_device)); + CALL_SUBTEST_2(tensorTensor_n(sycl_device)); + CALL_SUBTEST_2(tensorTensor_k(sycl_device)); + CALL_SUBTEST_2(tensorTensor_sizes(sycl_device)); + CALL_SUBTEST_3(vectorVector(sycl_device)); + CALL_SUBTEST_4(vectorTensor(sycl_device)); + CALL_SUBTEST_5(tensorVector(sycl_device)); + CALL_SUBTEST_6(tensorScalar(sycl_device)); + CALL_SUBTEST_7(skinnyTensor_row(sycl_device)); + CALL_SUBTEST_7(skinnyTensor_col(sycl_device)); + CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device)); + CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device)); + CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device)); + CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device)); + } +} diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp index ace97057f..3b5c6a13c 100644 --- a/unsupported/test/cxx11_tensor_contraction.cpp +++ b/unsupported/test/cxx11_tensor_contraction.cpp @@ -471,7 +471,8 @@ static void test_tensor_product() mat1.setRandom(); mat2.setRandom(); - Tensor<float, 4, DataLayout> result = mat1.contract(mat2, Eigen::array<DimPair, 0>{{}}); + Eigen::array<DimPair, 0> dims; + Tensor<float, 4, DataLayout> result = mat1.contract(mat2, dims); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 3); @@ -510,36 +511,91 @@ static void test_const_inputs() VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1)); } -void test_cxx11_tensor_contraction() +// Apply Sqrt to all output elements. 
+struct SqrtOutputKernel { + template <typename Index, typename Scalar> + EIGEN_ALWAYS_INLINE void operator()( + const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper, + const TensorContractionParams&, Index, Index, Index num_rows, + Index num_cols) const { + for (int i = 0; i < num_rows; ++i) { + for (int j = 0; j < num_cols; ++j) { + output_mapper(i, j) = std::sqrt(output_mapper(i, j)); + } + } + } +}; + +template <int DataLayout> +static void test_large_contraction_with_output_kernel() { + Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31); + Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10); + Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in t_result to verify that the contraction overwrites the output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 1500, 248); + MapXf m_right(t_right.data(), 248, 1400); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + + // compute results by separate methods + t_result = t_left.contract(t_right, dims, SqrtOutputKernel()); + + m_result = m_left * m_right; + + for (std::ptrdiff_t i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); + } +} + +EIGEN_DECLARE_TEST(cxx11_tensor_contraction) { - CALL_SUBTEST(test_evals<ColMajor>()); - CALL_SUBTEST(test_evals<RowMajor>()); - CALL_SUBTEST(test_scalar<ColMajor>()); - CALL_SUBTEST(test_scalar<RowMajor>()); - CALL_SUBTEST(test_multidims<ColMajor>()); - CALL_SUBTEST(test_multidims<RowMajor>()); - CALL_SUBTEST(test_holes<ColMajor>()); - CALL_SUBTEST(test_holes<RowMajor>()); - CALL_SUBTEST(test_full_redux<ColMajor>()); - CALL_SUBTEST(test_full_redux<RowMajor>()); - CALL_SUBTEST(test_contraction_of_contraction<ColMajor>()); - CALL_SUBTEST(test_contraction_of_contraction<RowMajor>()); - CALL_SUBTEST(test_expr<ColMajor>()); - CALL_SUBTEST(test_expr<RowMajor>()); - CALL_SUBTEST(test_out_of_order_contraction<ColMajor>()); - CALL_SUBTEST(test_out_of_order_contraction<RowMajor>()); - CALL_SUBTEST(test_consistency<ColMajor>()); - CALL_SUBTEST(test_consistency<RowMajor>()); - CALL_SUBTEST(test_large_contraction<ColMajor>()); - CALL_SUBTEST(test_large_contraction<RowMajor>()); - CALL_SUBTEST(test_matrix_vector<ColMajor>()); - CALL_SUBTEST(test_matrix_vector<RowMajor>()); - CALL_SUBTEST(test_tensor_vector<ColMajor>()); - CALL_SUBTEST(test_tensor_vector<RowMajor>()); - CALL_SUBTEST(test_small_blocking_factors<ColMajor>()); - CALL_SUBTEST(test_small_blocking_factors<RowMajor>()); - CALL_SUBTEST(test_tensor_product<ColMajor>()); - CALL_SUBTEST(test_tensor_product<RowMajor>()); - CALL_SUBTEST(test_const_inputs<ColMajor>()); - CALL_SUBTEST(test_const_inputs<RowMajor>()); + CALL_SUBTEST_1(test_evals<ColMajor>()); + CALL_SUBTEST_1(test_evals<RowMajor>()); + CALL_SUBTEST_1(test_scalar<ColMajor>()); + CALL_SUBTEST_1(test_scalar<RowMajor>()); + CALL_SUBTEST_2(test_multidims<ColMajor>()); + CALL_SUBTEST_2(test_multidims<RowMajor>()); + CALL_SUBTEST_2(test_holes<ColMajor>()); + CALL_SUBTEST_2(test_holes<RowMajor>()); +
CALL_SUBTEST_3(test_full_redux<ColMajor>()); + CALL_SUBTEST_3(test_full_redux<RowMajor>()); + CALL_SUBTEST_3(test_contraction_of_contraction<ColMajor>()); + CALL_SUBTEST_3(test_contraction_of_contraction<RowMajor>()); + CALL_SUBTEST_4(test_expr<ColMajor>()); + CALL_SUBTEST_4(test_expr<RowMajor>()); + CALL_SUBTEST_4(test_out_of_order_contraction<ColMajor>()); + CALL_SUBTEST_4(test_out_of_order_contraction<RowMajor>()); + CALL_SUBTEST_5(test_consistency<ColMajor>()); + CALL_SUBTEST_5(test_consistency<RowMajor>()); + CALL_SUBTEST_5(test_large_contraction<ColMajor>()); + CALL_SUBTEST_5(test_large_contraction<RowMajor>()); + CALL_SUBTEST_6(test_matrix_vector<ColMajor>()); + CALL_SUBTEST_6(test_matrix_vector<RowMajor>()); + CALL_SUBTEST_6(test_tensor_vector<ColMajor>()); + CALL_SUBTEST_6(test_tensor_vector<RowMajor>()); + CALL_SUBTEST_7(test_small_blocking_factors<ColMajor>()); + CALL_SUBTEST_7(test_small_blocking_factors<RowMajor>()); + CALL_SUBTEST_7(test_tensor_product<ColMajor>()); + CALL_SUBTEST_7(test_tensor_product<RowMajor>()); + CALL_SUBTEST_8(test_const_inputs<ColMajor>()); + CALL_SUBTEST_8(test_const_inputs<RowMajor>()); + CALL_SUBTEST_8(test_large_contraction_with_output_kernel<ColMajor>()); + CALL_SUBTEST_8(test_large_contraction_with_output_kernel<RowMajor>()); + + // Force CMake to split this test. + // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8 + } diff --git a/unsupported/test/cxx11_tensor_convolution.cpp b/unsupported/test/cxx11_tensor_convolution.cpp index e3d4675eb..c3688f678 100644 --- a/unsupported/test/cxx11_tensor_convolution.cpp +++ b/unsupported/test/cxx11_tensor_convolution.cpp @@ -25,7 +25,8 @@ static void test_evals() Tensor<float, 2, DataLayout> result(2,3); result.setZero(); - Eigen::array<Tensor<float, 2>::Index, 1> dims3{{0}}; + Eigen::array<Tensor<float, 2>::Index, 1> dims3; + dims3[0] = 0; typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator; Evaluator eval(input.convolve(kernel, dims3), DefaultDevice()); @@ -136,7 +137,7 @@ static void test_strides() { input(12)*kernel(2))); } -void test_cxx11_tensor_convolution() +EIGEN_DECLARE_TEST(cxx11_tensor_convolution) { CALL_SUBTEST(test_evals<ColMajor>()); CALL_SUBTEST(test_evals<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_convolution_sycl.cpp b/unsupported/test/cxx11_tensor_convolution_sycl.cpp new file mode 100644 index 000000000..3954c8a28 --- /dev/null +++ b/unsupported/test/cxx11_tensor_convolution_sycl.cpp @@ -0,0 +1,469 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
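Editorial note: the convolution tests below all verify the same identity: convolving a kernel of extent K along a dimension of extent N yields extent N - K + 1 (numpy's "valid" mode), with out(i) equal to the sum over k of in(i + k) * kernel(k). A host-only sketch of the 1-D case that the SYCL variants check against (illustrative only, not part of the patch): Eigen::Tensor<float, 1> in(5), ker(2); in.setRandom(); ker.setRandom(); // Convolve along dimension 0; the output extent is 5 - 2 + 1 = 4. Eigen::array<Eigen::Tensor<float, 1>::Index, 1> conv_dim; conv_dim[0] = 0; Eigen::Tensor<float, 1> out = in.convolve(ker, conv_dim); for (int i = 0; i < 4; ++i) VERIFY_IS_APPROX(out(i), in(i) * ker(0) + in(i + 1) * ker(1));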
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include <iostream> +#include <chrono> +#include <ctime> + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> +#include <iomanip> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; +static const float error_threshold =1e-4f; + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_larg_expr1D(const Eigen::SyclDevice& sycl_device) +{ + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=55; + IndexType outdim2=51; + Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; + Eigen::array<IndexType, 1> kernel_dims = {{4}}; + Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; + + Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims); + Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); + Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); + + Eigen::array<IndexType, 1> dims3{{0}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; + assert(false); + } + } + } +} + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); + +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_larg_expr2D(const Eigen::SyclDevice& sycl_device) +{ + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=51; + IndexType outdim2=51; + Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; + Eigen::array<IndexType, 2> kernel_dims = {{4,5}}; + Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; + + Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 2, DataLayout,IndexType> kernel(kernel_dims); + 
Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); + Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); + + Eigen::array<IndexType, 2> dims3{{0,1}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; + assert(false); + } + } + } +} + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); + +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_larg_expr3D(const Eigen::SyclDevice& sycl_device) +{ + IndexType indim0 =53; + IndexType indim1= 55; + IndexType indim2= 51; + IndexType outdim0=50; + IndexType outdim1=51; + IndexType outdim2=49; + Eigen::array<IndexType, 3> input_dims = {{indim0, indim1, indim2}}; + Eigen::array<IndexType, 3> kernel_dims = {{4,5,3}}; + Eigen::array<IndexType, 3> result_dims = {{outdim0, outdim1, outdim2}}; + + Tensor<DataType, 3, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 3, DataLayout,IndexType> kernel(kernel_dims); + Tensor<DataType, 3, DataLayout,IndexType> result(result_dims); + Tensor<DataType, 3, DataLayout,IndexType> result_host(result_dims); + + Eigen::array<IndexType, 3> dims3{{0,1,2}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + result_host.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > gpu_result(d_result, result_dims); + 
sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + result_host=input.convolve(kernel, dims3); + +for(IndexType i=0; i< outdim0; i++ ){ + for(IndexType j=0; j< outdim1; j++ ){ + for(IndexType k=0; k< outdim2; k++ ){ + if (!(Eigen::internal::isApprox(result(i,j,k), result_host(i,j,k), error_threshold))) { + std::cout <<std::setprecision(16)<< "mismatch detected at index ( "<< i << " , " << j << ", " << k << " ) " << " \t " << result(i,j,k) << " vs "<< result_host(i,j,k) << std::endl; + assert(false); + } + } + } +} + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); + +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_evals(const Eigen::SyclDevice& sycl_device) +{ + Eigen::array<IndexType, 2> input_dims = {{3, 3}}; + Eigen::array<IndexType, 1> kernel_dims = {{2}}; + Eigen::array<IndexType, 2> result_dims = {{2, 3}}; + + Tensor<DataType, 2, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 1, DataLayout,IndexType> kernel(kernel_dims); + Tensor<DataType, 2, DataLayout,IndexType> result(result_dims); + + Eigen::array<IndexType, 1> dims3{{0}}; + + input.setRandom(); + kernel.setRandom(); + result.setZero(); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout, IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims3); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1)); // index 0 + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1)); // index 2 + VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1)); // index 4 + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1)); // index 1 + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1)); // index 3 + VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1)); // index 5 + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_expr(const Eigen::SyclDevice& sycl_device) +{ + Eigen::array<IndexType, 2> input_dims = {{3, 3}}; + Eigen::array<IndexType, 2> kernel_dims = {{2, 2}}; + Eigen::array<IndexType, 2> result_dims = {{2, 2}}; + + Tensor<DataType, 2, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 2, DataLayout, 
IndexType> kernel(kernel_dims); + Tensor<DataType, 2, DataLayout, IndexType> result(result_dims); + + input.setRandom(); + kernel.setRandom(); + Eigen::array<IndexType, 2> dims; + dims[0] = 0; + dims[1] = 1; + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout,IndexType> > gpu_result(d_result, result_dims); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) + + input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) + + input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) + + input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1)); + VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) + + input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1)); + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_result); +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_modes(const Eigen::SyclDevice& sycl_device){ + +Eigen::array<IndexType, 1> input_dims = {{3}}; +Eigen::array<IndexType, 1> kernel_dims = {{3}}; + +Tensor<DataType, 1, DataLayout, IndexType> input(input_dims); +Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims); + +input.setRandom(); +kernel.setRandom(); +Eigen::array<IndexType, 1> dims; +dims[0] = 0; + + input(0) = 1.0f; + input(1) = 2.0f; + input(2) = 3.0f; + kernel(0) = 0.5f; + kernel(1) = 1.0f; + kernel(2) = 0.0f; + + Eigen::array<std::pair<IndexType, IndexType>, 1> padding; + + // Emulate VALID mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
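// VALID keeps only the fully-overlapping positions: with input length 3 and
// kernel length 3 there is 3 - 3 + 1 = 1 output, here
// 1*0.5 + 2*1.0 + 3*0.0 = 2.5, as verified below.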
+ padding[0] = std::make_pair(0, 0); + Tensor<DataType, 1, DataLayout, IndexType> valid(1); + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t valid_bytes = valid.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_valid = static_cast<DataType*>(sycl_device.allocate(valid_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_valid(d_valid, valid.dimensions()); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_valid.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(valid.data(), d_valid, valid_bytes); + + VERIFY_IS_EQUAL(valid.dimension(0), 1); + VERIFY_IS_APPROX(valid(0), 2.5f); + + // Emulate SAME mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). + padding[0] = std::make_pair(1, 1); + Tensor<DataType, 1, DataLayout, IndexType> same(3); + std::size_t same_bytes = same.size() * sizeof(DataType); + DataType * d_same = static_cast<DataType*>(sycl_device.allocate(same_bytes)); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_same(d_same, same.dimensions()); + gpu_same.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(same.data(), d_same, same_bytes); + + VERIFY_IS_EQUAL(same.dimension(0), 3); + VERIFY_IS_APPROX(same(0), 1.0f); + VERIFY_IS_APPROX(same(1), 2.5f); + VERIFY_IS_APPROX(same(2), 4.0f); + + // Emulate FULL mode (as defined in + // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html). 
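// FULL pads by kernel_size - 1 = 2 on each side, giving 3 + 3 - 1 = 5
// outputs: sliding {0.5, 1.0, 0.0} over the padded {0, 0, 1, 2, 3, 0, 0}
// yields {0.0, 1.0, 2.5, 4.0, 1.5}, matching the checks below.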
+ padding[0] = std::make_pair(2, 2); + + Tensor<DataType, 1, DataLayout, IndexType> full(5); + std::size_t full_bytes = full.size() * sizeof(DataType); + DataType * d_full = static_cast<DataType*>(sycl_device.allocate(full_bytes)); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_full(d_full, full.dimensions()); + gpu_full.device(sycl_device)=gpu_input.pad(padding).convolve(gpu_kernel, dims); + sycl_device.memcpyDeviceToHost(full.data(), d_full, full_bytes); + + VERIFY_IS_EQUAL(full.dimension(0), 5); + VERIFY_IS_APPROX(full(0), 0.0f); + VERIFY_IS_APPROX(full(1), 1.0f); + VERIFY_IS_APPROX(full(2), 2.5f); + VERIFY_IS_APPROX(full(3), 4.0f); + VERIFY_IS_APPROX(full(4), 1.5f); + + sycl_device.deallocate(d_input); + sycl_device.deallocate(d_kernel); + sycl_device.deallocate(d_valid); + sycl_device.deallocate(d_same); + sycl_device.deallocate(d_full); + +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_strides(const Eigen::SyclDevice& sycl_device){ + + Eigen::array<IndexType, 1> input_dims = {{13}}; + Eigen::array<IndexType, 1> kernel_dims = {{3}}; + + Tensor<DataType, 1, DataLayout, IndexType> input(input_dims); + Tensor<DataType, 1, DataLayout, IndexType> kernel(kernel_dims); + Tensor<DataType, 1, DataLayout, IndexType> result(2); + + input.setRandom(); + kernel.setRandom(); + Eigen::array<IndexType, 1> dims; + dims[0] = 0; + + Eigen::array<IndexType, 1> stride_of_3; + stride_of_3[0] = 3; + Eigen::array<IndexType, 1> stride_of_2; + stride_of_2[0] = 2; + + std::size_t input_bytes = input.size() * sizeof(DataType); + std::size_t kernel_bytes = kernel.size() * sizeof(DataType); + std::size_t result_bytes = result.size() * sizeof(DataType); + + DataType * d_input = static_cast<DataType*>(sycl_device.allocate(input_bytes)); + DataType * d_kernel = static_cast<DataType*>(sycl_device.allocate(kernel_bytes)); + DataType * d_result = static_cast<DataType*>(sycl_device.allocate(result_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_input(d_input, input_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_kernel(d_kernel, kernel_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 1, DataLayout,IndexType> > gpu_result(d_result, result.dimensions()); + sycl_device.memcpyHostToDevice(d_input, input.data(), input_bytes); + sycl_device.memcpyHostToDevice(d_kernel, kernel.data(), kernel_bytes); + + gpu_result.device(sycl_device)=gpu_input.stride(stride_of_3).convolve(gpu_kernel, dims).stride(stride_of_2); + sycl_device.memcpyDeviceToHost(result.data(), d_result, result_bytes); + + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) + + input(6)*kernel(2))); + VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) + + input(12)*kernel(2))); +} + +template <typename Dev_selector> void tensorConvolutionPerDevice(Dev_selector& s){ + QueueInterface queueInterface(s); + auto sycl_device=Eigen::SyclDevice(&queueInterface); + test_larg_expr1D<float, RowMajor, int64_t>(sycl_device); + test_larg_expr1D<float, ColMajor, int64_t>(sycl_device); + test_larg_expr2D<float, RowMajor, int64_t>(sycl_device); + test_larg_expr2D<float, ColMajor, int64_t>(sycl_device); + test_larg_expr3D<float, RowMajor, int64_t>(sycl_device); + test_larg_expr3D<float, ColMajor, int64_t>(sycl_device); + test_evals<float, ColMajor, int64_t>(sycl_device); + test_evals<float, RowMajor, int64_t>(sycl_device); + test_expr<float, ColMajor, 
int64_t>(sycl_device); + test_expr<float, RowMajor, int64_t>(sycl_device); + test_modes<float, ColMajor, int64_t>(sycl_device); + test_modes<float, RowMajor, int64_t>(sycl_device); + test_strides<float, ColMajor, int64_t>(sycl_device); + test_strides<float, RowMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_convolution_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorConvolutionPerDevice(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_custom_index.cpp b/unsupported/test/cxx11_tensor_custom_index.cpp index 4528cc176..b5dbc97bd 100644 --- a/unsupported/test/cxx11_tensor_custom_index.cpp +++ b/unsupported/test/cxx11_tensor_custom_index.cpp @@ -88,7 +88,7 @@ static void test_sizes_as_index() } -void test_cxx11_tensor_custom_index() { +EIGEN_DECLARE_TEST(cxx11_tensor_custom_index) { test_map_as_index<ColMajor>(); test_map_as_index<RowMajor>(); test_matrix_as_index<ColMajor>(); diff --git a/unsupported/test/cxx11_tensor_custom_op.cpp b/unsupported/test/cxx11_tensor_custom_op.cpp index 8baa477cc..875ea57d2 100644 --- a/unsupported/test/cxx11_tensor_custom_op.cpp +++ b/unsupported/test/cxx11_tensor_custom_op.cpp @@ -104,7 +104,7 @@ static void test_custom_binary_op() } -void test_cxx11_tensor_custom_op() +EIGEN_DECLARE_TEST(cxx11_tensor_custom_op) { CALL_SUBTEST(test_custom_unary_op()); CALL_SUBTEST(test_custom_binary_op()); diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp new file mode 100644 index 000000000..d947ead83 --- /dev/null +++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp @@ -0,0 +1,170 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
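The SYCL test files in this patch (cxx11_tensor_convolution_sycl.cpp above, cxx11_tensor_custom_op_sycl.cpp below) all share the same scaffolding: allocate device buffers, copy inputs host-to-device, evaluate a tensor expression through .device(sycl_device), copy the result back, verify, and deallocate. A minimal standalone sketch of that pattern, using only device calls that appear in the patch (the function name and sizes are illustrative, not part of the patch):

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

// Square a 1-D tensor on a SYCL device, mirroring the structure of the tests.
static void square_on_device(const Eigen::SyclDevice& sycl_device) {
  Eigen::Tensor<float, 1> host_in(100), host_out(100);
  host_in.setRandom();
  const std::size_t bytes = host_in.size() * sizeof(float);
  float* d_in  = static_cast<float*>(sycl_device.allocate(bytes));
  float* d_out = static_cast<float*>(sycl_device.allocate(bytes));
  Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_in(d_in, 100);
  Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_out(d_out, 100);
  sycl_device.memcpyHostToDevice(d_in, host_in.data(), bytes);
  gpu_out.device(sycl_device) = gpu_in * gpu_in;  // evaluated on the device
  sycl_device.memcpyDeviceToHost(host_out.data(), d_out, bytes);
  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_out);
}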
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +template<typename TensorType> +struct InsertZeros { + DSizes<DenseIndex, 2> dimensions(const TensorType& input) const { + DSizes<DenseIndex, 2> result; + result[0] = input.dimension(0) * 2; + result[1] = input.dimension(1) * 2; + return result; + } + + template <typename Output, typename Device> + void eval(const TensorType& input, Output& output, const Device& device) const + { + array<DenseIndex, 2> strides; + strides[0] = 2; + strides[1] = 2; + output.stride(strides).device(device) = input; + + Eigen::DSizes<DenseIndex, 2> offsets(1,1); + Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1); + output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f); + } +}; + +template<typename DataType, int DataLayout, typename IndexType> +static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 5; + Eigen::array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}}; + Eigen::array<IndexType, 2> tensorResultRange = {{6, 10}}; + + Eigen::Tensor<DataType, 2, DataLayout, IndexType> in1(tensorRange); + Eigen::Tensor<DataType, 2, DataLayout, IndexType> out(tensorResultRange); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + typedef Eigen::TensorMap<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > TensorType; + TensorType gpu_in1(gpu_in1_data, tensorRange); + TensorType gpu_out(gpu_out_data, tensorResultRange); + + in1.setRandom(); + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + gpu_out.device(sycl_device) = gpu_in1.customOp(InsertZeros<TensorType>()); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + VERIFY_IS_EQUAL(out.dimension(0), 6); + VERIFY_IS_EQUAL(out.dimension(1), 10); + + for (int i = 0; i < 6; i+=2) { + for (int j = 0; j < 10; j+=2) { + VERIFY_IS_EQUAL(out(i, j), in1(i/2, j/2)); + } + } + for (int i = 1; i < 6; i+=2) { + for (int j = 1; j < 10; j+=2) { + VERIFY_IS_EQUAL(out(i, j), 0); + } + } + sycl_device.deallocate(gpu_in1_data); +sycl_device.deallocate(gpu_out_data); +} + +template<typename TensorType> +struct BatchMatMul { + DSizes<DenseIndex, 3> dimensions(const TensorType& input1, const TensorType& input2) const { + DSizes<DenseIndex, 3> result; + result[0] = input1.dimension(0); + result[1] = input2.dimension(1); + result[2] = input2.dimension(2); + return result; + } + + template <typename Output, typename Device> + void eval(const TensorType& input1, const TensorType& input2, + Output& output, const Device& device) const + { + typedef typename TensorType::DimensionPair DimPair; + array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); + for (int64_t i = 0; i < output.dimension(2); ++i) { + output.template chip<2>(i).device(device) = input1.template chip<2>(i).contract(input2.template chip<2>(i), dims); + } + } +}; + +template<typename DataType, int DataLayout, typename IndexType> +static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + + Eigen::array<IndexType, 3> tensorRange1 = {{2, 3, 5}}; + 
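// BatchMatMul contracts dim 1 of the first operand with dim 0 of the second,
// one chip at a time: for each of the 5 slices, (2x3) * (3x7) = (2x7), so
// the expected output shape below is {2, 7, 5}.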
Eigen::array<IndexType, 3> tensorRange2 = {{3,7,5}}; + Eigen::array<IndexType, 3> tensorResultRange = {{2, 7, 5}}; + + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange1); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange2); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorResultRange); + + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); + + typedef Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType> > TensorType; + TensorType gpu_in1(gpu_in1_data, tensorRange1); + TensorType gpu_in2(gpu_in2_data, tensorRange2); + TensorType gpu_out(gpu_out_data, tensorResultRange); + + in1.setRandom(); + in2.setRandom(); + + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType)); + + gpu_out.device(sycl_device) = gpu_in1.customOp(gpu_in2, BatchMatMul<TensorType>()); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + + for (IndexType i = 0; i < 5; ++i) { + typedef typename Eigen::Tensor<DataType, 3, DataLayout, IndexType>::DimensionPair DimPair; + array<DimPair, 1> dims; + dims[0] = DimPair(1, 0); + Eigen::Tensor<DataType, 2, DataLayout, IndexType> reference = in1.template chip<2>(i).contract(in2.template chip<2>(i), dims); + TensorRef<Eigen::Tensor<DataType, 2, DataLayout, IndexType> > val = out.template chip<2>(i); + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_APPROX(val(j, k), reference(j, k)); + } + } + } + sycl_device.deallocate(gpu_in1_data); + sycl_device.deallocate(gpu_in2_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, typename Dev_selector> void custom_op_perDevice(Dev_selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_custom_unary_op_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_custom_unary_op_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_custom_binary_op_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_custom_binary_op_sycl<DataType, RowMajor, int64_t>(sycl_device); + +} +EIGEN_DECLARE_TEST(cxx11_tensor_custom_op_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(custom_op_perDevice<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index fde20ddf2..c9f78d2d3 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -9,16 +9,15 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_device + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> + using Eigen::Tensor; using Eigen::RowMajor; @@ -68,22 +67,22 @@ struct CPUContext { // Context for evaluation on GPU struct GPUContext { GPUContext(const 
Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { - assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess); + assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess); float kernel_1d_val[] = {3.14f, 2.7f}; - assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); - assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess); + assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess); float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; - assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); - assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess); + assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess); float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; - assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); } ~GPUContext() { - assert(cudaFree(kernel_1d_) == cudaSuccess); - assert(cudaFree(kernel_2d_) == cudaSuccess); - assert(cudaFree(kernel_3d_) == cudaSuccess); + assert(gpuFree(kernel_1d_) == gpuSuccess); + assert(gpuFree(kernel_2d_) == gpuSuccess); + assert(gpuFree(kernel_3d_) == gpuSuccess); } const Eigen::GpuDevice& device() const { return gpu_device_; } @@ -104,7 +103,7 @@ struct GPUContext { float* kernel_2d_; float* kernel_3d_; - Eigen::CudaStreamDevice stream_; + Eigen::GpuStreamDevice stream_; Eigen::GpuDevice gpu_device_; }; @@ -283,12 +282,12 @@ void test_gpu() { float* d_in1; float* d_in2; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_in2), in2_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70); @@ -296,7 +295,7 @@ void test_gpu() { GPUContext context(gpu_in1, gpu_in2, gpu_out); test_contextual_eval(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { @@ -306,7 +305,7 @@ void test_gpu() { } test_forced_contextual_eval(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { @@ -316,7 +315,7 @@ void test_gpu() { } test_compound_assignment(&context); - 
assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { @@ -326,7 +325,7 @@ void test_gpu() { } test_contraction(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 40; ++j) { const float result = out(i,j,0); @@ -341,8 +340,8 @@ void test_gpu() { } test_1d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); + assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 70; ++k) { @@ -352,8 +351,8 @@ void test_gpu() { } test_2d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); + assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { @@ -365,9 +364,13 @@ void test_gpu() { } } +#if !defined(EIGEN_USE_HIP) +// disable this test on the HIP platform +// 3D tensor convolutions seem to hang on the HIP platform + test_3d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); + assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 39; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { @@ -380,10 +383,13 @@ void test_gpu() { } } } + +#endif + } -void test_cxx11_tensor_device() +EIGEN_DECLARE_TEST(cxx11_tensor_device) { CALL_SUBTEST_1(test_cpu()); CALL_SUBTEST_2(test_gpu()); diff --git a/unsupported/test/cxx11_tensor_device_sycl.cpp b/unsupported/test/cxx11_tensor_device_sycl.cpp index 7f79753c5..5095cb078 100644 --- a/unsupported/test/cxx11_tensor_device_sycl.cpp +++ b/unsupported/test/cxx11_tensor_device_sycl.cpp @@ -13,19 +13,65 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <stdint.h> +#include <iostream> + +template <typename DataType, int DataLayout, typename IndexType> +void test_device_memory(const Eigen::SyclDevice &sycl_device) { + std::cout << "Running on : " + << sycl_device.sycl_queue().get_device(). 
template get_info<cl::sycl::info::device::name>()
+            <<std::endl;
+  IndexType sizeDim1 = 100;
+  array<IndexType, 1> tensorRange = {{sizeDim1}};
+  Tensor<DataType, 1, DataLayout,IndexType> in(tensorRange);
+  Tensor<DataType, 1, DataLayout,IndexType> in1(tensorRange);
+  memset(in1.data(), 1, in1.size() * sizeof(DataType));
+  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType)));
+  sycl_device.memset(gpu_in_data, 1, in.size()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(in.data(), gpu_in_data, in.size()*sizeof(DataType));
+  for (IndexType i=0; i<in.size(); i++) {
+    VERIFY_IS_EQUAL(in(i), in1(i));
+  }
+  sycl_device.deallocate(gpu_in_data);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_device_exceptions(const Eigen::SyclDevice &sycl_device) {
+  VERIFY(sycl_device.ok());
+  IndexType sizeDim1 = 100;
+  array<IndexType, 1> tensorDims = {{sizeDim1}};
+  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(sizeDim1*sizeof(DataType)));
+  sycl_device.memset(gpu_data, 1, sizeDim1*sizeof(DataType));
-void test_device_sycl(const Eigen::SyclDevice &sycl_device) {
-  std::cout <<"Helo from ComputeCpp: the requested device exists and the device name is : "
-  << sycl_device.m_queue.get_device(). template get_info<cl::sycl::info::device::name>() <<std::endl;;
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> in(gpu_data, tensorDims);
+  TensorMap<Tensor<DataType, 1, DataLayout,IndexType>> out(gpu_data, tensorDims);
+  out.device(sycl_device) = in / in.constant(0);
+
+  sycl_device.synchronize();
+  VERIFY(!sycl_device.ok());
+  sycl_device.deallocate(gpu_data);
+}
+
+template<typename DataType> void sycl_device_test_per_device(const cl::sycl::device& d){
+  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+  QueueInterface queueInterface(d);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_device_memory<DataType, RowMajor, int64_t>(sycl_device);
+  test_device_memory<DataType, ColMajor, int64_t>(sycl_device);
+  /// this test throws an exception. Enable it if you want to see the exception
+  //test_device_exceptions<DataType, RowMajor>(sycl_device);
+  /// this test throws an exception. Enable it if you want to see the exception
+  //test_device_exceptions<DataType, ColMajor>(sycl_device);
}
-void test_cxx11_tensor_device_sycl() {
-  cl::sycl::gpu_selector s;
-  Eigen::SyclDevice sycl_device(s);
-  CALL_SUBTEST(test_device_sycl(sycl_device));
+
+EIGEN_DECLARE_TEST(cxx11_tensor_device_sycl) {
+  for (const auto& device :Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_device_test_per_device<float>(device));
+  }
}
diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp
index 16f168ed4..ee416e14a 100644
--- a/unsupported/test/cxx11_tensor_dimension.cpp
+++ b/unsupported/test/cxx11_tensor_dimension.cpp
@@ -60,10 +60,29 @@ static void test_rank_zero()
 VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
}
-void test_cxx11_tensor_dimension()
+static void test_index_type_promotion()
{
+  Eigen::DSizes<int, 3> src0(1, 2, 3);
+  Eigen::array<int, 3> src1;
+  src1[0] = 4;
+  src1[1] = 5;
+  src1[2] = 6;
+
+  Eigen::DSizes<long, 3> dst0(src0);
+  Eigen::DSizes<long, 3> dst1(src1);
+
+  VERIFY_IS_EQUAL(dst0[0], 1L);
+  VERIFY_IS_EQUAL(dst0[1], 2L);
+  VERIFY_IS_EQUAL(dst0[2], 3L);
+  VERIFY_IS_EQUAL(dst1[0], 4L);
+  VERIFY_IS_EQUAL(dst1[1], 5L);
+  VERIFY_IS_EQUAL(dst1[2], 6L);
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_dimension)
{
 CALL_SUBTEST(test_dynamic_size());
 CALL_SUBTEST(test_fixed_size());
 CALL_SUBTEST(test_match());
 CALL_SUBTEST(test_rank_zero());
+  CALL_SUBTEST(test_index_type_promotion());
}
diff --git a/unsupported/test/cxx11_tensor_empty.cpp b/unsupported/test/cxx11_tensor_empty.cpp
index d7eea42d7..fd889c46c 100644
--- a/unsupported/test/cxx11_tensor_empty.cpp
+++ b/unsupported/test/cxx11_tensor_empty.cpp
@@ -33,7 +33,7 @@ static void test_empty_fixed_size_tensor()
 }
-void test_cxx11_tensor_empty()
+EIGEN_DECLARE_TEST(cxx11_tensor_empty)
{
 CALL_SUBTEST(test_empty_tensor());
 CALL_SUBTEST(test_empty_fixed_size_tensor());
diff --git a/unsupported/test/cxx11_tensor_executor.cpp b/unsupported/test/cxx11_tensor_executor.cpp
new file mode 100644
index 000000000..66b06e8ee
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_executor.cpp
@@ -0,0 +1,731 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Eugene Zhulenev <ezhulenev@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+using Eigen::ColMajor;
+using Eigen::internal::TiledEvaluation;
+
+// A set of tests to verify that different TensorExecutor strategies yield the
+// same results for all the ops that support tiled evaluation.
+
+// Default assignment that does not use block evaluation or vectorization.
+// We assume that default coefficient evaluation is well tested and correct.
+template <typename Dst, typename Expr>
+static void DefaultAssign(Dst& dst, Expr expr) {
+  using Assign = Eigen::TensorAssignOp<Dst, const Expr>;
+  using Executor =
+      Eigen::internal::TensorExecutor<const Assign, DefaultDevice,
+                                      /*Vectorizable=*/false,
+                                      /*Tiling=*/TiledEvaluation::Off>;
+
+  Executor::run(Assign(dst, expr), DefaultDevice());
+}
+
+// Assignment with specified device and tiling strategy.
+template <bool Vectorizable, TiledEvaluation Tiling, typename Device, + typename Dst, typename Expr> +static void DeviceAssign(Device& d, Dst& dst, Expr expr) { + using Assign = Eigen::TensorAssignOp<Dst, const Expr>; + using Executor = Eigen::internal::TensorExecutor<const Assign, Device, + Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); +} + +template <int NumDims> +static array<Index, NumDims> RandomDims(int min_dim = 1, int max_dim = 20) { + array<Index, NumDims> dims; + for (int i = 0; i < NumDims; ++i) { + dims[i] = internal::random<int>(min_dim, max_dim); + } + return dims; +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_unary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. + auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); + + Tensor<T, NumDims, Options, Index> src(dims); + Tensor<T, NumDims, Options, Index> dst(dims); + + src.setRandom(); + const auto expr = src.square(); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T square = src.coeff(i) * src.coeff(i); + VERIFY_IS_EQUAL(square, dst.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_binary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. + auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); + + Tensor<T, NumDims, Options, Index> lhs(dims); + Tensor<T, NumDims, Options, Index> rhs(dims); + Tensor<T, NumDims, Options, Index> dst(dims); + + lhs.setRandom(); + rhs.setRandom(); + + const auto expr = lhs + rhs; + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T sum = lhs.coeff(i) + rhs.coeff(i); + VERIFY_IS_EQUAL(sum, dst.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_broadcasting(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(1, 10); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + const auto broadcasts = RandomDims<NumDims>(1, 7); + const auto expr = src.broadcast(broadcasts); + + // We assume that broadcasting on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor<T, NumDims, Options, Index> golden; + golden = expr; + + // Now do the broadcasting using configured tensor executor. 
+ Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_chipping_rvalue(Device d) +{ + auto dims = RandomDims<NumDims>(1, 10); + Tensor<T, NumDims, Layout, Index> src(dims); + src.setRandom(); + +#define TEST_CHIPPING(CHIP_DIM) \ + if (NumDims > (CHIP_DIM)) { \ + const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \ + const auto expr = src.template chip<(CHIP_DIM)>(offset); \ + \ + Tensor<T, NumDims - 1, Layout, Index> golden; \ + golden = expr; \ + \ + Tensor<T, NumDims - 1, Layout, Index> dst(golden.dimensions()); \ + \ + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; \ + using Executor = internal::TensorExecutor<const Assign, Device, \ + Vectorizable, Tiling>; \ + \ + Executor::run(Assign(dst, expr), d); \ + \ + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ + } \ + } + + TEST_CHIPPING(0) + TEST_CHIPPING(1) + TEST_CHIPPING(2) + TEST_CHIPPING(3) + TEST_CHIPPING(4) + TEST_CHIPPING(5) + +#undef TEST_CHIPPING +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_chipping_lvalue(Device d) +{ + auto dims = RandomDims<NumDims>(1, 10); + +#define TEST_CHIPPING(CHIP_DIM) \ + if (NumDims > (CHIP_DIM)) { \ + /* Generate random data that we'll assign to the chipped tensor dim. */ \ + array<Index, NumDims - 1> src_dims; \ + for (int i = 0; i < NumDims - 1; ++i) { \ + int dim = i < (CHIP_DIM) ? i : i + 1; \ + src_dims[i] = dims[dim]; \ + } \ + \ + Tensor<T, NumDims - 1, Layout, Index> src(src_dims); \ + src.setRandom(); \ + \ + const auto offset = internal::random<Index>(0, dims[(CHIP_DIM)] - 1); \ + \ + Tensor<T, NumDims, Layout, Index> random(dims); \ + random.setZero(); \ + \ + Tensor<T, NumDims, Layout, Index> golden(dims); \ + golden = random; \ + golden.template chip<(CHIP_DIM)>(offset) = src; \ + \ + Tensor<T, NumDims, Layout, Index> dst(dims); \ + dst = random; \ + auto expr = dst.template chip<(CHIP_DIM)>(offset); \ + \ + using Assign = TensorAssignOp<decltype(expr), const decltype(src)>; \ + using Executor = internal::TensorExecutor<const Assign, Device, \ + Vectorizable, Tiling>; \ + \ + Executor::run(Assign(expr, src), d); \ + \ + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { \ + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); \ + } \ + } + + TEST_CHIPPING(0) + TEST_CHIPPING(1) + TEST_CHIPPING(2) + TEST_CHIPPING(3) + TEST_CHIPPING(4) + TEST_CHIPPING(5) + +#undef TEST_CHIPPING +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_shuffle_rvalue(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(1, 10); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + DSizes<Index, NumDims> shuffle; + for (int i = 0; i < NumDims; ++i) shuffle[i] = i; + + // Test all possible shuffle permutations. 
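// std::next_permutation in the loop below enumerates all NumDims! orders of
// the identity permutation initialized above: 6 for 3 dims, 24 for 4, 120 for 5.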
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) {
+      shuffled_dims[i] = dims[shuffle[i]];
+    }
+
+    const auto expr = src.shuffle(shuffle);
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    DefaultAssign(golden, expr);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    DeviceAssign<Vectorizable, Tiling>(d, dst, expr);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_shuffle_lvalue(Device d)
+{
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  DSizes<Index, NumDims> shuffle;
+  for (int i = 0; i < NumDims; ++i) shuffle[i] = i;
+
+  // Test all possible shuffle permutations.
+  do {
+    DSizes<Index, NumDims> shuffled_dims;
+    for (int i = 0; i < NumDims; ++i) shuffled_dims[shuffle[i]] = dims[i];
+
+    // We assume that shuffling on a default device is tested and correct, so
+    // we can rely on it to verify correctness of tensor executor and tiling.
+    Tensor<T, NumDims, Options, Index> golden(shuffled_dims);
+    auto golden_shuffle = golden.shuffle(shuffle);
+    DefaultAssign(golden_shuffle, src);
+
+    // Now do the shuffling using configured tensor executor.
+    Tensor<T, NumDims, Options, Index> dst(shuffled_dims);
+    auto dst_shuffle = dst.shuffle(shuffle);
+    DeviceAssign<Vectorizable, Tiling>(d, dst_shuffle, src);
+
+    for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) {
+      VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i));
+    }
+
+  } while (std::next_permutation(&shuffle[0], &shuffle[0] + NumDims));
+}
+
+template <typename T, int NumDims, typename Device, bool Vectorizable,
+          TiledEvaluation Tiling, int Layout>
+static void test_execute_reshape(Device d)
+{
+  static_assert(NumDims >= 2, "NumDims must be greater or equal than 2");
+
+  static constexpr int ReshapedDims = NumDims - 1;
+  static constexpr int Options = 0 | Layout;
+
+  auto dims = RandomDims<NumDims>(5, 10);
+  Tensor<T, NumDims, Options, Index> src(dims);
+  src.setRandom();
+
+  // Multiply the 0th and 1st dimensions together and then shuffle.
+  std::vector<Index> shuffle;
+  for (int i = 0; i < ReshapedDims; ++i) shuffle.push_back(i);
+  std::shuffle(shuffle.begin(), shuffle.end(), std::mt19937());
+
+  DSizes<Index, ReshapedDims> reshaped_dims;
+  reshaped_dims[shuffle[0]] = dims[0] * dims[1];
+  for (int i = 1; i < ReshapedDims; ++i) reshaped_dims[shuffle[i]] = dims[i + 1];
+
+  Tensor<T, ReshapedDims, Options, Index> golden = src.reshape(reshaped_dims);
+
+  // Now reshape using configured tensor executor.
+ Tensor<T, ReshapedDims, Options, Index> dst(golden.dimensions()); + + auto expr = src.reshape(reshaped_dims); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_slice_rvalue(Device d) +{ + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(5, 10); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + // Pick a random slice of src tensor. + auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>()); + auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>()); + + // Make sure that slice start + size do not overflow tensor dims. + for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + Tensor<T, NumDims, Options, Index> golden = + src.slice(slice_start, slice_size); + + // Now reshape using configured tensor executor. + Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); + + auto expr = src.slice(slice_start, slice_size); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_slice_lvalue(Device d) +{ + static_assert(NumDims >= 2, "NumDims must be greater or equal than 2"); + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(5, 10); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + // Pick a random slice of src tensor. + auto slice_start = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10)); + auto slice_size = DSizes<Index, NumDims>(RandomDims<NumDims>(1, 10)); + + // Make sure that slice start + size do not overflow tensor dims. + for (int i = 0; i < NumDims; ++i) { + slice_start[i] = numext::mini(dims[i] - 1, slice_start[i]); + slice_size[i] = numext::mini(slice_size[i], dims[i] - slice_start[i]); + } + + Tensor<T, NumDims, Options, Index> slice(slice_size); + slice.setRandom(); + + // Assign a slice using default executor. + Tensor<T, NumDims, Options, Index> golden = src; + golden.slice(slice_start, slice_size) = slice; + + // And using configured execution strategy. 
+ Tensor<T, NumDims, Options, Index> dst = src; + auto expr = dst.slice(slice_start, slice_size); + + using Assign = TensorAssignOp<decltype(expr), const decltype(slice)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(expr, slice), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_broadcasting_of_forced_eval(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(1, 10); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + const auto broadcasts = RandomDims<NumDims>(1, 7); + const auto expr = src.square().eval().broadcast(broadcasts); + + // We assume that broadcasting on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor<T, NumDims, Options, Index> golden; + golden = expr; + + // Now do the broadcasting using configured tensor executor. + Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template<typename T, int NumDims> +struct DummyGenerator { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + T operator()(const array <Index, NumDims>& dims) const { + T result = static_cast<T>(0); + for (int i = 0; i < NumDims; ++i) { + result += static_cast<T>((i + 1) * dims[i]); + } + return result; + } +}; + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_generator_op(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(20, 30); + Tensor<T, NumDims, Options, Index> src(dims); + src.setRandom(); + + const auto expr = src.generate(DummyGenerator<T, NumDims>()); + + // We assume that generator on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor<T, NumDims, Options, Index> golden; + golden = expr; + + // Now do the broadcasting using configured tensor executor. + Tensor<T, NumDims, Options, Index> dst(golden.dimensions()); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_execute_reverse_rvalue(Device d) +{ + static constexpr int Options = 0 | Layout; + + auto dims = RandomDims<NumDims>(1, numext::pow(1000000.0, 1.0 / NumDims)); + Tensor <T, NumDims, Options, Index> src(dims); + src.setRandom(); + + // Reverse half of the dimensions. 
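// Each dimension is flipped independently with probability 1/2, so half of
// them are reversed in expectation.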
+ Eigen::array<bool, NumDims> reverse; + for (int i = 0; i < NumDims; ++i) reverse[i] = internal::random<bool>(); + + const auto expr = src.reverse(reverse); + + // We assume that reversing on a default device is tested and correct, so + // we can rely on it to verify correctness of tensor executor and tiling. + Tensor <T, NumDims, Options, Index> golden; + golden = expr; + + // Now do the reversing using configured tensor executor. + Tensor <T, NumDims, Options, Index> dst(golden.dimensions()); + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using Executor = + internal::TensorExecutor<const Assign, Device, Vectorizable, Tiling>; + + Executor::run(Assign(dst, expr), d); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + VERIFY_IS_EQUAL(dst.coeff(i), golden.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_async_execute_unary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. + auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); + + Tensor<T, NumDims, Options, Index> src(dims); + Tensor<T, NumDims, Options, Index> dst(dims); + + src.setRandom(); + const auto expr = src.square(); + + Eigen::Barrier done(1); + auto on_done = [&done]() { done.Notify(); }; + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using DoneCallback = decltype(on_done); + using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback, + Vectorizable, Tiling>; + + Executor::runAsync(Assign(dst, expr), d, on_done); + done.Wait(); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T square = src.coeff(i) * src.coeff(i); + VERIFY_IS_EQUAL(square, dst.coeff(i)); + } +} + +template <typename T, int NumDims, typename Device, bool Vectorizable, + TiledEvaluation Tiling, int Layout> +static void test_async_execute_binary_expr(Device d) +{ + static constexpr int Options = 0 | Layout; + + // Pick a large enough tensor size to bypass small tensor block evaluation + // optimization. 
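// With NumDims = 3 each dim is drawn from [16, 33], i.e. roughly 4K to 36K
// coefficients in total, well above the small-block cutoff.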
+ auto dims = RandomDims<NumDims>(50 / NumDims, 100 / NumDims); + + Tensor<T, NumDims, Options, Index> lhs(dims); + Tensor<T, NumDims, Options, Index> rhs(dims); + Tensor<T, NumDims, Options, Index> dst(dims); + + lhs.setRandom(); + rhs.setRandom(); + + const auto expr = lhs + rhs; + + Eigen::Barrier done(1); + auto on_done = [&done]() { done.Notify(); }; + + using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>; + using DoneCallback = decltype(on_done); + using Executor = internal::TensorAsyncExecutor<const Assign, Device, DoneCallback, + Vectorizable, Tiling>; + + Executor::runAsync(Assign(dst, expr), d, on_done); + done.Wait(); + + for (Index i = 0; i < dst.dimensions().TotalSize(); ++i) { + T sum = lhs.coeff(i) + rhs.coeff(i); + VERIFY_IS_EQUAL(sum, dst.coeff(i)); + } +} + +#ifdef EIGEN_DONT_VECTORIZE +#define VECTORIZABLE(VAL) !EIGEN_DONT_VECTORIZE && VAL +#else +#define VECTORIZABLE(VAL) VAL +#endif + +#define CALL_SUBTEST_PART(PART) \ + CALL_SUBTEST_##PART + +#define CALL_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, ColMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, ColMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::Off, RowMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, false, TiledEvaluation::On, RowMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, DefaultDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(default_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device))) + +// NOTE: Currently only ThreadPoolDevice supports async expression evaluation. 
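// The async macro below therefore instantiates only the eight ThreadPoolDevice
// combinations ({scalar, vectorized} x {tiled, untiled} x {col, row} major),
// while CALL_SUBTEST_COMBINATIONS above also covers the eight DefaultDevice ones.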
+#define CALL_ASYNC_SUBTEST_COMBINATIONS(PART, NAME, T, NUM_DIMS) \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, ColMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::Off, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, false, TiledEvaluation::On, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::Off, RowMajor>(tp_device))); \ + CALL_SUBTEST_PART(PART)((NAME<T, NUM_DIMS, ThreadPoolDevice, VECTORIZABLE(true), TiledEvaluation::On, RowMajor>(tp_device))) + +EIGEN_DECLARE_TEST(cxx11_tensor_executor) { + Eigen::DefaultDevice default_device; + // Default device is unused in ASYNC tests. + EIGEN_UNUSED_VARIABLE(default_device); + + const auto num_threads = internal::random<int>(20, 24); + Eigen::ThreadPool tp(num_threads); + Eigen::ThreadPoolDevice tp_device(&tp, num_threads); + + CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(1, test_execute_unary_expr, float, 5); + + CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 3); + CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 4); + CALL_SUBTEST_COMBINATIONS(2, test_execute_binary_expr, float, 5); + + CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 3); + CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 4); + CALL_SUBTEST_COMBINATIONS(3, test_execute_broadcasting, float, 5); + + CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(4, test_execute_chipping_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(5, test_execute_chipping_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(6, test_execute_shuffle_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(7, test_execute_shuffle_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 2); + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 3); + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 4); + CALL_SUBTEST_COMBINATIONS(9, test_execute_reshape, float, 5); + + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(10, test_execute_slice_rvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 3); + 
CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(11, test_execute_slice_lvalue, float, 5); + + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 2); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 3); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 4); + CALL_SUBTEST_COMBINATIONS(12, test_execute_broadcasting_of_forced_eval, float, 5); + + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 2); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 3); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 4); + CALL_SUBTEST_COMBINATIONS(13, test_execute_generator_op, float, 5); + + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 1); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 2); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 3); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 4); + CALL_SUBTEST_COMBINATIONS(14, test_execute_reverse_rvalue, float, 5); + + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 3); + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 4); + CALL_ASYNC_SUBTEST_COMBINATIONS(15, test_async_execute_unary_expr, float, 5); + + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 3); + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 4); + CALL_ASYNC_SUBTEST_COMBINATIONS(16, test_async_execute_binary_expr, float, 5); + + // Force CMake to split this test. + // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11;12;13;14;15;16 +} diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 77e24cb67..169fc1898 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -7,6 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#include <numeric> + #include "main.h" #include <Eigen/CXX11/Tensor> @@ -193,26 +195,23 @@ static void test_constants() static void test_boolean() { - Tensor<int, 1> vec(6); - std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data()); + const int kSize = 31; + Tensor<int, 1> vec(kSize); + std::iota(vec.data(), vec.data() + kSize, 0); // Test ||. Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4); - VERIFY_IS_EQUAL(bool1[0], true); - VERIFY_IS_EQUAL(bool1[1], false); - VERIFY_IS_EQUAL(bool1[2], false); - VERIFY_IS_EQUAL(bool1[3], false); - VERIFY_IS_EQUAL(bool1[4], false); - VERIFY_IS_EQUAL(bool1[5], true); + for (int i = 0; i < kSize; ++i) { + bool expected = i < 1 || i > 4; + VERIFY_IS_EQUAL(bool1[i], expected); + } // Test &&, including cast of operand vec. 
Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4); - VERIFY_IS_EQUAL(bool2[0], false); - VERIFY_IS_EQUAL(bool2[1], true); - VERIFY_IS_EQUAL(bool2[2], true); - VERIFY_IS_EQUAL(bool2[3], true); - VERIFY_IS_EQUAL(bool2[4], false); - VERIFY_IS_EQUAL(bool2[5], false); + for (int i = 0; i < kSize; ++i) { + bool expected = bool(i) && i < 4; + VERIFY_IS_EQUAL(bool2[i], expected); + } // Compilation tests: // Test Tensor<bool> against results of cast or comparison; verifies that @@ -300,8 +299,152 @@ static void test_select() } } +template <typename Scalar> +void test_minmax_nan_propagation_templ() { + for (int size = 1; size < 17; ++size) { + const Scalar kNaN = std::numeric_limits<Scalar>::quiet_NaN(); + const Scalar kInf = std::numeric_limits<Scalar>::infinity(); + const Scalar kZero(0); + Tensor<Scalar, 1> vec_all_nan(size); + Tensor<Scalar, 1> vec_one_nan(size); + Tensor<Scalar, 1> vec_zero(size); + vec_all_nan.setConstant(kNaN); + vec_zero.setZero(); + vec_one_nan.setZero(); + vec_one_nan(size/2) = kNaN; + + auto verify_all_nan = [&](const Tensor<Scalar, 1>& v) { + for (int i = 0; i < size; ++i) { + VERIFY((numext::isnan)(v(i))); + } + }; + + auto verify_all_zero = [&](const Tensor<Scalar, 1>& v) { + for (int i = 0; i < size; ++i) { + VERIFY_IS_EQUAL(v(i), Scalar(0)); + } + }; + + // Test NaN propagating max. + // max(nan, nan) = nan + // max(nan, 0) = nan + // max(0, nan) = nan + // max(0, 0) = 0 + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(kNaN)); + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(vec_all_nan)); + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(kZero)); + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNaN>(vec_zero)); + verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(kNaN)); + verify_all_nan(vec_zero.template cwiseMax<PropagateNaN>(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(kZero)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNaN>(vec_zero)); + + // Test number propagating max. + // max(nan, nan) = nan + // max(nan, 0) = 0 + // max(0, nan) = 0 + // max(0, 0) = 0 + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNumbers>(kNaN)); + verify_all_nan(vec_all_nan.template cwiseMax<PropagateNumbers>(vec_all_nan)); + verify_all_zero(vec_all_nan.template cwiseMax<PropagateNumbers>(kZero)); + verify_all_zero(vec_all_nan.template cwiseMax<PropagateNumbers>(vec_zero)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kNaN)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(kZero)); + verify_all_zero(vec_zero.template cwiseMax<PropagateNumbers>(vec_zero)); + + // Test NaN propagating min. + // min(nan, nan) = nan + // min(nan, 0) = nan + // min(0, nan) = nan + // min(0, 0) = 0 + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(kNaN)); + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(vec_all_nan)); + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(kZero)); + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNaN>(vec_zero)); + verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(kNaN)); + verify_all_nan(vec_zero.template cwiseMin<PropagateNaN>(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(kZero)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNaN>(vec_zero)); + + // Test number propagating min. 
+ // min(nan, nan) = nan + // min(nan, 0) = 0 + // min(0, nan) = 0 + // min(0, 0) = 0 + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNumbers>(kNaN)); + verify_all_nan(vec_all_nan.template cwiseMin<PropagateNumbers>(vec_all_nan)); + verify_all_zero(vec_all_nan.template cwiseMin<PropagateNumbers>(kZero)); + verify_all_zero(vec_all_nan.template cwiseMin<PropagateNumbers>(vec_zero)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kNaN)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(kZero)); + verify_all_zero(vec_zero.template cwiseMin<PropagateNumbers>(vec_zero)); + + // Test min and max reduction + Tensor<Scalar, 0> val; + val = vec_zero.minimum(); + VERIFY_IS_EQUAL(val(), kZero); + val = vec_zero.template minimum<PropagateNaN>(); + VERIFY_IS_EQUAL(val(), kZero); + val = vec_zero.template minimum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), kZero); + val = vec_zero.maximum(); + VERIFY_IS_EQUAL(val(), kZero); + val = vec_zero.template maximum<PropagateNaN>(); + VERIFY_IS_EQUAL(val(), kZero); + val = vec_zero.template maximum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), kZero); + + // Test NaN propagation for tensor of all NaNs. + val = vec_all_nan.template minimum<PropagateNaN>(); + VERIFY((numext::isnan)(val())); + val = vec_all_nan.template minimum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), kInf); + val = vec_all_nan.template maximum<PropagateNaN>(); + VERIFY((numext::isnan)(val())); + val = vec_all_nan.template maximum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), -kInf); + + // Test NaN propagation for tensor with a single NaN. + val = vec_one_nan.template minimum<PropagateNaN>(); + VERIFY((numext::isnan)(val())); + val = vec_one_nan.template minimum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), (size == 1 ? kInf : kZero)); + val = vec_one_nan.template maximum<PropagateNaN>(); + VERIFY((numext::isnan)(val())); + val = vec_one_nan.template maximum<PropagateNumbers>(); + VERIFY_IS_EQUAL(val(), (size == 1 ? 
-kInf : kZero));
+  }
+}
+
+static void test_clip()
+{
+  Tensor<float, 1> vec(6);
+  vec(0) = 4.0;
+  vec(1) = 8.0;
+  vec(2) = 15.0;
+  vec(3) = 16.0;
+  vec(4) = 23.0;
+  vec(5) = 42.0;
+
+  float kMin = 20;
+  float kMax = 30;
+
+  Tensor<float, 1> vec_clipped(6);
+  vec_clipped = vec.clip(kMin, kMax);
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(vec_clipped(i), numext::mini(numext::maxi(vec(i), kMin), kMax));
+  }
+}
+
+static void test_minmax_nan_propagation()
+{
+  test_minmax_nan_propagation_templ<float>();
+  test_minmax_nan_propagation_templ<double>();
+}

-void test_cxx11_tensor_expr()
+EIGEN_DECLARE_TEST(cxx11_tensor_expr)
 {
   CALL_SUBTEST(test_1d());
   CALL_SUBTEST(test_2d());
@@ -311,4 +454,11 @@ void test_cxx11_tensor_expr()
   CALL_SUBTEST(test_functors());
   CALL_SUBTEST(test_type_casting());
   CALL_SUBTEST(test_select());
+  CALL_SUBTEST(test_clip());
+
+// NaN propagation does not currently work as one would expect from
+// std::max/std::min, so we disable it for now.
+#if !EIGEN_ARCH_ARM_OR_ARM64
+  CALL_SUBTEST(test_minmax_nan_propagation());
+#endif
 }
diff --git a/unsupported/test/cxx11_tensor_fft.cpp b/unsupported/test/cxx11_tensor_fft.cpp
index 2f14ebc62..2e1008eca 100644
--- a/unsupported/test/cxx11_tensor_fft.cpp
+++ b/unsupported/test/cxx11_tensor_fft.cpp
@@ -224,7 +224,35 @@ static void test_fft_real_input_energy() {
   }
 }

-void test_cxx11_tensor_fft() {
+template <typename RealScalar>
+static void test_fft_non_power_of_2_round_trip(int exponent) {
+  int n = (1 << exponent) + 1;
+
+  Eigen::DSizes<ptrdiff_t, 1> dimensions;
+  dimensions[0] = n;
+  const DSizes<ptrdiff_t, 1> arr = dimensions;
+  Tensor<RealScalar, 1, ColMajor, ptrdiff_t> input;
+
+  input.resize(arr);
+  input.setRandom();
+
+  array<int, 1> fft;
+  fft[0] = 0;
+
+  Tensor<std::complex<RealScalar>, 1, ColMajor> forward =
+      input.template fft<BothParts, FFT_FORWARD>(fft);
+
+  Tensor<RealScalar, 1, ColMajor, ptrdiff_t> output =
+      forward.template fft<RealPart, FFT_REVERSE>(fft);
+
+  for (int i = 0; i < n; ++i) {
+    RealScalar tol = test_precision<RealScalar>() *
+                     (std::abs(input[i]) + std::abs(output[i]) + 1);
+    VERIFY_IS_APPROX_OR_LESS_THAN(std::abs(input[i] - output[i]), tol);
+  }
+}
+
+EIGEN_DECLARE_TEST(cxx11_tensor_fft) {
   test_fft_complex_input_golden();
   test_fft_real_input_golden();
@@ -270,4 +298,7 @@ void test_cxx11_tensor_fft() {
   test_fft_real_input_energy<RowMajor, double, true, Eigen::BothParts, FFT_FORWARD, 4>();
   test_fft_real_input_energy<RowMajor, float, false, Eigen::BothParts, FFT_FORWARD, 4>();
   test_fft_real_input_energy<RowMajor, double, false, Eigen::BothParts, FFT_FORWARD, 4>();
+
+  test_fft_non_power_of_2_round_trip<float>(7);
+  test_fft_non_power_of_2_round_trip<double>(7);
 }
diff --git a/unsupported/test/cxx11_tensor_fixed_size.cpp b/unsupported/test/cxx11_tensor_fixed_size.cpp
index 4c660de65..456ce6bea 100644
--- a/unsupported/test/cxx11_tensor_fixed_size.cpp
+++ b/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -21,7 +21,7 @@ static void test_0d()
   TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
   VERIFY_IS_EQUAL(scalar1.rank(), 0);
   VERIFY_IS_EQUAL(scalar1.size(), 1);
-  VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
+  VERIFY_IS_EQUAL(internal::array_prod(scalar1.dimensions()), 1);

   scalar1() = 7.0;
   scalar2() = 13.0;
@@ -250,7 +250,7 @@ static void test_array()
   }
 }

-void test_cxx11_tensor_fixed_size()
+EIGEN_DECLARE_TEST(cxx11_tensor_fixed_size)
 {
   CALL_SUBTEST(test_0d());
   CALL_SUBTEST(test_1d());
diff --git a/unsupported/test/cxx11_tensor_forced_eval.cpp b/unsupported/test/cxx11_tensor_forced_eval.cpp
index 45d7345e9..a21a02bec 100644 --- a/unsupported/test/cxx11_tensor_forced_eval.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval.cpp @@ -61,7 +61,7 @@ static void test_const() Eigen::array<int, 2> bcast; bcast[0] = 3; bcast[1] = 1; - const TensorMap<Tensor<const float, 2> > input_tensor(input.data(), 3, 3); + const TensorMap<const Tensor<float, 2> > input_tensor(input.data(), 3, 3); Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)); for (int i = 0; i < 3; ++i) { @@ -72,7 +72,7 @@ static void test_const() } -void test_cxx11_tensor_forced_eval() +EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval) { CALL_SUBTEST(test_simple()); CALL_SUBTEST(test_const()); diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index 5690da723..a55a5ad8a 100644 --- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -13,44 +13,44 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> using Eigen::Tensor; - +template <typename DataType, int DataLayout, typename IndexType> void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 200; - int sizeDim3 = 200; - Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Eigen::Tensor<float, 3> in1(tensorRange); - Eigen::Tensor<float, 3> in2(tensorRange); - Eigen::Tensor<float, 3> out(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 20; + IndexType sizeDim3 = 20; + Eigen::array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange); + Eigen::Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange); - float * gpu_in1_data = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float))); - float * gpu_in2_data = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float))); - float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float))); + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); - in1 = in1.random() + in1.constant(10.0f); - in2 = in2.random() + in2.constant(10.0f); + in1 = in1.random() + in1.constant(static_cast<DataType>(10.0f)); + in2 = in2.random() + in2.constant(static_cast<DataType>(10.0f)); // creating TensorMap from tensor - Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange); - Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange); - Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange); - sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(float)); - sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in1.dimensions().TotalSize())*sizeof(float)); + Eigen::TensorMap<Eigen::Tensor<DataType, 
3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); + Eigen::TensorMap<Eigen::Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); + sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(DataType)); /// c=(a+b)*b gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(DataType)); + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) + in2(i, j, k)) * in2(i, j, k)); } @@ -63,8 +63,15 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { } -void test_cxx11_tensor_forced_eval_sycl() { - cl::sycl::gpu_selector s; - Eigen::SyclDevice sycl_device(s); - CALL_SUBTEST(test_forced_eval_sycl(sycl_device)); +template <typename DataType, typename Dev_selector> void tensorForced_evalperDevice(Dev_selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_forced_eval_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_forced_eval_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorForced_evalperDevice<float>(device)); + CALL_SUBTEST(tensorForced_evalperDevice<half>(device)); + } } diff --git a/unsupported/test/cxx11_tensor_generator.cpp b/unsupported/test/cxx11_tensor_generator.cpp index dcb928714..6dcf676bb 100644 --- a/unsupported/test/cxx11_tensor_generator.cpp +++ b/unsupported/test/cxx11_tensor_generator.cpp @@ -42,11 +42,11 @@ struct Generator2D { template <int DataLayout> static void test_2D() { - Tensor<float, 2> matrix(5, 7); + Tensor<float, 2> matrix(512, 512); Tensor<float, 2> result = matrix.generate(Generator2D()); - for (int i = 0; i < 5; ++i) { - for (int j = 0; j < 5; ++j) { + for (int i = 0; i < 512; ++i) { + for (int j = 0; j < 512; ++j) { VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j); } } @@ -80,7 +80,7 @@ static void test_gaussian() } -void test_cxx11_tensor_generator() +EIGEN_DECLARE_TEST(cxx11_tensor_generator) { CALL_SUBTEST(test_1D<ColMajor>()); CALL_SUBTEST(test_1D<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_generator_sycl.cpp b/unsupported/test/cxx11_tensor_generator_sycl.cpp new file mode 100644 index 000000000..fb6e3d9d0 --- /dev/null +++ b/unsupported/test/cxx11_tensor_generator_sycl.cpp @@ -0,0 +1,147 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
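+//
+// The tests in this file exercise Tensor::generate(), which fills a tensor
+// by calling a user-supplied functor with the coordinates of every element.
+// A minimal host-side sketch of the pattern (Iota2D is an illustrative name,
+// not a type defined here):
+//
+//   struct Iota2D {
+//     float operator()(const Eigen::array<Eigen::DenseIndex, 2>& c) const {
+//       return static_cast<float>(10 * c[0] + c[1]);
+//     }
+//   };
+//   Eigen::Tensor<float, 2> t(3, 4);
+//   Eigen::Tensor<float, 2> r = t.generate(Iota2D());  // r(i, j) == 10*i + j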
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+static const float error_threshold = 1e-8f;
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+struct Generator1D {
+  Generator1D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const {
+    return coordinates[0];
+  }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_1D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 6;
+  array<IndexType, 1> tensorRange = {{sizeDim1}};
+  Tensor<DataType, 1, DataLayout, IndexType> vec(tensorRange);
+  Tensor<DataType, 1, DataLayout, IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize = vec.size()*sizeof(DataType);
+  DataType* gpu_data_vec = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu_vec(gpu_data_vec, tensorRange);
+  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_vec, vec.data(), tensorBuffSize);
+  gpu_result.device(sycl_device) = gpu_vec.generate(Generator1D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    VERIFY_IS_EQUAL(result(i), i);
+  }
+}
+
+struct Generator2D {
+  Generator2D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const {
+    return 3 * coordinates[0] + 11 * coordinates[1];
+  }
+};
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_2D_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim1 = 5;
+  IndexType sizeDim2 = 7;
+  array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}};
+  Tensor<DataType, 2, DataLayout, IndexType> matrix(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize = matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device) = gpu_matrix.generate(Generator2D());
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < sizeDim1; ++i) {
+    for (IndexType j = 0; j < sizeDim2; ++j) {
+      VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+    }
+  }
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_gaussian_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType rows = 32;
+  IndexType cols = 48;
+  array<DataType, 2> means;
+  means[0] = rows / 2.0f;
+  means[1] = cols / 2.0f;
+  array<DataType, 2> std_devs;
+  std_devs[0] = 3.14f;
+  std_devs[1] = 2.7f;
+  internal::GaussianGenerator<DataType, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs);
+
+  array<IndexType, 2> tensorRange = {{rows, cols}};
+  Tensor<DataType, 2, DataLayout, IndexType> matrix(tensorRange);
+  Tensor<DataType, 2, DataLayout, IndexType> result(tensorRange);
+
+  const size_t tensorBuffSize = matrix.size()*sizeof(DataType);
+  DataType* gpu_data_matrix = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_result = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_matrix(gpu_data_matrix, tensorRange);
+  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_result(gpu_data_result, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_matrix, matrix.data(), tensorBuffSize);
+  gpu_result.device(sycl_device) = gpu_matrix.generate(gaussian_gen);
+  sycl_device.memcpyDeviceToHost(result.data(), gpu_data_result, tensorBuffSize);
+
+  for (IndexType i = 0; i < rows; ++i) {
+    for (IndexType j = 0; j < cols; ++j) {
+      DataType g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;
+      DataType g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+      DataType gaussian = expf(-g_rows - g_cols);
+      // Verify the device result against the analytic Gaussian; the check
+      // must go through VERIFY so a mismatch actually fails the test.
+      VERIFY(Eigen::internal::isApprox(result(i, j), gaussian, error_threshold));
+    }
+  }
+}
+
+template<typename DataType, typename dev_Selector> void sycl_generator_test_per_device(dev_Selector s){
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  test_1D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_1D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_2D_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_2D_sycl<DataType, ColMajor, int64_t>(sycl_device);
+  test_gaussian_sycl<DataType, RowMajor, int64_t>(sycl_device);
+  test_gaussian_sycl<DataType, ColMajor, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_generator_sycl)
+{
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_generator_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_gpu.cu
index 0ba9d52e9..137d0d596 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_gpu.cu
@@ -9,18 +9,19 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+
 #define EIGEN_USE_GPU
-#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
-#include <cuda_fp16.h>
-#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
+#define EIGEN_GPU_TEST_C99_MATH EIGEN_HAS_CXX11
+
 using Eigen::Tensor;

-void test_cuda_nullary() {
+void test_gpu_nullary() {
   Tensor<float, 1, 0, int> in1(2);
   Tensor<float, 1, 0, int> in2(2);
   in1.setRandom();
@@ -30,12 +31,12 @@ void test_cuda_nullary() {
   float* d_in1;
   float* d_in2;
-  cudaMalloc((void**)(&d_in1), tensor_bytes);
-  cudaMalloc((void**)(&d_in2), tensor_bytes);
-  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
-  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+  gpuMalloc((void**)(&d_in1), tensor_bytes);
+  gpuMalloc((void**)(&d_in2), tensor_bytes);
+  gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice);
+  gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice);

-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);

   Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
@@ -49,23 +50,23 @@ void test_cuda_nullary() {
   Tensor<float, 1, 0, int> new1(2);
   Tensor<float, 1, 0, int> new2(2);

-  assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
-                         gpu_device.stream()) == cudaSuccess);
-  assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes,
cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 2; ++i) { VERIFY_IS_APPROX(new1(i), 3.14f); VERIFY_IS_NOT_EQUAL(new2(i), in2(i)); } - cudaFree(d_in1); - cudaFree(d_in2); + gpuFree(d_in1); + gpuFree(d_in2); } -void test_cuda_elementwise_small() { +void test_gpu_elementwise_small() { Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2)); Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2)); Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2)); @@ -79,14 +80,14 @@ void test_cuda_elementwise_small() { float* d_in1; float* d_in2; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_in2), in2_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( @@ -98,9 +99,9 @@ void test_cuda_elementwise_small() { gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 2; ++i) { VERIFY_IS_APPROX( @@ -108,12 +109,12 @@ void test_cuda_elementwise_small() { in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i))); } - cudaFree(d_in1); - cudaFree(d_in2); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_in2); + gpuFree(d_out); } -void test_cuda_elementwise() +void test_gpu_elementwise() { Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); @@ -132,16 +133,16 @@ void test_cuda_elementwise() float* d_in2; float* d_in3; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_in3), in3_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_in2), in2_bytes); + gpuMalloc((void**)(&d_in3), in3_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice 
stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); @@ -151,8 +152,8 @@ void test_cuda_elementwise() gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 72; ++i) { for (int j = 0; j < 53; ++j) { @@ -162,13 +163,13 @@ void test_cuda_elementwise() } } - cudaFree(d_in1); - cudaFree(d_in2); - cudaFree(d_in3); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_in2); + gpuFree(d_in3); + gpuFree(d_out); } -void test_cuda_props() { +void test_gpu_props() { Tensor<float, 1> in1(200); Tensor<bool, 1> out(200); in1.setRandom(); @@ -178,12 +179,12 @@ void test_cuda_props() { float* d_in1; bool* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( @@ -193,19 +194,19 @@ void test_cuda_props() { gpu_out.device(gpu_device) = (gpu_in1.isnan)(); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 200; ++i) { VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i))); } - cudaFree(d_in1); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_out); } -void test_cuda_reduction() +void test_gpu_reduction() { Tensor<float, 4> in1(72,53,97,113); Tensor<float, 2> out(72,97); @@ -216,12 +217,12 @@ void test_cuda_reduction() float* d_in1; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); @@ -233,8 +234,8 @@ void test_cuda_reduction() gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 72; ++i) { for (int j = 0; j < 97; ++j) { @@ -249,12 +250,12 @@ void test_cuda_reduction() } } - cudaFree(d_in1); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_out); } template<int 
DataLayout> -void test_cuda_contraction() +void test_gpu_contraction() { // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on @@ -274,14 +275,14 @@ void test_cuda_contraction() float* d_t_right; float* d_t_result; - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_left), t_left_bytes); + gpuMalloc((void**)(&d_t_right), t_right_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); @@ -301,7 +302,7 @@ void test_cuda_contraction() m_result = m_left * m_right; gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); - cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); for (DenseIndex i = 0; i < t_result.size(); i++) { if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { @@ -310,13 +311,13 @@ void test_cuda_contraction() } } - cudaFree(d_t_left); - cudaFree(d_t_right); - cudaFree(d_t_result); + gpuFree(d_t_left); + gpuFree(d_t_right); + gpuFree(d_t_result); } template<int DataLayout> -void test_cuda_convolution_1d() +void test_gpu_convolution_1d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 1, DataLayout> kernel(4); @@ -331,14 +332,14 @@ void test_cuda_convolution_1d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); @@ -348,8 +349,8 @@ void test_cuda_convolution_1d() Eigen::array<Eigen::DenseIndex, 1> dims(1); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 74; ++i) { for (int j = 0; j < 34; ++j) { @@ -364,12 +365,12 @@ void test_cuda_convolution_1d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } 
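+// The mechanical cudaFoo -> gpuFoo renames throughout this file go through
+// the portability wrappers in TensorGpuHipCudaDefines.h (included above),
+// which resolve to CUDA or HIP at compile time. A simplified sketch of that
+// mapping (the real header covers many more symbols):
+//
+//   #if defined(EIGEN_USE_HIP)
+//   #define gpuMalloc  hipMalloc
+//   #define gpuMemcpy  hipMemcpy
+//   #define gpuSuccess hipSuccess
+//   #else  // CUDA
+//   #define gpuMalloc  cudaMalloc
+//   #define gpuMemcpy  cudaMemcpy
+//   #define gpuSuccess cudaSuccess
+//   #endif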
-void test_cuda_convolution_inner_dim_col_major_1d() +void test_gpu_convolution_inner_dim_col_major_1d() { Tensor<float, 4, ColMajor> input(74,9,11,7); Tensor<float, 1, ColMajor> kernel(4); @@ -384,14 +385,14 @@ void test_cuda_convolution_inner_dim_col_major_1d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); @@ -401,8 +402,8 @@ void test_cuda_convolution_inner_dim_col_major_1d() Eigen::array<Eigen::DenseIndex, 1> dims(0); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 71; ++i) { for (int j = 0; j < 9; ++j) { @@ -417,12 +418,12 @@ void test_cuda_convolution_inner_dim_col_major_1d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } -void test_cuda_convolution_inner_dim_row_major_1d() +void test_gpu_convolution_inner_dim_row_major_1d() { Tensor<float, 4, RowMajor> input(7,9,11,74); Tensor<float, 1, RowMajor> kernel(4); @@ -437,14 +438,14 @@ void test_cuda_convolution_inner_dim_row_major_1d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); @@ -454,8 +455,8 @@ void test_cuda_convolution_inner_dim_row_major_1d() Eigen::array<Eigen::DenseIndex, 1> dims(3); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 7; ++i) { for (int j = 0; j 
< 9; ++j) { @@ -470,13 +471,13 @@ void test_cuda_convolution_inner_dim_row_major_1d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } template<int DataLayout> -void test_cuda_convolution_2d() +void test_gpu_convolution_2d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 2, DataLayout> kernel(3,4); @@ -491,14 +492,14 @@ void test_cuda_convolution_2d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); @@ -508,8 +509,8 @@ void test_cuda_convolution_2d() Eigen::array<Eigen::DenseIndex, 2> dims(1,2); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 74; ++i) { for (int j = 0; j < 35; ++j) { @@ -534,13 +535,13 @@ void test_cuda_convolution_2d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } template<int DataLayout> -void test_cuda_convolution_3d() +void test_gpu_convolution_3d() { Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17)); Tensor<float, 3, DataLayout> kernel(3,4,2); @@ -555,14 +556,14 @@ void test_cuda_convolution_3d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); @@ -572,8 +573,8 @@ void test_cuda_convolution_3d() Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, 
gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 74; ++i) { for (int j = 0; j < 35; ++j) { @@ -612,14 +613,15 @@ void test_cuda_convolution_3d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } +#if EIGEN_GPU_TEST_C99_MATH template <typename Scalar> -void test_cuda_lgamma(const Scalar stddev) +void test_gpu_lgamma(const Scalar stddev) { Tensor<Scalar, 2> in(72,97); in.setRandom(); @@ -631,12 +633,12 @@ void test_cuda_lgamma(const Scalar stddev) Scalar* d_in; Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); @@ -644,8 +646,8 @@ void test_cuda_lgamma(const Scalar stddev) gpu_out.device(gpu_device) = gpu_in.lgamma(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 72; ++i) { for (int j = 0; j < 97; ++j) { @@ -653,12 +655,13 @@ void test_cuda_lgamma(const Scalar stddev) } } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } +#endif template <typename Scalar> -void test_cuda_digamma() +void test_gpu_digamma() { Tensor<Scalar, 1> in(7); Tensor<Scalar, 1> out(7); @@ -685,12 +688,12 @@ void test_cuda_digamma() Scalar* d_in; Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7); @@ -698,8 +701,8 @@ void test_cuda_digamma() gpu_out.device(gpu_device) = gpu_in.digamma(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 5; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); @@ -708,12 +711,12 @@ void test_cuda_digamma() VERIFY_IS_EQUAL(out(i), expected_out(i)); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_zeta() +void test_gpu_zeta() { Tensor<Scalar, 1> in_x(6); Tensor<Scalar, 1> in_q(6); @@ -747,14 +750,14 @@ void test_cuda_zeta() Scalar* d_in_x; Scalar* d_in_q; Scalar* d_out; - cudaMalloc((void**)(&d_in_x), bytes); - cudaMalloc((void**)(&d_in_q), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in_x), bytes); + gpuMalloc((void**)(&d_in_q), bytes); + 
gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6); @@ -763,8 +766,8 @@ void test_cuda_zeta() gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(out(0), expected_out(0)); VERIFY((std::isnan)(out(3))); @@ -775,13 +778,13 @@ void test_cuda_zeta() } } - cudaFree(d_in_x); - cudaFree(d_in_q); - cudaFree(d_out); + gpuFree(d_in_x); + gpuFree(d_in_q); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_polygamma() +void test_gpu_polygamma() { Tensor<Scalar, 1> in_x(7); Tensor<Scalar, 1> in_n(7); @@ -818,14 +821,14 @@ void test_cuda_polygamma() Scalar* d_in_x; Scalar* d_in_n; Scalar* d_out; - cudaMalloc((void**)(&d_in_x), bytes); - cudaMalloc((void**)(&d_in_n), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in_x), bytes); + gpuMalloc((void**)(&d_in_n), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7); @@ -834,20 +837,20 @@ void test_cuda_polygamma() gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 7; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); } - cudaFree(d_in_x); - cudaFree(d_in_n); - cudaFree(d_out); + gpuFree(d_in_x); + gpuFree(d_in_n); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_igamma() +void test_gpu_igamma() { Tensor<Scalar, 2> a(6, 6); Tensor<Scalar, 2> x(6, 6); @@ -883,14 +886,14 @@ void test_cuda_igamma() Scalar* d_a; Scalar* d_x; Scalar* d_out; - assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess); - assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess); - assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); + assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess); + assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess); + assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess); - cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice 
stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); @@ -899,8 +902,8 @@ void test_cuda_igamma() gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { @@ -912,13 +915,13 @@ void test_cuda_igamma() } } - cudaFree(d_a); - cudaFree(d_x); - cudaFree(d_out); + gpuFree(d_a); + gpuFree(d_x); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_igammac() +void test_gpu_igammac() { Tensor<Scalar, 2> a(6, 6); Tensor<Scalar, 2> x(6, 6); @@ -953,14 +956,14 @@ void test_cuda_igammac() Scalar* d_a; Scalar* d_x; Scalar* d_out; - cudaMalloc((void**)(&d_a), bytes); - cudaMalloc((void**)(&d_x), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_a), bytes); + gpuMalloc((void**)(&d_x), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); @@ -969,8 +972,8 @@ void test_cuda_igammac() gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { @@ -982,13 +985,14 @@ void test_cuda_igammac() } } - cudaFree(d_a); - cudaFree(d_x); - cudaFree(d_out); + gpuFree(d_a); + gpuFree(d_x); + gpuFree(d_out); } +#if EIGEN_GPU_TEST_C99_MATH template <typename Scalar> -void test_cuda_erf(const Scalar stddev) +void test_gpu_erf(const Scalar stddev) { Tensor<Scalar, 2> in(72,97); in.setRandom(); @@ -1000,12 +1004,12 @@ void test_cuda_erf(const Scalar stddev) Scalar* d_in; Scalar* d_out; - assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess); - assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); + assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess); + assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess); - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); @@ -1013,8 +1017,8 @@ void test_cuda_erf(const Scalar stddev) gpu_out.device(gpu_device) = gpu_in.erf(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + 
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -1022,12 +1026,12 @@ void test_cuda_erf(const Scalar stddev)
     }
   }

-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
 }

 template <typename Scalar>
-void test_cuda_erfc(const Scalar stddev)
+void test_gpu_erfc(const Scalar stddev)
 {
   Tensor<Scalar, 2> in(72,97);
   in.setRandom();
@@ -1039,12 +1043,12 @@ void test_cuda_erfc(const Scalar stddev)
   Scalar* d_in;
   Scalar* d_out;
-  cudaMalloc((void**)(&d_in), bytes);
-  cudaMalloc((void**)(&d_out), bytes);
+  gpuMalloc((void**)(&d_in), bytes);
+  gpuMalloc((void**)(&d_out), bytes);

-  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);

-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);

   Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
@@ -1052,8 +1056,8 @@ void test_cuda_erfc(const Scalar stddev)

   gpu_out.device(gpu_device) = gpu_in.erfc();

-  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 97; ++j) {
@@ -1061,12 +1065,73 @@ void test_cuda_erfc(const Scalar stddev)
     }
   }

-  cudaFree(d_in);
-  cudaFree(d_out);
+  gpuFree(d_in);
+  gpuFree(d_out);
+}
+#endif

+template <typename Scalar>
+void test_gpu_ndtri()
+{
+  // Nine sample points (indices 0..8), including both endpoints of the
+  // probability range, so all tensors and maps must have size 9.
+  Tensor<Scalar, 1> in_x(9);
+  Tensor<Scalar, 1> out(9);
+  Tensor<Scalar, 1> expected_out(9);
+  out.setZero();
+
+  in_x(0) = Scalar(1);
+  in_x(1) = Scalar(0.);
+  in_x(2) = Scalar(0.5);
+  in_x(3) = Scalar(0.2);
+  in_x(4) = Scalar(0.8);
+  in_x(5) = Scalar(0.9);
+  in_x(6) = Scalar(0.1);
+  in_x(7) = Scalar(0.99);
+  in_x(8) = Scalar(0.01);
+
+  expected_out(0) = std::numeric_limits<Scalar>::infinity();
+  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
+  expected_out(2) = Scalar(0.0);
+  expected_out(3) = Scalar(-0.8416212335729142);
+  expected_out(4) = Scalar(0.8416212335729142);
+  expected_out(5) = Scalar(1.2815515655446004);
+  expected_out(6) = Scalar(-1.2815515655446004);
+  expected_out(7) = Scalar(2.3263478740408408);
+  expected_out(8) = Scalar(-2.3263478740408408);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_out;
+  gpuMalloc((void**)(&d_in_x), bytes);
+  gpuMalloc((void**)(&d_out), bytes);
+
+  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
+
+  Eigen::GpuStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);
+
+  gpu_out.device(gpu_device) = gpu_in_x.ndtri();
+
+  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
+
+  // The two infinite endpoints must match exactly; the finite quantiles
+  // are checked approximately.
+  VERIFY_IS_EQUAL(out(0), expected_out(0));
+  VERIFY_IS_EQUAL(out(1), expected_out(1));
+
+  for (int i = 2; i < 9; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  gpuFree(d_in_x);
+  gpuFree(d_out);
 }

 template <typename Scalar>
-void test_cuda_betainc()
+void test_gpu_betainc()
 {
   Tensor<Scalar, 1> in_x(125);
   Tensor<Scalar, 1> in_a(125);
@@ -1175,16 +1240,16 @@ void
test_cuda_betainc() Scalar* d_in_a; Scalar* d_in_b; Scalar* d_out; - cudaMalloc((void**)(&d_in_x), bytes); - cudaMalloc((void**)(&d_in_a), bytes); - cudaMalloc((void**)(&d_in_b), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in_x), bytes); + gpuMalloc((void**)(&d_in_a), bytes); + gpuMalloc((void**)(&d_in_b), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125); @@ -1194,8 +1259,8 @@ void test_cuda_betainc() gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 1; i < 125; ++i) { if ((std::isnan)(expected_out(i))) { @@ -1205,83 +1270,374 @@ void test_cuda_betainc() } } - cudaFree(d_in_x); - cudaFree(d_in_a); - cudaFree(d_in_b); - cudaFree(d_out); + gpuFree(d_in_x); + gpuFree(d_in_a); + gpuFree(d_in_b); + gpuFree(d_out); +} + +template <typename Scalar> +void test_gpu_i0e() +{ + Tensor<Scalar, 1> in_x(21); + Tensor<Scalar, 1> out(21); + Tensor<Scalar, 1> expected_out(21); + out.setZero(); + + Array<Scalar, 1, Dynamic> in_x_array(21); + Array<Scalar, 1, Dynamic> expected_out_array(21); + + in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, + -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; + + expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361, + 0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857, + 0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554, + 0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163, + 0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128, + 0.0897803118848; + + for (int i = 0; i < 21; ++i) { + in_x(i) = in_x_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); + + gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice); + + Eigen::GpuStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21); + + gpu_out.device(gpu_device) = gpu_in.bessel_i0e(); + + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); + + for (int i = 0; i < 21; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + gpuFree(d_in); + gpuFree(d_out); +} + +template <typename Scalar> +void test_gpu_i1e() +{ + Tensor<Scalar, 1> in_x(21); + Tensor<Scalar, 1> out(21); + Tensor<Scalar, 1> expected_out(21); + out.setZero(); + + 
Array<Scalar, 1, Dynamic> in_x_array(21); + Array<Scalar, 1, Dynamic> expected_out_array(21); + + in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0, + -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0; + + expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565, + -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293, + -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249, + 0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384, + 0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872, + 0.0875062221833; + + for (int i = 0; i < 21; ++i) { + in_x(i) = in_x_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in; + Scalar* d_out; + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); + + gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice); + + Eigen::GpuStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21); + + gpu_out.device(gpu_device) = gpu_in.bessel_i1e(); + + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); + + for (int i = 0; i < 21; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + gpuFree(d_in); + gpuFree(d_out); } +template <typename Scalar> +void test_gpu_igamma_der_a() +{ + Tensor<Scalar, 1> in_x(30); + Tensor<Scalar, 1> in_a(30); + Tensor<Scalar, 1> out(30); + Tensor<Scalar, 1> expected_out(30); + out.setZero(); + + Array<Scalar, 1, Dynamic> in_a_array(30); + Array<Scalar, 1, Dynamic> in_x_array(30); + Array<Scalar, 1, Dynamic> expected_out_array(30); + + // See special_functions.cpp for the Python code that generates the test data. 
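+ // (As a rough host-side cross-check, not used by this test, d/da igamma(a, x) can be approximated with a central difference of the scalar kernel from unsupported/Eigen/SpecialFunctions: (numext::igamma(a + h, x) - numext::igamma(a - h, x)) / (2 * h) for a small relative step h.)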
+ + in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, + 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0, + 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; + + in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, + 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065, + 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288, + 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458, + 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233, + 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677, + 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; + + expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263, + -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645, + -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314, + -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921, + -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446, + -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363, + -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042, + -0.0125850719397, -0.00455500206958, -0.00476436993148; + + for (int i = 0; i < 30; ++i) { + in_x(i) = in_x_array(i); + in_a(i) = in_a_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_a; + Scalar* d_x; + Scalar* d_out; + gpuMalloc((void**)(&d_a), bytes); + gpuMalloc((void**)(&d_x), bytes); + gpuMalloc((void**)(&d_out), bytes); + + gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice); + + Eigen::GpuStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30); + + gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x); + + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); + + for (int i = 0; i < 30; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } -void test_cxx11_tensor_cuda() + gpuFree(d_a); + gpuFree(d_x); + gpuFree(d_out); +} + +template <typename Scalar> +void test_gpu_gamma_sample_der_alpha() { - CALL_SUBTEST_1(test_cuda_nullary()); - CALL_SUBTEST_1(test_cuda_elementwise_small()); - CALL_SUBTEST_1(test_cuda_elementwise()); - CALL_SUBTEST_1(test_cuda_props()); - CALL_SUBTEST_1(test_cuda_reduction()); - CALL_SUBTEST_2(test_cuda_contraction<ColMajor>()); - CALL_SUBTEST_2(test_cuda_contraction<RowMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d()); - CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d()); - CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>()); - -#if __cplusplus > 199711L + Tensor<Scalar, 1> in_alpha(30); + Tensor<Scalar, 1> in_sample(30); + Tensor<Scalar, 1> out(30); + Tensor<Scalar, 1> expected_out(30); + out.setZero(); + + Array<Scalar, 1, Dynamic> in_alpha_array(30); + Array<Scalar, 1, Dynamic> 
in_sample_array(30); + Array<Scalar, 1, Dynamic> expected_out_array(30); + + // See special_functions.cpp for the Python code that generates the test data. + + in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, + 1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, + 100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; + + in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, + 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065, + 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288, + 1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458, + 7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233, + 92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677, + 968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; + + expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738, + 1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243, + 0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302, + 1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534, + 0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812, + 1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061, + 0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206, + 1.00106492525, 0.97734200649, 1.02198794179; + + for (int i = 0; i < 30; ++i) { + in_alpha(i) = in_alpha_array(i); + in_sample(i) = in_sample_array(i); + expected_out(i) = expected_out_array(i); + } + + std::size_t bytes = in_alpha.size() * sizeof(Scalar); + + Scalar* d_alpha; + Scalar* d_sample; + Scalar* d_out; + gpuMalloc((void**)(&d_alpha), bytes); + gpuMalloc((void**)(&d_sample), bytes); + gpuMalloc((void**)(&d_out), bytes); + + gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice); + + Eigen::GpuStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30); + + gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample); + + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); + + for (int i = 0; i < 30; ++i) { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + + gpuFree(d_alpha); + gpuFree(d_sample); + gpuFree(d_out); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_gpu) +{ + CALL_SUBTEST_1(test_gpu_nullary()); + CALL_SUBTEST_1(test_gpu_elementwise_small()); + CALL_SUBTEST_1(test_gpu_elementwise()); + CALL_SUBTEST_1(test_gpu_props()); + CALL_SUBTEST_1(test_gpu_reduction()); + CALL_SUBTEST_2(test_gpu_contraction<ColMajor>()); + CALL_SUBTEST_2(test_gpu_contraction<RowMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>()); +#if !defined(EIGEN_USE_HIP) +// disable these tests on HIP for now. 
+// they hang; need to investigate and fix + CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>()); +#endif + +#if EIGEN_GPU_TEST_C99_MATH // std::erf, std::erfc, and so on were only added in C++11. We use them // as a golden reference to validate the results produced by Eigen. Therefore // we can only run these tests if we use a C++11 compiler. - CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f)); - CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f)); - CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f)); - CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f)); - - CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0)); - CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0)); - CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01)); - CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001)); - - CALL_SUBTEST_4(test_cuda_erf<float>(1.0f)); - CALL_SUBTEST_4(test_cuda_erf<float>(100.0f)); - CALL_SUBTEST_4(test_cuda_erf<float>(0.01f)); - CALL_SUBTEST_4(test_cuda_erf<float>(0.001f)); - - CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f)); - // CALL_SUBTEST(test_cuda_erfc<float>(100.0f)); - CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f)); - CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f)); - - CALL_SUBTEST_4(test_cuda_erf<double>(1.0)); - CALL_SUBTEST_4(test_cuda_erf<double>(100.0)); - CALL_SUBTEST_4(test_cuda_erf<double>(0.01)); - CALL_SUBTEST_4(test_cuda_erf<double>(0.001)); - - CALL_SUBTEST_4(test_cuda_erfc<double>(1.0)); - // CALL_SUBTEST(test_cuda_erfc<double>(100.0)); - CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST_4(test_cuda_erfc<double>(0.01)); - CALL_SUBTEST_4(test_cuda_erfc<double>(0.001)); - - CALL_SUBTEST_5(test_cuda_digamma<float>()); - CALL_SUBTEST_5(test_cuda_digamma<double>()); - - CALL_SUBTEST_5(test_cuda_polygamma<float>()); - CALL_SUBTEST_5(test_cuda_polygamma<double>()); - - CALL_SUBTEST_5(test_cuda_zeta<float>()); - CALL_SUBTEST_5(test_cuda_zeta<double>()); - - CALL_SUBTEST_5(test_cuda_igamma<float>()); - CALL_SUBTEST_5(test_cuda_igammac<float>()); - - CALL_SUBTEST_5(test_cuda_igamma<double>()); - CALL_SUBTEST_5(test_cuda_igammac<double>()); - - CALL_SUBTEST_6(test_cuda_betainc<float>()); - CALL_SUBTEST_6(test_cuda_betainc<double>()); + CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f)); + CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f)); + CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f)); + CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f)); + + CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0)); + CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0)); + CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01)); + CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001)); + + CALL_SUBTEST_4(test_gpu_erf<float>(1.0f)); + CALL_SUBTEST_4(test_gpu_erf<float>(100.0f)); + CALL_SUBTEST_4(test_gpu_erf<float>(0.01f)); + CALL_SUBTEST_4(test_gpu_erf<float>(0.001f)); + + CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f)); + // CALL_SUBTEST(test_gpu_erfc<float>(100.0f)); + CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f)); // GPU erfc lacks precision for large inputs + CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f)); + CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f)); + + CALL_SUBTEST_4(test_gpu_erf<double>(1.0)); + CALL_SUBTEST_4(test_gpu_erf<double>(100.0)); + CALL_SUBTEST_4(test_gpu_erf<double>(0.01)); + CALL_SUBTEST_4(test_gpu_erf<double>(0.001)); + + CALL_SUBTEST_4(test_gpu_erfc<double>(1.0)); + // CALL_SUBTEST(test_gpu_erfc<double>(100.0)); + CALL_SUBTEST_4(test_gpu_erfc<double>(5.0)); // GPU erfc lacks
precision for large inputs + CALL_SUBTEST_4(test_gpu_erfc<double>(0.01)); + CALL_SUBTEST_4(test_gpu_erfc<double>(0.001)); + +#if !defined(EIGEN_USE_HIP) +// disable these tests on HIP for now. + + CALL_SUBTEST_5(test_gpu_ndtri<float>()); + CALL_SUBTEST_5(test_gpu_ndtri<double>()); + + CALL_SUBTEST_5(test_gpu_digamma<float>()); + CALL_SUBTEST_5(test_gpu_digamma<double>()); + + CALL_SUBTEST_5(test_gpu_polygamma<float>()); + CALL_SUBTEST_5(test_gpu_polygamma<double>()); + + CALL_SUBTEST_5(test_gpu_zeta<float>()); + CALL_SUBTEST_5(test_gpu_zeta<double>()); +#endif + + CALL_SUBTEST_5(test_gpu_igamma<float>()); + CALL_SUBTEST_5(test_gpu_igammac<float>()); + + CALL_SUBTEST_5(test_gpu_igamma<double>()); + CALL_SUBTEST_5(test_gpu_igammac<double>()); + +#if !defined(EIGEN_USE_HIP) +// disable these tests on HIP for now. + CALL_SUBTEST_6(test_gpu_betainc<float>()); + CALL_SUBTEST_6(test_gpu_betainc<double>()); + + CALL_SUBTEST_6(test_gpu_i0e<float>()); + CALL_SUBTEST_6(test_gpu_i0e<double>()); + + CALL_SUBTEST_6(test_gpu_i1e<float>()); + CALL_SUBTEST_6(test_gpu_i1e<double>()); + + CALL_SUBTEST_6(test_gpu_igamma_der_a<float>()); + CALL_SUBTEST_6(test_gpu_igamma_der_a<double>()); + + CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>()); + CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>()); +#endif + #endif } diff --git a/unsupported/test/cxx11_tensor_ifft.cpp b/unsupported/test/cxx11_tensor_ifft.cpp index 5fd88fa6c..c20edd9ac 100644 --- a/unsupported/test/cxx11_tensor_ifft.cpp +++ b/unsupported/test/cxx11_tensor_ifft.cpp @@ -131,7 +131,7 @@ static void test_sub_fft_ifft_invariant(int dim0, int dim1, int dim2, int dim3) } } -void test_cxx11_tensor_ifft() { +EIGEN_DECLARE_TEST(cxx11_tensor_ifft) { CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(4)); CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(16)); CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(32)); diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp new file mode 100644 index 000000000..db1c0206e --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; +using Eigen::RowMajor; +template <typename DataType, int DataLayout, typename IndexType> +static void test_image_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 245; + IndexType sizeDim2 = 343; + IndexType sizeDim3 = 577; + + array<IndexType, 3> input_range = {{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> slice_range = {{sizeDim1-1, sizeDim2, sizeDim3}}; + + Tensor<DataType, 3, DataLayout, IndexType> tensor1(input_range); + Tensor<DataType, 3, DataLayout, IndexType> tensor2(input_range); + Tensor<DataType, 3, DataLayout, IndexType> tensor3(slice_range); + Tensor<DataType, 3, DataLayout, IndexType> tensor3_cpu(slice_range); + + typedef Eigen::DSizes<IndexType, 3> Index3; + Index3 strides1(1L, 1L, 1L); + Index3 indicesStart1(1L, 0L, 0L); + Index3 indicesStop1(sizeDim1, sizeDim2, sizeDim3); + + Index3 strides2(1L, 1L, 1L); + Index3 indicesStart2(0L, 0L, 0L); + Index3 indicesStop2(sizeDim1-1, sizeDim2, sizeDim3); + Eigen::DSizes<IndexType, 3> sizes(sizeDim1-1, sizeDim2, sizeDim3); + + tensor1.setRandom(); + tensor2.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, input_range); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, input_range); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu3(gpu_data3, slice_range); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data2, tensor2.data(),(tensor2.size())*sizeof(DataType)); + gpu3.device(sycl_device) = gpu1.slice(indicesStart1, sizes) - gpu2.slice(indicesStart2, sizes); + sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType)); + + tensor3_cpu = tensor1.stridedSlice(indicesStart1,indicesStop1,strides1) - tensor2.stridedSlice(indicesStart2,indicesStop2,strides2); + + for (IndexType i = 0; i < slice_range[0]; ++i) { + for (IndexType j = 0; j < slice_range[1]; ++j) { + for (IndexType k = 0; k < slice_range[2]; ++k) { + VERIFY_IS_EQUAL(tensor3_cpu(i,j,k), tensor3(i,j,k)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + + +template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_image_op_sycl<DataType, RowMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST(sycl_computing_test_per_device<double>(device)); +#endif + } +} diff --git a/unsupported/test/cxx11_tensor_image_patch.cpp b/unsupported/test/cxx11_tensor_image_patch.cpp index 475c59651..862f1f7f0 100644 ---
a/unsupported/test/cxx11_tensor_image_patch.cpp +++ b/unsupported/test/cxx11_tensor_image_patch.cpp @@ -405,6 +405,57 @@ void test_patch_padding_same() } } +// Verifies that negative padding values computed for SAME padding are clipped +// to zero. +void test_patch_padding_same_negative_padding_clip_to_zero() { + int input_depth = 1; + int input_rows = 15; + int input_cols = 1; + int input_batches = 1; + int ksize = 1; // Corresponds to the Rows and Cols for + // tensor.extract_image_patches<>. + int row_stride = 5; + int col_stride = 1; + // ColMajor + Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches); + // Initializes tensor with incrementing numbers. + for (int i = 0; i < tensor.size(); ++i) { + tensor.data()[i] = i + 1; + } + Tensor<float, 5> result = tensor.extract_image_patches( + ksize, ksize, row_stride, col_stride, 1, 1, PADDING_SAME); + // Row padding would be computed as -2 here and is therefore clipped to 0. + VERIFY_IS_EQUAL(result.coeff(0), 1.0f); + VERIFY_IS_EQUAL(result.coeff(1), 6.0f); + VERIFY_IS_EQUAL(result.coeff(2), 11.0f); + + VERIFY_IS_EQUAL(result.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result.dimension(3), 3); // number of patches + VERIFY_IS_EQUAL(result.dimension(4), input_batches); // number of batches + + // RowMajor + Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0)); + + Tensor<float, 5, RowMajor> result_row_major = + tensor_row_major.extract_image_patches(ksize, ksize, row_stride, + col_stride, 1, 1, PADDING_SAME); + VERIFY_IS_EQUAL(result_row_major.coeff(0), 1.0f); + VERIFY_IS_EQUAL(result_row_major.coeff(1), 6.0f); + VERIFY_IS_EQUAL(result_row_major.coeff(2), 11.0f); + + VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0)); +} + void test_patch_no_extra_dim() { Tensor<float, 3> tensor(2,3,5); @@ -746,7 +797,7 @@ void test_imagenet_patches() } } -void test_cxx11_tensor_image_patch() +EIGEN_DECLARE_TEST(cxx11_tensor_image_patch) { CALL_SUBTEST_1(test_simple_patch()); CALL_SUBTEST_2(test_patch_no_extra_dim()); @@ -754,4 +805,5 @@ void test_cxx11_tensor_image_patch() CALL_SUBTEST_4(test_patch_padding_valid_same_value()); CALL_SUBTEST_5(test_patch_padding_same()); CALL_SUBTEST_6(test_imagenet_patches()); + CALL_SUBTEST_7(test_patch_padding_same_negative_padding_clip_to_zero()); } diff --git a/unsupported/test/cxx11_tensor_image_patch_sycl.cpp b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp new file mode 100644 index 000000000..c1828a0ec --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_patch_sycl.cpp @@ -0,0 +1,1092 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +static const int DataLayout = ColMajor; + +template <typename DataType, typename IndexType> +static void test_simple_image_patch_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + array<IndexType, 4> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + tensor_col_major.setRandom(); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Single pixel patch: ColMajor + array<IndexType, 5> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(4), 7); + + // Single pixel patch: RowMajor + array<IndexType, 5> patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 1, 1, sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize 
=single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index colmajor " << i << " : " + << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] + << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index row major" << i << " : " + << tensor_row_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(4), 7); + + // Entire image patch: RowMajor + patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); + gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); + 
sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + for (IndexType b = 0; b < 7; ++b) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected_col_major = tensor_col_major(d, r-1+i, c-2+j, b); + expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(b, patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + + // 2D patch: ColMajor + patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); + gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(4), 7); + + // 2D patch: RowMajor + patchRowMajorTensorRange={{sizeDim4, sizeDim2*sizeDim3, 2, 2, sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); + gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7); + 
VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2); + + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. + IndexType row_padding = 0; + IndexType col_padding = 0; + IndexType stride = 1; + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 2; ++r) { + for (IndexType c = 0; c < 2; ++c) { + for (IndexType d = 0; d < 2; ++d) { + for (IndexType b = 0; b < 7; ++b) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + // ColMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + } + if (twod_patch_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId, b), expected_col_major); + + // RowMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) { + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + + } + if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_patch_col_major); + sycl_device.deallocate(gpu_data_single_patch_row_major); + sycl_device.deallocate(gpu_data_entire_image_patch_col_major); + sycl_device.deallocate(gpu_data_entire_image_patch_row_major); + sycl_device.deallocate(gpu_data_twod_patch_col_major); + sycl_device.deallocate(gpu_data_twod_patch_row_major); + +} + + +// Verifies VALID padding (no padding) with incrementing values. +template <typename DataType, typename IndexType> +static void test_patch_padding_valid_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 3; + IndexType input_rows = 3; + IndexType input_cols = 3; + IndexType input_batches = 1; + IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
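+ // With PADDING_VALID each spatial dimension yields (input - ksize) / stride + 1 patch positions, so the 3x3 input with a 2x2 kernel and stride 2 below is expected to produce a single patch.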
+ + array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + // ColMajor + array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 1, input_batches}}; + Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 1); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 1, ksize, ksize, input_depth }}; + Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), 
result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. + IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) < input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) < input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_result_col_major); + sycl_device.deallocate(gpu_data_result_row_major); +} + +// Verifies VALID padding (no padding) with the same value. +template <typename DataType, typename IndexType> +static void test_patch_padding_valid_same_value_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 1; + IndexType input_rows = 5; + IndexType input_cols = 5; + IndexType input_batches = 2; + IndexType ksize = 3; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
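+ // PADDING_VALID here gives ((5 - 3) / 2 + 1)^2 = 4 patch positions per image, matching the patch count verified below.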
+ // ColMajor + + array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + gpu_col_major.device(sycl_device)=gpu_col_major.constant(11.0f); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_col_major.data(), gpu_data_col_major, (tensor_col_major.size())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 4, input_batches}}; + Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); + DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); + gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 4); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 4, ksize, ksize, input_depth }}; + Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + VERIFY_IS_EQUAL(result_col_major.dimension(1), 
result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // No padding is carried out. + IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r + i - row_padding; + IndexType col_offset = c + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + +// Verifies SAME padding. +template <typename DataType, typename IndexType> +static void test_patch_padding_same_sycl(const Eigen::SyclDevice& sycl_device){ + IndexType input_depth = 3; + IndexType input_rows = 4; + IndexType input_cols = 2; + IndexType input_batches = 1; + IndexType ksize = 2; // Corresponds to the Rows and Cols for tensor.extract_image_patches<>. + IndexType stride = 2; // Only same stride is supported. 
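+ // PADDING_SAME keeps ceil(input / stride) positions per spatial dimension: ceil(4 / 2) * ceil(2 / 2) = 2 patches per image here.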
+ + // ColMajor + array<IndexType, 4> tensorColMajorRange = {{input_depth, input_rows, input_cols, input_batches}}; + array<IndexType, 4> tensorRowMajorRange = {{input_batches, input_cols, input_rows, input_depth}}; + Tensor<DataType, 4, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + Tensor<DataType, 4, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 4, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(3)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(3), tensor_row_major.dimension(0)); + + // Initializes tensor with incrementing numbers. + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + tensor_col_major.data()[i] = i + 1; + } + +array<IndexType, 5> patchColMajorTensorRange={{input_depth, ksize, ksize, 2, input_batches}}; +Tensor<DataType, 5, DataLayout,IndexType> result_col_major(patchColMajorTensorRange); +size_t patchTensorBuffSize =result_col_major.size()*sizeof(DataType); +DataType* gpu_data_result_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); +TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_result_col_major(gpu_data_result_col_major, patchColMajorTensorRange); +gpu_result_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); +sycl_device.memcpyDeviceToHost(result_col_major.data(), gpu_data_result_col_major, patchTensorBuffSize); + + + VERIFY_IS_EQUAL(result_col_major.dimension(0), input_depth); // depth + VERIFY_IS_EQUAL(result_col_major.dimension(1), ksize); // kernel rows + VERIFY_IS_EQUAL(result_col_major.dimension(2), ksize); // kernel cols + VERIFY_IS_EQUAL(result_col_major.dimension(3), 2); // number of patches + VERIFY_IS_EQUAL(result_col_major.dimension(4), input_batches); // number of batches + + // RowMajor + + array<IndexType, 5> patchRowMajorTensorRange={{input_batches, 2, ksize, ksize, input_depth }}; + Tensor<DataType, 5, RowMajor,IndexType> result_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =result_row_major.size()*sizeof(DataType); + DataType* gpu_data_result_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_result_row_major(gpu_data_result_row_major, patchRowMajorTensorRange); + gpu_result_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME); + sycl_device.memcpyDeviceToHost(result_row_major.data(), gpu_data_result_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(result_col_major.dimension(0), result_row_major.dimension(4)); + 
VERIFY_IS_EQUAL(result_col_major.dimension(1), result_row_major.dimension(3)); + VERIFY_IS_EQUAL(result_col_major.dimension(2), result_row_major.dimension(2)); + VERIFY_IS_EQUAL(result_col_major.dimension(3), result_row_major.dimension(1)); + VERIFY_IS_EQUAL(result_col_major.dimension(4), result_row_major.dimension(0)); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. + IndexType row_padding = 0; + IndexType col_padding = 0; + + for (IndexType i = 0; (i+stride+ksize-1) <= input_rows; i += stride) { // input rows + for (IndexType j = 0; (j+stride+ksize-1) <= input_cols; j += stride) { // input cols + IndexType patchId = i+input_rows*j; + for (IndexType r = 0; r < ksize; ++r) { // patch rows + for (IndexType c = 0; c < ksize; ++c) { // patch cols + for (IndexType d = 0; d < input_depth; ++d) { // depth + for (IndexType b = 0; b < input_batches; ++b) { // batch + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) { + expected_col_major = tensor_col_major(d, row_offset, col_offset, b); + expected_row_major = tensor_row_major(b, col_offset, row_offset, d); + } + // ColMajor + if (result_col_major(d, r, c, patchId, b) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_col_major(d, r, c, patchId, b), expected_col_major); + // RowMajor + if (result_row_major(b, patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. 
+ VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + } +} + + +template <typename DataType, typename IndexType> +static void test_patch_no_extra_dim_sycl(const Eigen::SyclDevice& sycl_device){ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + + // ColMajor + array<IndexType, 3> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> tensorRowMajorRange = {{sizeDim3, sizeDim2, sizeDim1}}; + Tensor<DataType, 3, DataLayout,IndexType> tensor_col_major(tensorColMajorRange); + tensor_col_major.setRandom(); + Tensor<DataType, 3, RowMajor,IndexType> tensor_row_major(tensorRowMajorRange); + + DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType))); + DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange); + TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(),(tensor_col_major.size())*sizeof(DataType)); + gpu_row_major.device(sycl_device)=gpu_col_major.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_row_major.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(tensor_col_major.dimension(0), tensor_row_major.dimension(2)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(1), tensor_row_major.dimension(1)); + VERIFY_IS_EQUAL(tensor_col_major.dimension(2), tensor_row_major.dimension(0)); + + + // Single pixel patch: ColMajor + array<IndexType, 4> patchColMajorTensorRange={{sizeDim1, 1, 1, sizeDim2*sizeDim3}}; + Tensor<DataType, 4, DataLayout,IndexType> single_patch_col_major(patchColMajorTensorRange); + size_t patchTensorBuffSize =single_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_single_patch_col_major(gpu_data_single_patch_col_major, patchColMajorTensorRange); + gpu_single_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_col_major.data(), gpu_data_single_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_col_major.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_col_major.dimension(3), sizeDim2*sizeDim3); + + // Single pixel patch: RowMajor + array<IndexType, 4> patchRowMajorTensorRange={{sizeDim2*sizeDim3, 1, 1, sizeDim1}}; + Tensor<DataType, 4, RowMajor,IndexType> single_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =single_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_single_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_single_patch_row_major(gpu_data_single_patch_row_major, patchRowMajorTensorRange); + gpu_single_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(1, 1); + sycl_device.memcpyDeviceToHost(single_patch_row_major.data(), gpu_data_single_patch_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(single_patch_row_major.dimension(0), sizeDim2*sizeDim3); + 
VERIFY_IS_EQUAL(single_patch_row_major.dimension(1), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(2), 1); + VERIFY_IS_EQUAL(single_patch_row_major.dimension(3), sizeDim1); + + for (IndexType i = 0; i < tensor_col_major.size(); ++i) { + // ColMajor + if (tensor_col_major.data()[i] != single_patch_col_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " << tensor_col_major.data()[i] << " vs " << single_patch_col_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], tensor_col_major.data()[i]); + // RowMajor + if (tensor_row_major.data()[i] != single_patch_row_major.data()[i]) { + std::cout << "Mismatch detected at index " << i << " : " + << tensor_col_major.data()[i] << " vs " + << single_patch_row_major.data()[i] << std::endl; + } + VERIFY_IS_EQUAL(single_patch_row_major.data()[i], + tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]); + VERIFY_IS_EQUAL(single_patch_col_major.data()[i], + single_patch_row_major.data()[i]); + } + + // Entire image patch: ColMajor + patchColMajorTensorRange={{sizeDim1, sizeDim2, sizeDim3, sizeDim2*sizeDim3}}; + Tensor<DataType, 4, DataLayout,IndexType> entire_image_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =entire_image_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_entire_image_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_entire_image_patch_col_major(gpu_data_entire_image_patch_col_major, patchColMajorTensorRange); + gpu_entire_image_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(3, 5); + sycl_device.memcpyDeviceToHost(entire_image_patch_col_major.data(), gpu_data_entire_image_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(2), 5); + VERIFY_IS_EQUAL(entire_image_patch_col_major.dimension(3), 3*5); + + // Entire image patch: RowMajor +patchRowMajorTensorRange={{sizeDim2*sizeDim3, sizeDim3, sizeDim2, sizeDim1}}; +Tensor<DataType, 4, RowMajor,IndexType> entire_image_patch_row_major(patchRowMajorTensorRange); +patchTensorBuffSize =entire_image_patch_row_major.size()*sizeof(DataType); +DataType* gpu_data_entire_image_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); +TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_entire_image_patch_row_major(gpu_data_entire_image_patch_row_major, patchRowMajorTensorRange); +gpu_entire_image_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(3, 5); +sycl_device.memcpyDeviceToHost(entire_image_patch_row_major.data(), gpu_data_entire_image_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2); + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 3; ++r) { + for (IndexType c = 0; c < 5; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) { + expected_col_major = 
tensor_col_major(d, r-1+i, c-2+j); + expected_row_major = tensor_row_major(c-2+j, r-1+i, d); + } + // ColMajor + if (entire_image_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (entire_image_patch_row_major(patchId, c, r, d) != + expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d), + expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + // 2D patch: ColMajor + patchColMajorTensorRange={{sizeDim1, 2, 2, sizeDim2*sizeDim3}}; + Tensor<DataType, 4, DataLayout,IndexType> twod_patch_col_major(patchColMajorTensorRange); + patchTensorBuffSize =twod_patch_col_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_twod_patch_col_major(gpu_data_twod_patch_col_major, patchColMajorTensorRange); + gpu_twod_patch_col_major.device(sycl_device)=gpu_col_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_col_major.data(), gpu_data_twod_patch_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(0), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_col_major.dimension(3), 3*5); + + // 2D patch: RowMajor + patchRowMajorTensorRange={{sizeDim2*sizeDim3, 2, 2, sizeDim1}}; + Tensor<DataType, 4, RowMajor,IndexType> twod_patch_row_major(patchRowMajorTensorRange); + patchTensorBuffSize =twod_patch_row_major.size()*sizeof(DataType); + DataType* gpu_data_twod_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 4, RowMajor,IndexType>> gpu_twod_patch_row_major(gpu_data_twod_patch_row_major, patchRowMajorTensorRange); + gpu_twod_patch_row_major.device(sycl_device)=gpu_row_major.extract_image_patches(2, 2); + sycl_device.memcpyDeviceToHost(twod_patch_row_major.data(), gpu_data_twod_patch_row_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2); + + // Based on the calculation described in TensorTraits.h, padding happens to be 0. 
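For the 2x2 patches taken here, that calculation goes as follows (assuming the usual SAME-padding bookkeeping): out = ceil(in / stride) and pad_total = (out - 1) * stride + ksize - in, of which the leading part, pad_total / 2 rounded down, is applied before the first element. With stride 1 and ksize 2, the rows give pad_total = (3 - 1) * 1 + 2 - 3 = 1 and the columns give (5 - 1) * 1 + 2 - 5 = 1, so the leading padding is floor(1/2) = 0 on both axes.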
+ IndexType row_padding = 0; + IndexType col_padding = 0; + IndexType stride = 1; + + for (IndexType i = 0; i < 3; ++i) { + for (IndexType j = 0; j < 5; ++j) { + IndexType patchId = i+3*j; + for (IndexType r = 0; r < 2; ++r) { + for (IndexType c = 0; c < 2; ++c) { + for (IndexType d = 0; d < 2; ++d) { + DataType expected_col_major = 0.0f; + DataType expected_row_major = 0.0f; + IndexType row_offset = r*stride + i - row_padding; + IndexType col_offset = c*stride + j - col_padding; + // ColMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_col_major.dimension(1) && col_offset < tensor_col_major.dimension(2)) { + expected_col_major = tensor_col_major(d, row_offset, col_offset); + } + if (twod_patch_col_major(d, r, c, patchId) != expected_col_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_col_major(d, r, c, patchId), expected_col_major); + // RowMajor + if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) { + expected_row_major = tensor_row_major(col_offset, row_offset, d); + } + if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl; + } + VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major); + // Check that ColMajor and RowMajor agree. + VERIFY_IS_EQUAL(expected_col_major, expected_row_major); + } + } + } + } + } + + sycl_device.deallocate(gpu_data_col_major); + sycl_device.deallocate(gpu_data_row_major); + sycl_device.deallocate(gpu_data_single_patch_col_major); + sycl_device.deallocate(gpu_data_single_patch_row_major); + sycl_device.deallocate(gpu_data_entire_image_patch_col_major); + sycl_device.deallocate(gpu_data_entire_image_patch_row_major); + sycl_device.deallocate(gpu_data_twod_patch_col_major); + sycl_device.deallocate(gpu_data_twod_patch_row_major); +} + +template <typename DataType, typename IndexType> +static void test_imagenet_patches_sycl(const Eigen::SyclDevice& sycl_device) +{ + // Test the code on typical configurations used by the 'imagenet' benchmarks at + // https://github.com/soumith/convnet-benchmarks + // ColMajor + IndexType sizeDim1 = 3; + IndexType sizeDim2 = 128; + IndexType sizeDim3 = 128; + IndexType sizeDim4 = 16; + array<IndexType, 4> tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout,IndexType> l_in_col_major(tensorColMajorRange); + l_in_col_major.setRandom(); + + DataType* gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>> gpu_l_in_col_major(gpu_data_l_in_col_major, tensorColMajorRange); + + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + + array<IndexType, 5> patchTensorRange={{sizeDim1, 11, 11, sizeDim2*sizeDim3, sizeDim4}}; + Tensor<DataType, 5, DataLayout,IndexType> l_out_col_major(patchTensorRange); + size_t patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_l_out_col_major(gpu_data_l_out_col_major, patchTensorRange); + 
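Since extract_image_patches(11, 11) defaults to stride 1 with SAME padding (an assumption about the default arguments, consistent with the checks below), every input position yields one patch: a 128x128 image produces 128*128 patches of 11x11 per batch, which is exactly what the dimension checks after the extraction assert.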
gpu_l_out_col_major.device(sycl_device)=gpu_l_in_col_major.extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), sizeDim4); + + // RowMajor + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 11, 11, sizeDim1}}; + Tensor<DataType, 5, RowMajor,IndexType> l_out_row_major(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + DataType* gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>> gpu_l_out_row_major(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major.device(sycl_device)=gpu_l_in_col_major.swap_layout().extract_image_patches(11, 11); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), sizeDim4); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), sizeDim2*sizeDim3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), sizeDim1); + + for (IndexType b = 0; b < 16; ++b) { + for (IndexType i = 0; i < 128; ++i) { + for (IndexType j = 0; j < 128; ++j) { + IndexType patchId = i+128*j; + for (IndexType c = 0; c < 11; ++c) { + for (IndexType r = 0; r < 11; ++r) { + for (IndexType d = 0; d < 3; ++d) { + DataType expected = 0.0f; + if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) { + expected = l_in_col_major(d, r-5+i, c-5+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != + expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j + << " r=" << r << " c=" << c << " d=" << d << " b=" << b + << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), + expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 16; + sizeDim2 = 64; + sizeDim3 = 64; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize1(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 9, 9, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize1(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + 
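Each subsequent configuration reuses the same host tensors, so the device buffers are deallocated and reallocated after every resize (the element counts change) before the same extraction-and-compare cycle is repeated, here with 9x9 patches.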
gpu_l_out_col_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 64*64); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + +// RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 9, 9 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize1(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize1.device(sycl_device)=gpu_l_in_col_major_resize1.swap_layout().extract_image_patches(9, 9); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 64; ++i) { + for (IndexType j = 0; j < 64; ++j) { + IndexType patchId = i+64*j; + for (IndexType c = 0; c < 9; ++c) { + for (IndexType r = 0; r < 9; ++r) { + for (IndexType d = 0; d < 16; ++d) { + DataType expected = 0.0f; + if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) { + expected = l_in_col_major(d, r-4+i, c-4+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 32; + sizeDim2 = 16; + sizeDim3 = 16; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize2(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 7, 7, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize2(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + 
gpu_l_out_col_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 16*16); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 7, 7 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize2(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize2.device(sycl_device)=gpu_l_in_col_major_resize2.swap_layout().extract_image_patches(7, 7); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 16; ++i) { + for (IndexType j = 0; j < 16; ++j) { + IndexType patchId = i+16*j; + for (IndexType c = 0; c < 7; ++c) { + for (IndexType r = 0; r < 7; ++r) { + for (IndexType d = 0; d < 32; ++d) { + DataType expected = 0.0f; + if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) { + expected = l_in_col_major(d, r-3+i, c-3+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + + // ColMajor + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sizeDim1 = 64; + sizeDim2 = 13; + sizeDim3 = 13; + sizeDim4 = 32; + tensorColMajorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + l_in_col_major.resize(tensorColMajorRange); + l_in_col_major.setRandom(); + gpu_data_l_in_col_major = static_cast<DataType*>(sycl_device.allocate(l_in_col_major.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4, ColMajor, IndexType>>gpu_l_in_col_major_resize3(gpu_data_l_in_col_major, tensorColMajorRange); + + patchTensorRange={{sizeDim1, 3, 3, sizeDim2*sizeDim3, sizeDim4}}; + l_out_col_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_col_major.size()*sizeof(DataType); + gpu_data_l_out_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>>gpu_l_out_col_major_resize3(gpu_data_l_out_col_major, patchTensorRange); + sycl_device.memcpyHostToDevice(gpu_data_l_in_col_major, l_in_col_major.data(),(l_in_col_major.size())*sizeof(DataType)); + 
gpu_l_out_col_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_col_major.data(), gpu_data_l_out_col_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_col_major.dimension(0), 64); + VERIFY_IS_EQUAL(l_out_col_major.dimension(1), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_col_major.dimension(3), 13*13); + VERIFY_IS_EQUAL(l_out_col_major.dimension(4), 32); + + // RowMajor + sycl_device.deallocate(gpu_data_l_out_row_major); + patchTensorRange={{sizeDim4, sizeDim2*sizeDim3, 3, 3 ,sizeDim1}}; + l_out_row_major.resize(patchTensorRange); + patchTensorBuffSize =l_out_row_major.size()*sizeof(DataType); + gpu_data_l_out_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, RowMajor,IndexType>>gpu_l_out_row_major_resize3(gpu_data_l_out_row_major, patchTensorRange); + gpu_l_out_row_major_resize3.device(sycl_device)=gpu_l_in_col_major_resize3.swap_layout().extract_image_patches(3, 3); + sycl_device.memcpyDeviceToHost(l_out_row_major.data(), gpu_data_l_out_row_major, patchTensorBuffSize); + + VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32); + VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13); + VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3); + VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64); + + for (IndexType b = 0; b < 32; ++b) { + for (IndexType i = 0; i < 13; ++i) { + for (IndexType j = 0; j < 13; ++j) { + IndexType patchId = i+13*j; + for (IndexType c = 0; c < 3; ++c) { + for (IndexType r = 0; r < 3; ++r) { + for (IndexType d = 0; d < 64; ++d) { + DataType expected = 0.0f; + if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) { + expected = l_in_col_major(d, r-1+i, c-1+j, b); + } + // ColMajor + if (l_out_col_major(d, r, c, patchId, b) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_col_major(d, r, c, patchId, b), expected); + // RowMajor + if (l_out_row_major(b, patchId, c, r, d) != expected) { + std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl; + } + VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected); + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_l_in_col_major); + sycl_device.deallocate(gpu_data_l_out_col_major); + sycl_device.deallocate(gpu_data_l_out_row_major); +} + + +template<typename DataType, typename dev_Selector> void sycl_tensor_image_patch_test_per_device(dev_Selector s){ +QueueInterface queueInterface(s); +auto sycl_device = Eigen::SyclDevice(&queueInterface); +test_simple_image_patch_sycl<DataType, int64_t>(sycl_device); +test_patch_padding_valid_sycl<DataType, int64_t>(sycl_device); +test_patch_padding_valid_same_value_sycl<DataType, int64_t>(sycl_device); +test_patch_padding_same_sycl<DataType, int64_t>(sycl_device); +test_patch_no_extra_dim_sycl<DataType, int64_t>(sycl_device); +test_imagenet_patches_sycl<DataType, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_image_patch_sycl) +{ +for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_image_patch_test_per_device<float>(device)); +} +} diff --git a/unsupported/test/cxx11_tensor_index_list.cpp b/unsupported/test/cxx11_tensor_index_list.cpp index 4cf5df666..2166532c8 100644 --- 
a/unsupported/test/cxx11_tensor_index_list.cpp +++ b/unsupported/test/cxx11_tensor_index_list.cpp @@ -22,9 +22,9 @@ static void test_static_index_list() VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0); VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2); EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -167,19 +167,18 @@ static void test_type2indexpair_list() typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>> Dims0; typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::type2indexpair<1,11>, Eigen::type2indexpair<2,12>> Dims2_a; - typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<2,12>> Dims2_b; - typedef Eigen::IndexPairList<Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<DenseIndex>> Dims2_c; + typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<Index>, Eigen::type2indexpair<2,12>> Dims2_b; + typedef Eigen::IndexPairList<Eigen::IndexPair<Index>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<Index>> Dims2_c; - Dims0 d0; Dims2_a d2_a; Dims2_b d2_b; - d2_b.set(1, Eigen::IndexPair<DenseIndex>(1,11)); + d2_b.set(1, Eigen::IndexPair<Index>(1,11)); Dims2_c d2_c; - d2_c.set(0, Eigen::IndexPair<DenseIndex>(Eigen::IndexPair<DenseIndex>(0,10))); - d2_c.set(1, Eigen::IndexPair<DenseIndex>(1,11)); // setting type2indexpair to correct value. - d2_c.set(2, Eigen::IndexPair<DenseIndex>(2,12)); + d2_c.set(0, Eigen::IndexPair<Index>(Eigen::IndexPair<Index>(0,10))); + d2_c.set(1, Eigen::IndexPair<Index>(1,11)); // setting type2indexpair to correct value. 
+ d2_c.set(2, Eigen::IndexPair<Index>(2,12)); VERIFY_IS_EQUAL(d2_a[0].first, 0); VERIFY_IS_EQUAL(d2_a[0].second, 10); @@ -278,9 +277,9 @@ static void test_dynamic_index_list() VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2); VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 2); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 0); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 2); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 0); Tensor<float, 1> result = tensor.sum(reduction_axis); for (int i = 0; i < result.size(); ++i) { @@ -310,10 +309,10 @@ static void test_mixed_index_list() VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1); VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2); VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2); - VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[3]), 3); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[0]), 0); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[1]), 1); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[2]), 2); + VERIFY_IS_EQUAL(static_cast<Index>(reduction_axis[3]), 3); typedef IndexList<type2index<0>, int, type2index<2>, int> ReductionIndices; ReductionIndices reduction_indices; @@ -373,7 +372,7 @@ static void test_dim_check() #endif -void test_cxx11_tensor_index_list() +EIGEN_DECLARE_TEST(cxx11_tensor_index_list) { #ifdef EIGEN_HAS_INDEX_LIST CALL_SUBTEST(test_static_index_list()); diff --git a/unsupported/test/cxx11_tensor_inflation.cpp b/unsupported/test/cxx11_tensor_inflation.cpp index 4997935e9..75089e856 100644 --- a/unsupported/test/cxx11_tensor_inflation.cpp +++ b/unsupported/test/cxx11_tensor_inflation.cpp @@ -74,7 +74,7 @@ static void test_simple_inflation() } } -void test_cxx11_tensor_inflation() +EIGEN_DECLARE_TEST(cxx11_tensor_inflation) { CALL_SUBTEST(test_simple_inflation<ColMajor>()); CALL_SUBTEST(test_simple_inflation<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_inflation_sycl.cpp b/unsupported/test/cxx11_tensor_inflation_sycl.cpp new file mode 100644 index 000000000..521ae0cc3 --- /dev/null +++ b/unsupported/test/cxx11_tensor_inflation_sycl.cpp @@ -0,0 +1,136 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
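Before the test itself, it may help to see the operation on the host alone; a minimal CPU-only sketch of inflate() (sizes and values here are illustrative, not taken from the test):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 1> v(3);
  v.setValues({4.f, 4.f, 4.f});
  // inflate() keeps each value at a multiple of its stride and zero-fills
  // the gaps, so the size becomes (3 - 1) * 3 + 1 = 7.
  Eigen::array<Eigen::Index, 1> strides{{3}};
  Eigen::Tensor<float, 1> w = v.inflate(strides);
  // w is now (4, 0, 0, 4, 0, 0, 4).
  return w.dimension(0) == 7 ? 0 : 1;
}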
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Inflation: along each dimension, the inflated size is
+//   (dim - 1) * stride[dim] + 1,
+// with the original values kept at multiples of the stride and zeros filled
+// in between. For example, a 1-d vector of size 3 with values (4, 4, 4) and
+// an inflation stride of 3 becomes a tensor of size (3 - 1) * 3 + 1 = 7 with
+// values (4, 0, 0, 4, 0, 0, 4).
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_simple_inflation_sycl(const Eigen::SyclDevice &sycl_device) {
+
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange);
+  Tensor<DataType, 4, DataLayout, IndexType> no_stride(tensorRange);
+  tensor.setRandom();
+
+  // Strides of 1 leave the tensor unchanged.
+  array<IndexType, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+  const size_t tensorBuffSize = tensor.size()*sizeof(DataType);
+  DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+  DataType* gpu_data_no_stride = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize));
+
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_tensor(gpu_data_tensor, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_no_stride(gpu_data_no_stride, tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize);
+  gpu_no_stride.device(sycl_device) = gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(no_stride.data(), gpu_data_no_stride, tensorBuffSize);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), sizeDim1);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), sizeDim2);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), sizeDim3);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), sizeDim4);
+
+  for (IndexType i = 0; i < 2; ++i) {
+    for (IndexType j = 0; j < 3; ++j) {
+      for (IndexType k = 0; k < 5; ++k) {
+        for (IndexType l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  // Non-trivial strides: each inflated size is (dim - 1) * stride + 1.
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+
+  IndexType inflatedSizeDim1 = 3;
+  IndexType inflatedSizeDim2 = 9;
+  IndexType inflatedSizeDim3 = 9;
+  IndexType inflatedSizeDim4 = 19;
+  array<IndexType, 4> inflatedTensorRange = {{inflatedSizeDim1, inflatedSizeDim2, inflatedSizeDim3, inflatedSizeDim4}};
+
+  Tensor<DataType, 4, DataLayout, IndexType> inflated(inflatedTensorRange);
+
+  const size_t inflatedTensorBuffSize = inflated.size()*sizeof(DataType);
+  DataType* gpu_data_inflated = static_cast<DataType*>(sycl_device.allocate(inflatedTensorBuffSize));
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu_inflated(gpu_data_inflated, inflatedTensorRange);
+  gpu_inflated.device(sycl_device) = gpu_tensor.inflate(strides);
+  sycl_device.memcpyDeviceToHost(inflated.data(), gpu_data_inflated, inflatedTensorBuffSize);
+
+  VERIFY_IS_EQUAL(inflated.dimension(0), inflatedSizeDim1);
+  VERIFY_IS_EQUAL(inflated.dimension(1), inflatedSizeDim2);
+  VERIFY_IS_EQUAL(inflated.dimension(2), inflatedSizeDim3);
+  VERIFY_IS_EQUAL(inflated.dimension(3), inflatedSizeDim4);
+
+  for (IndexType i = 0; i < inflatedSizeDim1; ++i) {
+    for (IndexType j = 0; j < inflatedSizeDim2; ++j) {
+      for (IndexType k = 0; k < inflatedSizeDim3; ++k) {
+        for (IndexType l = 0; l < inflatedSizeDim4; ++l) {
+ if (i % strides[0] == 0 && + j % strides[1] == 0 && + k % strides[2] == 0 && + l % strides[3] == 0) { + VERIFY_IS_EQUAL(inflated(i,j,k,l), + tensor(i/strides[0], j/strides[1], k/strides[2], l/strides[3])); + } else { + VERIFY_IS_EQUAL(0, inflated(i,j,k,l)); + } + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_no_stride); + sycl_device.deallocate(gpu_data_inflated); +} + +template<typename DataType, typename dev_Selector> void sycl_inflation_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_inflation_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_simple_inflation_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_inflation_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_inflation_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_intdiv.cpp b/unsupported/test/cxx11_tensor_intdiv.cpp index 8e2b70b75..d18a05ec4 100644 --- a/unsupported/test/cxx11_tensor_intdiv.cpp +++ b/unsupported/test/cxx11_tensor_intdiv.cpp @@ -135,7 +135,7 @@ void test_specific() { VERIFY_IS_EQUAL(result, result_op); } -void test_cxx11_tensor_intdiv() +EIGEN_DECLARE_TEST(cxx11_tensor_intdiv) { CALL_SUBTEST_1(test_signed_32bit()); CALL_SUBTEST_2(test_unsigned_32bit()); diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp index 489960529..2c638f9bf 100644 --- a/unsupported/test/cxx11_tensor_io.cpp +++ b/unsupported/test/cxx11_tensor_io.cpp @@ -119,7 +119,7 @@ static void test_output_const() } -void test_cxx11_tensor_io() +EIGEN_DECLARE_TEST(cxx11_tensor_io) { CALL_SUBTEST(test_output_0d<ColMajor>()); CALL_SUBTEST(test_output_0d<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_layout_swap.cpp b/unsupported/test/cxx11_tensor_layout_swap.cpp index ae297a9da..efb333360 100644 --- a/unsupported/test/cxx11_tensor_layout_swap.cpp +++ b/unsupported/test/cxx11_tensor_layout_swap.cpp @@ -54,7 +54,7 @@ static void test_swap_as_lvalue() } -void test_cxx11_tensor_layout_swap() +EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap) { CALL_SUBTEST(test_simple_swap()); CALL_SUBTEST(test_swap_as_lvalue()); diff --git a/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp new file mode 100644 index 000000000..9546b911c --- /dev/null +++ b/unsupported/test/cxx11_tensor_layout_swap_sycl.cpp @@ -0,0 +1,126 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
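As with the other SYCL ports, the host-side semantics being checked can be stated in a few lines; a minimal CPU-only sketch of swap_layout() (dimensions illustrative):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3, Eigen::ColMajor> t(2, 3, 7);
  t.setRandom();
  // swap_layout() reinterprets the same storage in the opposite layout:
  // the dimensions come back reversed and no element moves.
  Eigen::Tensor<float, 3, Eigen::RowMajor> s = t.swap_layout();
  bool ok = s.dimension(0) == 7 && s.dimension(2) == 2 && t(1, 2, 3) == s(3, 2, 1);
  return ok ? 0 : 1;
}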
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <typename DataType, typename IndexType> +static void test_simple_swap_sycl(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 7; + array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}}; + + + Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange); + Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange); + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange); + TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.swap_layout(); + sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + +// Tensor<float, 3, ColMajor> tensor(2,3,7); + //tensor.setRandom(); + +// Tensor<float, 3, RowMajor> tensor2 = tensor.swap_layout(); + VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + +template <typename DataType, typename IndexType> +static void test_swap_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 7; + array<IndexType, 3> tensorColRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array<IndexType, 3> tensorRowRange = {{sizeDim3, sizeDim2, sizeDim1}}; + + Tensor<DataType, 3, ColMajor, IndexType> tensor1(tensorColRange); + Tensor<DataType, 3, RowMajor, IndexType> tensor2(tensorRowRange); + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 3, ColMajor, IndexType>> gpu1(gpu_data1, tensorColRange); + TensorMap<Tensor<DataType, 3, RowMajor, IndexType>> gpu2(gpu_data2, tensorRowRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + gpu2.swap_layout().device(sycl_device)=gpu1; + sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + +// Tensor<float, 3, ColMajor> tensor(2,3,7); +// tensor.setRandom(); + + //Tensor<float, 3, RowMajor> tensor2(7,3,2); +// tensor2.swap_layout() = tensor; + VERIFY_IS_EQUAL(tensor1.dimension(0), tensor2.dimension(2)); + VERIFY_IS_EQUAL(tensor1.dimension(1), tensor2.dimension(1)); + VERIFY_IS_EQUAL(tensor1.dimension(2), tensor2.dimension(0)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 7; ++k) { + 
VERIFY_IS_EQUAL(tensor1(i,j,k), tensor2(k,j,i)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + + +template<typename DataType, typename dev_Selector> void sycl_tensor_layout_swap_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_swap_sycl<DataType, int64_t>(sycl_device); + test_swap_as_lvalue_sycl<DataType, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_layout_swap_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_layout_swap_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_lvalue.cpp b/unsupported/test/cxx11_tensor_lvalue.cpp index 071f5b406..6ba9a212d 100644 --- a/unsupported/test/cxx11_tensor_lvalue.cpp +++ b/unsupported/test/cxx11_tensor_lvalue.cpp @@ -36,7 +36,7 @@ static void test_compound_assignment() } -void test_cxx11_tensor_lvalue() +EIGEN_DECLARE_TEST(cxx11_tensor_lvalue) { CALL_SUBTEST(test_compound_assignment()); } diff --git a/unsupported/test/cxx11_tensor_map.cpp b/unsupported/test/cxx11_tensor_map.cpp index 3db0ee7c0..4d4f68911 100644 --- a/unsupported/test/cxx11_tensor_map.cpp +++ b/unsupported/test/cxx11_tensor_map.cpp @@ -19,8 +19,8 @@ static void test_0d() Tensor<int, 0> scalar1; Tensor<int, 0, RowMajor> scalar2; - TensorMap<Tensor<const int, 0> > scalar3(scalar1.data()); - TensorMap<Tensor<const int, 0, RowMajor> > scalar4(scalar2.data()); + TensorMap<const Tensor<int, 0> > scalar3(scalar1.data()); + TensorMap<const Tensor<int, 0, RowMajor> > scalar4(scalar2.data()); scalar1() = 7; scalar2() = 13; @@ -37,8 +37,8 @@ static void test_1d() Tensor<int, 1> vec1(6); Tensor<int, 1, RowMajor> vec2(6); - TensorMap<Tensor<const int, 1> > vec3(vec1.data(), 6); - TensorMap<Tensor<const int, 1, RowMajor> > vec4(vec2.data(), 6); + TensorMap<const Tensor<int, 1> > vec3(vec1.data(), 6); + TensorMap<const Tensor<int, 1, RowMajor> > vec4(vec2.data(), 6); vec1(0) = 4; vec2(0) = 0; vec1(1) = 8; vec2(1) = 1; @@ -85,8 +85,8 @@ static void test_2d() mat2(1,1) = 4; mat2(1,2) = 5; - TensorMap<Tensor<const int, 2> > mat3(mat1.data(), 2, 3); - TensorMap<Tensor<const int, 2, RowMajor> > mat4(mat2.data(), 2, 3); + TensorMap<const Tensor<int, 2> > mat3(mat1.data(), 2, 3); + TensorMap<const Tensor<int, 2, RowMajor> > mat4(mat2.data(), 2, 3); VERIFY_IS_EQUAL(mat3.rank(), 2); VERIFY_IS_EQUAL(mat3.size(), 6); @@ -129,8 +129,8 @@ static void test_3d() } } - TensorMap<Tensor<const int, 3> > mat3(mat1.data(), 2, 3, 7); - TensorMap<Tensor<const int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7); + TensorMap<const Tensor<int, 3> > mat3(mat1.data(), 2, 3, 7); + TensorMap<const Tensor<int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7); VERIFY_IS_EQUAL(mat3.rank(), 3); VERIFY_IS_EQUAL(mat3.size(), 2*3*7); @@ -265,7 +265,54 @@ static void test_casting() VERIFY_IS_EQUAL(sum1, 861); } -void test_cxx11_tensor_map() +template<typename T> +static const T& add_const(T& value) { + return value; +} + +static void test_0d_const_tensor() +{ + Tensor<int, 0> scalar1; + Tensor<int, 0, RowMajor> scalar2; + + TensorMap<const Tensor<int, 0> > scalar3(add_const(scalar1).data()); + TensorMap<const Tensor<int, 0, RowMajor> > scalar4(add_const(scalar2).data()); + + scalar1() = 7; + scalar2() = 13; + + VERIFY_IS_EQUAL(scalar1.rank(), 0); + VERIFY_IS_EQUAL(scalar1.size(), 1); + + VERIFY_IS_EQUAL(scalar3(), 7); + VERIFY_IS_EQUAL(scalar4(), 13); +} + +static void test_0d_const_tensor_map() +{ + Tensor<int, 0> 
scalar1; + Tensor<int, 0, RowMajor> scalar2; + + const TensorMap<Tensor<int, 0> > scalar3(scalar1.data()); + const TensorMap<Tensor<int, 0, RowMajor> > scalar4(scalar2.data()); + + // Although TensorMap is constant, we still can write to the underlying + // storage, because we map over non-constant Tensor. + scalar3() = 7; + scalar4() = 13; + + VERIFY_IS_EQUAL(scalar1(), 7); + VERIFY_IS_EQUAL(scalar2(), 13); + + // Pointer to the underlying storage is also non-const. + scalar3.data()[0] = 8; + scalar4.data()[0] = 14; + + VERIFY_IS_EQUAL(scalar1(), 8); + VERIFY_IS_EQUAL(scalar2(), 14); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_map) { CALL_SUBTEST(test_0d()); CALL_SUBTEST(test_1d()); @@ -274,4 +321,7 @@ void test_cxx11_tensor_map() CALL_SUBTEST(test_from_tensor()); CALL_SUBTEST(test_casting()); + + CALL_SUBTEST(test_0d_const_tensor()); + CALL_SUBTEST(test_0d_const_tensor_map()); } diff --git a/unsupported/test/cxx11_tensor_math.cpp b/unsupported/test/cxx11_tensor_math.cpp index 61c742a16..82a1a26d8 100644 --- a/unsupported/test/cxx11_tensor_math.cpp +++ b/unsupported/test/cxx11_tensor_math.cpp @@ -39,7 +39,7 @@ static void test_sigmoid() } -void test_cxx11_tensor_math() +EIGEN_DECLARE_TEST(cxx11_tensor_math) { CALL_SUBTEST(test_tanh()); CALL_SUBTEST(test_sigmoid()); diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp new file mode 100644 index 000000000..029653e27 --- /dev/null +++ b/unsupported/test/cxx11_tensor_math_sycl.cpp @@ -0,0 +1,105 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
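Every SYCL math test below follows the same host/device round-trip; a condensed sketch of the pattern (the rank-1 shape and the tanh op are illustrative):

template <typename DataType>
void round_trip_sketch(const Eigen::SyclDevice& sycl_device) {
  Eigen::Tensor<DataType, 1> in(64), out(64), out_cpu(64);
  in.setRandom();
  const size_t bytes = in.size() * sizeof(DataType);
  // Stage the input on the device and bind TensorMaps to the raw buffers.
  DataType* d_in = static_cast<DataType*>(sycl_device.allocate(bytes));
  DataType* d_out = static_cast<DataType*>(sycl_device.allocate(bytes));
  Eigen::TensorMap<Eigen::Tensor<DataType, 1>> g_in(d_in, 64), g_out(d_out, 64);
  sycl_device.memcpyHostToDevice(d_in, in.data(), bytes);
  g_out.device(sycl_device) = g_in.tanh();  // evaluate on the device
  sycl_device.memcpyDeviceToHost(out.data(), d_out, bytes);
  out_cpu = in.tanh();                      // reference result on the host
  // Each element of out is then compared against out_cpu with VERIFY_IS_APPROX.
  sycl_device.deallocate(d_in);
  sycl_device.deallocate(d_out);
}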
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +using Eigen::Tensor; +using Eigen::RowMajor; +template <typename DataType, int DataLayout, typename IndexType> +static void test_tanh_sycl(const Eigen::SyclDevice &sycl_device) +{ + + IndexType sizeDim1 = 4; + IndexType sizeDim2 = 4; + IndexType sizeDim3 = 1; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange); + + in = in.random(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType)); + gpu2.device(sycl_device) = gpu1.tanh(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType)); + + out_cpu=in.tanh(); + + for (int i = 0; i < in.size(); ++i) { + VERIFY_IS_APPROX(out(i), out_cpu(i)); + } +} +template <typename DataType, int DataLayout, typename IndexType> +static void test_sigmoid_sycl(const Eigen::SyclDevice &sycl_device) +{ + + IndexType sizeDim1 = 4; + IndexType sizeDim2 = 4; + IndexType sizeDim3 = 1; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out_cpu(tensorRange); + + in = in.random(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType)); + gpu2.device(sycl_device) = gpu1.sigmoid(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType)); + + out_cpu=in.sigmoid(); + + for (int i = 0; i < in.size(); ++i) { + VERIFY_IS_APPROX(out(i), out_cpu(i)); + } +} + + +template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_tanh_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_tanh_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_sigmoid_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_sigmoid_sycl<DataType, ColMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_mixed_indices.cpp b/unsupported/test/cxx11_tensor_mixed_indices.cpp index 4fba6fdd1..ee2616fd7 100644 --- 
a/unsupported/test/cxx11_tensor_mixed_indices.cpp +++ b/unsupported/test/cxx11_tensor_mixed_indices.cpp @@ -47,7 +47,7 @@ static void test_simple() } -void test_cxx11_tensor_mixed_indices() +EIGEN_DECLARE_TEST(cxx11_tensor_mixed_indices) { CALL_SUBTEST(test_simple()); } diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index f7de43110..ed5d5ade3 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -41,7 +41,29 @@ static void test_simple_reshape() } } -template<typename> +template <typename> +static void test_static_reshape() { +#if defined(EIGEN_HAS_INDEX_LIST) + using Eigen::type2index; + + Tensor<float, 5> tensor(2, 3, 1, 7, 1); + tensor.setRandom(); + + // New dimensions: [2, 3, 7] + Eigen::IndexList<type2index<2>, type2index<3>, type2index<7>> dim; + Tensor<float, 3> reshaped = tensor.reshape(static_cast<Eigen::DSizes<ptrdiff_t,3>>(dim)); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 7; ++k) { + VERIFY_IS_EQUAL(tensor(i, j, 0, k, 0), reshaped(i, j, k)); + } + } + } +#endif +} + +template <typename> static void test_reshape_in_expr() { MatrixXf m1(2,3*5*7*11); MatrixXf m2(3*5*7*11,13); @@ -90,19 +112,19 @@ static void test_reshape_as_lvalue() } } -template<int DataLayout> +template<typename T, int DataLayout> static void test_simple_slice() { - Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); + Tensor<T, 5, DataLayout> tensor(2,3,5,7,11); tensor.setRandom(); - Tensor<float, 5, DataLayout> slice1(1,1,1,1,1); + Tensor<T, 5, DataLayout> slice1(1,1,1,1,1); Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5); Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1); slice1 = tensor.slice(indices, sizes); VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); - Tensor<float, 5, DataLayout> slice2(1,1,2,2,3); + Tensor<T, 5, DataLayout> slice2(1,1,2,2,3); Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5); Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3); slice2 = tensor.slice(indices2, sizes2); @@ -115,20 +137,20 @@ static void test_simple_slice() } } -template<typename=void> +template<typename T> static void test_const_slice() { - const float b[1] = {42}; - TensorMap<Tensor<const float, 1> > m(b, 1); + const T b[1] = {42}; + TensorMap<Tensor<const T, 1> > m(b, 1); DSizes<DenseIndex, 1> offsets; offsets[0] = 0; - TensorRef<Tensor<const float, 1> > slice_ref(m.slice(offsets, m.dimensions())); + TensorRef<Tensor<const T, 1> > slice_ref(m.slice(offsets, m.dimensions())); VERIFY_IS_EQUAL(slice_ref(0), 42); } -template<int DataLayout> +template<typename T, int DataLayout> static void test_slice_in_expr() { - typedef Matrix<float, Dynamic, Dynamic, DataLayout> Mtx; + typedef Matrix<T, Dynamic, Dynamic, DataLayout> Mtx; Mtx m1(7,7); Mtx m2(3,3); m1.setRandom(); @@ -136,10 +158,10 @@ static void test_slice_in_expr() { Mtx m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1); - TensorMap<Tensor<float, 2, DataLayout>> tensor1(m1.data(), 7, 7); - TensorMap<Tensor<float, 2, DataLayout>> tensor2(m2.data(), 3, 3); - Tensor<float, 2, DataLayout> tensor3(3,1); - typedef Tensor<float, 1>::DimensionPair DimPair; + TensorMap<Tensor<T, 2, DataLayout>> tensor1(m1.data(), 7, 7); + TensorMap<Tensor<T, 2, DataLayout>> tensor2(m2.data(), 3, 3); + Tensor<T, 2, DataLayout> tensor3(3,1); + typedef typename Tensor<T, 1>::DimensionPair DimPair; array<DimPair, 1> contract_along{{DimPair(1, 0)}}; Eigen::DSizes<ptrdiff_t, 2> indices1(1,2); @@ -156,28 +178,28 @@ static void test_slice_in_expr() { } // Take 
an arbitrary slice of an arbitrarily sized tensor. - TensorMap<Tensor<const float, 2, DataLayout>> tensor4(m1.data(), 7, 7); - Tensor<float, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35)); + TensorMap<Tensor<const T, 2, DataLayout>> tensor4(m1.data(), 7, 7); + Tensor<T, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35)); for (int i = 0; i < 35; ++i) { VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i])); } } -template<int DataLayout> +template<typename T, int DataLayout> static void test_slice_as_lvalue() { - Tensor<float, 3, DataLayout> tensor1(2,2,7); + Tensor<T, 3, DataLayout> tensor1(2,2,7); tensor1.setRandom(); - Tensor<float, 3, DataLayout> tensor2(2,2,7); + Tensor<T, 3, DataLayout> tensor2(2,2,7); tensor2.setRandom(); - Tensor<float, 3, DataLayout> tensor3(4,3,5); + Tensor<T, 3, DataLayout> tensor3(4,3,5); tensor3.setRandom(); - Tensor<float, 3, DataLayout> tensor4(4,3,2); + Tensor<T, 3, DataLayout> tensor4(4,3,2); tensor4.setRandom(); - Tensor<float, 3, DataLayout> tensor5(10,13,12); + Tensor<T, 3, DataLayout> tensor5(10,13,12); tensor5.setRandom(); - Tensor<float, 3, DataLayout> result(4,5,7); + Tensor<T, 3, DataLayout> result(4,5,7); Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7); Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0); result.slice(first_slice, sizes12) = tensor1; @@ -223,10 +245,10 @@ static void test_slice_as_lvalue() } } -template<int DataLayout> +template<typename T, int DataLayout> static void test_slice_raw_data() { - Tensor<float, 4, DataLayout> tensor(3,5,7,11); + Tensor<T, 4, DataLayout> tensor(3,5,7,11); tensor.setRandom(); Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4); @@ -253,7 +275,7 @@ static void test_slice_raw_data() extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1); auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice()); VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2); - VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0)); + VERIFY_IS_EQUAL(slice3.data(), static_cast<T*>(0)); if (DataLayout == ColMajor) { offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4); @@ -318,15 +340,15 @@ static void test_slice_raw_data() } -template<int DataLayout> +template<typename T, int DataLayout> static void test_strided_slice() { - typedef Tensor<float, 5, DataLayout> Tensor5f; + typedef Tensor<T, 5, DataLayout> Tensor5f; typedef Eigen::DSizes<Eigen::DenseIndex, 5> Index5; - typedef Tensor<float, 2, DataLayout> Tensor2f; + typedef Tensor<T, 2, DataLayout> Tensor2f; typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2; - Tensor<float, 5, DataLayout> tensor(2,3,5,7,11); - Tensor<float, 2, DataLayout> tensor2(7,11); + Tensor<T, 5, DataLayout> tensor(2,3,5,7,11); + Tensor<T, 2, DataLayout> tensor2(7,11); tensor.setRandom(); tensor2.setRandom(); @@ -412,13 +434,13 @@ static void test_strided_slice() } } -template<int DataLayout> +template<typename T, int DataLayout> static void test_strided_slice_write() { - typedef Tensor<float, 2, DataLayout> Tensor2f; + typedef Tensor<T, 2, DataLayout> Tensor2f; typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2; - Tensor<float, 2, DataLayout> tensor(7,11),tensor2(7,11); + Tensor<T, 2, DataLayout> tensor(7,11),tensor2(7,11); tensor.setRandom(); tensor2=tensor; Tensor2f slice(2,3); @@ -438,15 +460,14 @@ static void test_strided_slice_write() } } - -template<int DataLayout> +template<typename T, int DataLayout> static void test_composition() { - Eigen::Tensor<float, 
2, DataLayout> matrix(7, 11); + Eigen::Tensor<T, 2, DataLayout> matrix(7, 11); matrix.setRandom(); const DSizes<ptrdiff_t, 3> newDims(1, 1, 11); - Eigen::Tensor<float, 3, DataLayout> tensor = + Eigen::Tensor<T, 3, DataLayout> tensor = matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims); VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11); @@ -458,28 +479,87 @@ static void test_composition() } } +template<typename T, int DataLayout> +static void test_empty_slice() +{ + Tensor<T, 3, DataLayout> tensor(2,3,5); + tensor.setRandom(); + Tensor<T, 3, DataLayout> copy = tensor; + + // empty size in first dimension + Eigen::DSizes<ptrdiff_t, 3> indices1(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes1(0,1,2); + Tensor<T, 3, DataLayout> slice1(0,1,2); + slice1.setRandom(); + tensor.slice(indices1, sizes1) = slice1; + + // empty size in second dimension + Eigen::DSizes<ptrdiff_t, 3> indices2(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes2(1,0,2); + Tensor<T, 3, DataLayout> slice2(1,0,2); + slice2.setRandom(); + tensor.slice(indices2, sizes2) = slice2; + + // empty size in third dimension + Eigen::DSizes<ptrdiff_t, 3> indices3(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes3(1,1,0); + Tensor<T, 3, DataLayout> slice3(1,1,0); + slice3.setRandom(); + tensor.slice(indices3, sizes3) = slice3; + + // empty size in first and second dimension + Eigen::DSizes<ptrdiff_t, 3> indices4(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes4(0,0,2); + Tensor<T, 3, DataLayout> slice4(0,0,2); + slice4.setRandom(); + tensor.slice(indices4, sizes4) = slice4; + + // empty size in second and third dimension + Eigen::DSizes<ptrdiff_t, 3> indices5(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes5(1,0,0); + Tensor<T, 3, DataLayout> slice5(1,0,0); + slice5.setRandom(); + tensor.slice(indices5, sizes5) = slice5; + + // empty size in all dimensions + Eigen::DSizes<ptrdiff_t, 3> indices6(1,2,3); + Eigen::DSizes<ptrdiff_t, 3> sizes6(0,0,0); + Tensor<T, 3, DataLayout> slice6(0,0,0); + slice6.setRandom(); + tensor.slice(indices6, sizes6) = slice6; + + // none of these operations should change the tensor's components + // because all of the rvalue slices have at least one zero dimension + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 5; ++k) { + VERIFY_IS_EQUAL(tensor(i,j,k), copy(i,j,k)); + } + } + } +} + +#define CALL_SUBTEST_PART(PART) \ + CALL_SUBTEST_##PART + +#define CALL_SUBTESTS_TYPES_LAYOUTS(PART, NAME) \ + CALL_SUBTEST_PART(PART)((NAME<float, ColMajor>())); \ + CALL_SUBTEST_PART(PART)((NAME<float, RowMajor>())); \ + CALL_SUBTEST_PART(PART)((NAME<bool, ColMajor>())); \ + CALL_SUBTEST_PART(PART)((NAME<bool, RowMajor>())) -void test_cxx11_tensor_morphing() +EIGEN_DECLARE_TEST(cxx11_tensor_morphing) { CALL_SUBTEST_1(test_simple_reshape<void>()); - CALL_SUBTEST_1(test_reshape_in_expr<void>()); + CALL_SUBTEST_1(test_static_reshape<void>()); CALL_SUBTEST_1(test_reshape_as_lvalue<void>()); - - CALL_SUBTEST_1(test_simple_slice<ColMajor>()); - CALL_SUBTEST_1(test_simple_slice<RowMajor>()); - CALL_SUBTEST_1(test_const_slice()); - CALL_SUBTEST_2(test_slice_in_expr<ColMajor>()); - CALL_SUBTEST_3(test_slice_in_expr<RowMajor>()); - CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>()); - CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>()); - CALL_SUBTEST_5(test_slice_raw_data<ColMajor>()); - CALL_SUBTEST_5(test_slice_raw_data<RowMajor>()); - - CALL_SUBTEST_6(test_strided_slice_write<ColMajor>()); - CALL_SUBTEST_6(test_strided_slice<ColMajor>()); - 
CALL_SUBTEST_6(test_strided_slice_write<RowMajor>()); - CALL_SUBTEST_6(test_strided_slice<RowMajor>()); - - CALL_SUBTEST_7(test_composition<ColMajor>()); - CALL_SUBTEST_7(test_composition<RowMajor>()); + CALL_SUBTEST_1(test_reshape_in_expr<void>()); + CALL_SUBTEST_1(test_const_slice<float>()); + + CALL_SUBTESTS_TYPES_LAYOUTS(2, test_simple_slice); + CALL_SUBTESTS_TYPES_LAYOUTS(3, test_slice_as_lvalue); + CALL_SUBTESTS_TYPES_LAYOUTS(4, test_slice_raw_data); + CALL_SUBTESTS_TYPES_LAYOUTS(5, test_strided_slice_write); + CALL_SUBTESTS_TYPES_LAYOUTS(6, test_strided_slice); + CALL_SUBTESTS_TYPES_LAYOUTS(7, test_composition); } diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp new file mode 100644 index 000000000..bf001b40f --- /dev/null +++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp @@ -0,0 +1,386 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_reshape(const Eigen::SyclDevice& sycl_device) +{ + typename Tensor<DataType, 5 ,DataLayout, IndexType>::Dimensions dim1(2,3,1,7,1); + typename Tensor<DataType, 3 ,DataLayout, IndexType>::Dimensions dim2(2,3,7); + typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim3(6,7); + typename Tensor<DataType, 2 ,DataLayout, IndexType>::Dimensions dim4(2,21); + + Tensor<DataType, 5, DataLayout, IndexType> tensor1(dim1); + Tensor<DataType, 3, DataLayout, IndexType> tensor2(dim2); + Tensor<DataType, 2, DataLayout, IndexType> tensor3(dim3); + Tensor<DataType, 2, DataLayout, IndexType> tensor4(dim4); + + tensor1.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor3.size()*sizeof(DataType))); + DataType* gpu_data4 = static_cast<DataType*>(sycl_device.allocate(tensor4.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, dim1); + TensorMap<Tensor<DataType, 3,DataLayout, IndexType>> gpu2(gpu_data2, dim2); + TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu3(gpu_data3, dim3); + TensorMap<Tensor<DataType, 2,DataLayout, IndexType>> gpu4(gpu_data4, dim4); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + + gpu2.device(sycl_device)=gpu1.reshape(dim2); + sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor1.size())*sizeof(DataType)); + + gpu3.device(sycl_device)=gpu1.reshape(dim3); + sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType)); + + 
gpu4.device(sycl_device)=gpu1.reshape(dim2).reshape(dim4); + sycl_device.memcpyDeviceToHost(tensor4.data(), gpu_data4,(tensor4.size())*sizeof(DataType)); + for (IndexType i = 0; i < 2; ++i){ + for (IndexType j = 0; j < 3; ++j){ + for (IndexType k = 0; k < 7; ++k){ + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); ///ColMajor + if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) { + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k)); ///ColMajor + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k)); ///ColMajor + } + else{ + //VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k)); /// RowMajor + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j*7 +k)); /// RowMajor + VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i*3 +j,k)); /// RowMajor + } + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); + sycl_device.deallocate(gpu_data4); +} + + +template<typename DataType, int DataLayout, typename IndexType> +static void test_reshape_as_lvalue(const Eigen::SyclDevice& sycl_device) +{ + typename Tensor<DataType, 3, DataLayout, IndexType>::Dimensions dim1(2,3,7); + typename Tensor<DataType, 2, DataLayout, IndexType>::Dimensions dim2(6,7); + typename Tensor<DataType, 5, DataLayout, IndexType>::Dimensions dim3(2,3,1,7,1); + Tensor<DataType, 3, DataLayout, IndexType> tensor(dim1); + Tensor<DataType, 2, DataLayout, IndexType> tensor2d(dim2); + Tensor<DataType, 5, DataLayout, IndexType> tensor5d(dim3); + + tensor.setRandom(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2d.size()*sizeof(DataType))); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(tensor5d.size()*sizeof(DataType))); + + TensorMap< Tensor<DataType, 3, DataLayout, IndexType> > gpu1(gpu_data1, dim1); + TensorMap< Tensor<DataType, 2, DataLayout, IndexType> > gpu2(gpu_data2, dim2); + TensorMap< Tensor<DataType, 5, DataLayout, IndexType> > gpu3(gpu_data3, dim3); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + + gpu2.reshape(dim1).device(sycl_device)=gpu1; + sycl_device.memcpyDeviceToHost(tensor2d.data(), gpu_data2,(tensor2d.size())*sizeof(DataType)); + + gpu3.reshape(dim1).device(sycl_device)=gpu1; + sycl_device.memcpyDeviceToHost(tensor5d.data(), gpu_data3,(tensor5d.size())*sizeof(DataType)); + + + for (IndexType i = 0; i < 2; ++i){ + for (IndexType j = 0; j < 3; ++j){ + for (IndexType k = 0; k < 7; ++k){ + VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k)); + if (static_cast<int>(DataLayout) == static_cast<int>(ColMajor)) { + VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k)); ///ColMajor + } + else{ + VERIFY_IS_EQUAL(tensor2d(i*3 +j,k),tensor(i,j,k)); /// RowMajor + } + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_slice(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 5,DataLayout, IndexType> tensor(tensorRange); + tensor.setRandom(); + array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}}; + Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range); + + 
DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range); + Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5); + Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1); + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.slice(indices, sizes); + sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType)); + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + + + array<IndexType, 5> slice2_range ={{1,1,2,2,3}}; + Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range); + Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5); + Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3); + gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2); + sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType)); + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_strided_slice_as_rhs_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + typedef Eigen::DSizes<IndexType, 5> Index5; + Index5 strides(1L,1L,1L,1L,1L); + Index5 indicesStart(1L,2L,3L,4L,5L); + Index5 indicesStop(2L,3L,4L,5L,6L); + Index5 lengths(1L,1L,1L,1L,1L); + + array<IndexType, 5> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + Tensor<DataType, 5, DataLayout, IndexType> tensor(tensorRange); + tensor.setRandom(); + + array<IndexType, 5> slice1_range ={{1, 1, 1, 1, 1}}; + Tensor<DataType, 5,DataLayout, IndexType> slice1(slice1_range); + Tensor<DataType, 5, DataLayout, IndexType> slice_stride1(slice1_range); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(slice1.size()*sizeof(DataType))); + DataType* gpu_data_stride2 = static_cast<DataType*>(sycl_device.allocate(slice_stride1.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu2(gpu_data2, slice1_range); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride2(gpu_data_stride2, slice1_range); + + Eigen::DSizes<IndexType, 5> indices(1,2,3,4,5); + Eigen::DSizes<IndexType, 5> sizes(1,1,1,1,1); + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.slice(indices, sizes); + sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType)); + + gpu_stride2.device(sycl_device)=gpu1.stridedSlice(indicesStart,indicesStop,strides); + 
sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2,(slice_stride1.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + VERIFY_IS_EQUAL(slice_stride1(0,0,0,0,0), tensor(1,2,3,4,5)); + + array<IndexType, 5> slice2_range ={{1,1,2,2,3}}; + Tensor<DataType, 5,DataLayout, IndexType> slice2(slice2_range); + Tensor<DataType, 5, DataLayout, IndexType> strideSlice2(slice2_range); + + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice2.size()*sizeof(DataType))); + DataType* gpu_data_stride3 = static_cast<DataType*>(sycl_device.allocate(strideSlice2.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu3(gpu_data3, slice2_range); + TensorMap<Tensor<DataType, 5,DataLayout, IndexType>> gpu_stride3(gpu_data_stride3, slice2_range); + Eigen::DSizes<IndexType, 5> indices2(1,1,3,4,5); + Eigen::DSizes<IndexType, 5> sizes2(1,1,2,2,3); + Index5 strides2(1L,1L,1L,1L,1L); + Index5 indicesStart2(1L,1L,3L,4L,5L); + Index5 indicesStop2(2L,2L,5L,6L,8L); + + gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2); + sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType)); + + gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2); + sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3,(strideSlice2.size())*sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + VERIFY_IS_EQUAL(strideSlice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + +template<typename DataType, int DataLayout, typename IndexType> +static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) +{ + typedef Tensor<DataType, 2, DataLayout, IndexType> Tensor2f; + typedef Eigen::DSizes<IndexType, 2> Index2; + IndexType sizeDim1 = 7L; + IndexType sizeDim2 = 11L; + array<IndexType, 2> tensorRange = {{sizeDim1, sizeDim2}}; + Tensor<DataType, 2, DataLayout, IndexType> tensor(tensorRange),tensor2(tensorRange); + IndexType sliceDim1 = 2; + IndexType sliceDim2 = 3; + array<IndexType, 2> sliceRange = {{sliceDim1, sliceDim2}}; + Tensor2f slice(sliceRange); + Index2 strides(1L,1L); + Index2 indicesStart(3L,4L); + Index2 indicesStop(5L,7L); + Index2 lengths(2L,3L); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(slice.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, tensorRange); + TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu3(gpu_data3, sliceRange); + + + tensor.setRandom(); + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1; + + slice.setRandom(); + sycl_device.memcpyHostToDevice(gpu_data3, slice.data(),(slice.size())*sizeof(DataType)); + + + gpu1.slice(indicesStart,lengths).device(sycl_device)=gpu3; + gpu2.stridedSlice(indicesStart,indicesStop,strides).device(sycl_device)=gpu3; + sycl_device.memcpyDeviceToHost(tensor.data(), gpu_data1,(tensor.size())*sizeof(DataType)); 
+ sycl_device.memcpyDeviceToHost(tensor2.data(), gpu_data2,(tensor2.size())*sizeof(DataType)); + + for(IndexType i=0;i<sizeDim1;i++) + for(IndexType j=0;j<sizeDim2;j++){ + VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j)); + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + +template <typename OutIndex, typename DSizes> +Eigen::array<OutIndex, DSizes::count> To32BitDims(const DSizes& in) { + Eigen::array<OutIndex, DSizes::count> out; + for (int i = 0; i < DSizes::count; ++i) { + out[i] = in[i]; + } + return out; +} + +template <class DataType, int DataLayout, typename IndexType, typename ConvertedIndexType> +int run_eigen(const SyclDevice& sycl_device) { + using TensorI64 = Tensor<DataType, 5, DataLayout, IndexType>; + using TensorI32 = Tensor<DataType, 5, DataLayout, ConvertedIndexType>; + using TensorMI64 = TensorMap<TensorI64>; + using TensorMI32 = TensorMap<TensorI32>; + Eigen::array<IndexType, 5> tensor_range{{4, 1, 1, 1, 6}}; + Eigen::array<IndexType, 5> slice_range{{4, 1, 1, 1, 3}}; + + TensorI64 out_tensor_gpu(tensor_range); + TensorI64 out_tensor_cpu(tensor_range); + out_tensor_cpu.setRandom(); + + TensorI64 sub_tensor(slice_range); + sub_tensor.setRandom(); + + DataType* out_gpu_data = static_cast<DataType*>(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType))); + DataType* sub_gpu_data = static_cast<DataType*>(sycl_device.allocate(sub_tensor.size() * sizeof(DataType))); + TensorMI64 out_gpu(out_gpu_data, tensor_range); + TensorMI64 sub_gpu(sub_gpu_data, slice_range); + + sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType)); + sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType)); + + Eigen::array<ConvertedIndexType, 5> slice_offset_32{{0, 0, 0, 0, 3}}; + Eigen::array<ConvertedIndexType, 5> slice_range_32{{4, 1, 1, 1, 3}}; + TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims<ConvertedIndexType>(out_tensor_cpu.dimensions())); + TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims<ConvertedIndexType>(sub_tensor.dimensions())); + TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims<ConvertedIndexType>(out_gpu.dimensions())); + TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims<ConvertedIndexType>(sub_gpu.dimensions())); + + out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32; + + out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32; + + sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType)); + int has_err = 0; + for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) { + auto exp = out_tensor_cpu(i); + auto val = out_tensor_gpu(i); + if (val != exp) { + std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl; + has_err = 1; + } + } + sycl_device.deallocate(out_gpu_data); + sycl_device.deallocate(sub_gpu_data); + return has_err; +} + +template<typename DataType, typename dev_Selector> void sycl_morphing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_slice<DataType, RowMajor, int64_t>(sycl_device); + test_simple_slice<DataType, ColMajor, int64_t>(sycl_device); + test_simple_reshape<DataType, RowMajor, int64_t>(sycl_device); + test_simple_reshape<DataType, ColMajor, int64_t>(sycl_device); + test_reshape_as_lvalue<DataType, RowMajor, int64_t>(sycl_device); + test_reshape_as_lvalue<DataType, ColMajor, 
int64_t>(sycl_device); + test_strided_slice_write_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_strided_slice_write_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_strided_slice_as_rhs_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_strided_slice_as_rhs_sycl<DataType, RowMajor, int64_t>(sycl_device); + run_eigen<float, RowMajor, long, int>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_morphing_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_move.cpp b/unsupported/test/cxx11_tensor_move.cpp new file mode 100644 index 000000000..a2982319f --- /dev/null +++ b/unsupported/test/cxx11_tensor_move.cpp @@ -0,0 +1,76 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Viktor Csomor <viktor.csomor@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +#include <Eigen/CXX11/Tensor> +#include <utility> + +using Eigen::Tensor; +using Eigen::RowMajor; + +static void calc_indices(int i, int& x, int& y, int& z) +{ + x = i / 4; + y = (i % 4) / 2; + z = i % 2; +} + +static void test_move() +{ + int x; + int y; + int z; + + Tensor<int,3> tensor1(2, 2, 2); + Tensor<int,3,RowMajor> tensor2(2, 2, 2); + + for (int i = 0; i < 8; i++) + { + calc_indices(i, x, y, z); + tensor1(x,y,z) = i; + tensor2(x,y,z) = 2 * i; + } + + // Invokes the move constructor. + Tensor<int,3> moved_tensor1 = std::move(tensor1); + Tensor<int,3,RowMajor> moved_tensor2 = std::move(tensor2); + + VERIFY_IS_EQUAL(tensor1.size(), 0); + VERIFY_IS_EQUAL(tensor2.size(), 0); + + for (int i = 0; i < 8; i++) + { + calc_indices(i, x, y, z); + VERIFY_IS_EQUAL(moved_tensor1(x,y,z), i); + VERIFY_IS_EQUAL(moved_tensor2(x,y,z), 2 * i); + } + + Tensor<int,3> moved_tensor3(2,2,2); + Tensor<int,3,RowMajor> moved_tensor4(2,2,2); + + moved_tensor3.setZero(); + moved_tensor4.setZero(); + + // Invokes the move assignment operator. 
+ moved_tensor3 = std::move(moved_tensor1); + moved_tensor4 = std::move(moved_tensor2); + + for (int i = 0; i < 8; i++) + { + calc_indices(i, x, y, z); + VERIFY_IS_EQUAL(moved_tensor3(x,y,z), i); + VERIFY_IS_EQUAL(moved_tensor4(x,y,z), 2 * i); + } +} + +EIGEN_DECLARE_TEST(cxx11_tensor_move) +{ + CALL_SUBTEST(test_move()); +} diff --git a/unsupported/test/cxx11_tensor_notification.cpp b/unsupported/test/cxx11_tensor_notification.cpp index c946007b8..8e8165302 100644 --- a/unsupported/test/cxx11_tensor_notification.cpp +++ b/unsupported/test/cxx11_tensor_notification.cpp @@ -9,38 +9,21 @@ #define EIGEN_USE_THREADS +#include <atomic> + #include <stdlib.h> #include "main.h" #include <Eigen/CXX11/Tensor> -#if EIGEN_OS_WIN || EIGEN_OS_WIN64 -#include <windows.h> -void sleep(int seconds) { - Sleep(seconds*1000); -} -#else -#include <unistd.h> -#endif - - -namespace { - -void WaitAndAdd(Eigen::Notification* n, int* counter) { - n->Wait(); - *counter = *counter + 1; -} - -} // namespace - static void test_notification_single() { ThreadPool thread_pool(1); - int counter = 0; + std::atomic<int> counter(0); Eigen::Notification n; - std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter); + auto func = [&n, &counter](){ n.Wait(); ++counter;}; thread_pool.Schedule(func); - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); // The thread should be waiting for the notification. VERIFY_IS_EQUAL(counter, 0); @@ -48,7 +31,7 @@ static void test_notification_single() // Unblock the thread n.Notify(); - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); // Verify the counter has been incremented VERIFY_IS_EQUAL(counter, 1); @@ -60,21 +43,21 @@ static void test_notification_multiple() { ThreadPool thread_pool(1); - int counter = 0; + std::atomic<int> counter(0); Eigen::Notification n; - std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter); + auto func = [&n, &counter](){ n.Wait(); ++counter;}; thread_pool.Schedule(func); thread_pool.Schedule(func); thread_pool.Schedule(func); thread_pool.Schedule(func); - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); VERIFY_IS_EQUAL(counter, 0); n.Notify(); - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); VERIFY_IS_EQUAL(counter, 4); } -void test_cxx11_tensor_notification() +EIGEN_DECLARE_TEST(cxx11_tensor_notification) { CALL_SUBTEST(test_notification_single()); CALL_SUBTEST(test_notification_multiple()); diff --git a/unsupported/test/cxx11_tensor_of_complex.cpp b/unsupported/test/cxx11_tensor_of_complex.cpp index e9d1b2d3c..99e18076a 100644 --- a/unsupported/test/cxx11_tensor_of_complex.cpp +++ b/unsupported/test/cxx11_tensor_of_complex.cpp @@ -94,7 +94,7 @@ static void test_contractions() } -void test_cxx11_tensor_of_complex() +EIGEN_DECLARE_TEST(cxx11_tensor_of_complex) { CALL_SUBTEST(test_additions()); CALL_SUBTEST(test_abs()); diff --git a/unsupported/test/cxx11_tensor_of_const_values.cpp b/unsupported/test/cxx11_tensor_of_const_values.cpp index f179a0c21..344d678ef 100644 --- a/unsupported/test/cxx11_tensor_of_const_values.cpp +++ b/unsupported/test/cxx11_tensor_of_const_values.cpp @@ -97,7 +97,7 @@ static void test_plus_equal() } -void test_cxx11_tensor_of_const_values() +EIGEN_DECLARE_TEST(cxx11_tensor_of_const_values) { CALL_SUBTEST(test_assign()); CALL_SUBTEST(test_plus()); diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu index 2f86980a2..30bcc1d28 100644 --- 
a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu @@ -9,21 +9,19 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> + using Eigen::Tensor; template<typename> -void test_cuda_numext() { - Eigen::CudaStreamDevice stream; +void test_gpu_numext() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -59,14 +57,14 @@ void test_cuda_numext() { } -#ifdef EIGEN_HAS_CUDA_FP16 +#ifdef EIGEN_HAS_GPU_FP16 template<typename> -void test_cuda_conversion() { - Eigen::CudaStreamDevice stream; +void test_gpu_conversion() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; - + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float)); @@ -97,8 +95,8 @@ void test_cuda_conversion() { } template<typename> -void test_cuda_unary() { - Eigen::CudaStreamDevice stream; +void test_gpu_unary() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -134,8 +132,8 @@ void test_cuda_unary() { } template<typename> -void test_cuda_elementwise() { - Eigen::CudaStreamDevice stream; +void test_gpu_elementwise() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -176,8 +174,8 @@ void test_cuda_elementwise() { } template<typename> -void test_cuda_trancendental() { - Eigen::CudaStreamDevice stream; +void test_gpu_trancendental() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -200,6 +198,8 @@ void test_cuda_trancendental() { Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_half(d_res3_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res4_float(d_res3_float, num_elem); gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f); @@ -207,6 +207,7 @@ void test_cuda_trancendental() { gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>(); gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>(); gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>(); + gpu_res4_float.device(gpu_device) = gpu_float3.expm1().cast<Eigen::half>(); gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>(); gpu_res1_half.device(gpu_device) = gpu_res1_half.exp(); @@ -217,6 +218,9 @@ void test_cuda_trancendental() { gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p(); + gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); + gpu_res3_half.device(gpu_device) = gpu_res3_half.expm1(); + Tensor<float, 1> input1(num_elem); 
Tensor<Eigen::half, 1> half_prec1(num_elem); Tensor<Eigen::half, 1> full_prec1(num_elem); @@ -243,7 +247,7 @@ void test_cuda_trancendental() { } for (int i = 0; i < num_elem; ++i) { std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl; - if(std::abs(input2(i)-1.f)<0.05f) // log lacks accurary nearby 1 + if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy nearby 1 VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f)); else VERIFY_IS_APPROX(full_prec2(i), half_prec2(i)); @@ -264,8 +268,8 @@ void test_cuda_trancendental() { } template<typename> -void test_cuda_contractions() { - Eigen::CudaStreamDevice stream; +void test_gpu_contractions() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int rows = 23; int cols = 23; @@ -315,36 +319,32 @@ void test_cuda_contractions() { } template<typename> -void test_cuda_reductions(int size1, int size2, int redux) { +void test_gpu_reductions(int size1, int size2, int redux) { std::cout << "Reducing " << size1 << " by " << size2 - << " tensor along dim " << redux << std::endl; + << " tensor along dim " << redux << std::endl; - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = size1*size2; int result_size = (redux == 1 ? size1 : size2); - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( - d_float1, size1, size2); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( - d_float2, size1, size2); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float( + d_float, size1, size2); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half( d_res_half, result_size); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float( d_res_float, result_size); - gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f; - gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f; + gpu_float.device(gpu_device) = gpu_float.random() * 2.0f; - Eigen::array<int, 1> redux_dim = {{redux}}; - gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>(); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim); + Eigen::array<int, 1> redux_dim = {redux}; + gpu_res_float.device(gpu_device) = gpu_float.sum(redux_dim).cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum(redux_dim); Tensor<Eigen::half, 1> half_prec(result_size); Tensor<Eigen::half, 1> full_prec(result_size); @@ -357,50 +357,45 @@ void test_cuda_reductions(int size1, int size2, int redux) { VERIFY_IS_APPROX(full_prec(i), half_prec(i)); } - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } template<typename> -void test_cuda_reductions() { - test_cuda_reductions<void>(13, 13, 0); - test_cuda_reductions<void>(13, 13, 1); +void test_gpu_reductions() { + test_gpu_reductions<void>(13, 13, 0); + 
test_gpu_reductions<void>(13, 13, 1); - test_cuda_reductions<void>(35, 36, 0); - test_cuda_reductions<void>(35, 36, 1); + test_gpu_reductions<void>(35, 36, 0); + test_gpu_reductions<void>(35, 36, 1); - test_cuda_reductions<void>(36, 35, 0); - test_cuda_reductions<void>(36, 35, 1); + test_gpu_reductions<void>(36, 35, 0); + test_gpu_reductions<void>(36, 35, 1); } template<typename> -void test_cuda_full_reductions() { - Eigen::CudaStreamDevice stream; +void test_gpu_full_reductions() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int size = 13; int num_elem = size*size; - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1( - d_float1, size, size); - Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2( - d_float2, size, size); + Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float( + d_float, size, size); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half( d_res_half); Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float( d_res_float); - gpu_float1.device(gpu_device) = gpu_float1.random(); - gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_float.device(gpu_device) = gpu_float.random(); - gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>(); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(); + gpu_res_float.device(gpu_device) = gpu_float.sum().cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().sum(); Tensor<Eigen::half, 0> half_prec; Tensor<Eigen::half, 0> full_prec; @@ -410,24 +405,23 @@ void test_cuda_full_reductions() { VERIFY_IS_APPROX(full_prec(), half_prec()); - gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>(); - gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum(); + gpu_res_float.device(gpu_device) = gpu_float.maximum().cast<Eigen::half>(); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().maximum(); gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); gpu_device.synchronize(); VERIFY_IS_APPROX(full_prec(), half_prec()); - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } template<typename> -void test_cuda_forced_evals() { +void test_gpu_forced_evals() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -440,7 +434,7 @@ void test_cuda_forced_evals() { d_float, num_elem); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1( d_res_half1, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2( + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2( d_res_half2, num_elem); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float( d_res_float, num_elem); @@ -457,7 +451,7 @@ void test_cuda_forced_evals() { Tensor<float, 1> 
half_prec2(num_elem); Tensor<float, 1> full_prec(num_elem); gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float)); - gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half1, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float)); gpu_device.synchronize(); @@ -475,20 +469,20 @@ void test_cuda_forced_evals() { #endif -void test_cxx11_tensor_of_float16_cuda() +EIGEN_DECLARE_TEST(cxx11_tensor_of_float16_gpu) { - CALL_SUBTEST_1(test_cuda_numext<void>()); - -#ifdef EIGEN_HAS_CUDA_FP16 - CALL_SUBTEST_1(test_cuda_conversion<void>()); - CALL_SUBTEST_1(test_cuda_unary<void>()); - CALL_SUBTEST_1(test_cuda_elementwise<void>()); - CALL_SUBTEST_1(test_cuda_trancendental<void>()); - CALL_SUBTEST_2(test_cuda_contractions<void>()); - CALL_SUBTEST_3(test_cuda_reductions<void>()); - CALL_SUBTEST_4(test_cuda_full_reductions<void>()); - CALL_SUBTEST_5(test_cuda_forced_evals<void>()); + CALL_SUBTEST_1(test_gpu_numext<void>()); + +#ifdef EIGEN_HAS_GPU_FP16 + CALL_SUBTEST_1(test_gpu_conversion<void>()); + CALL_SUBTEST_1(test_gpu_unary<void>()); + CALL_SUBTEST_1(test_gpu_elementwise<void>()); + CALL_SUBTEST_1(test_gpu_trancendental<void>()); + CALL_SUBTEST_2(test_gpu_contractions<void>()); + CALL_SUBTEST_3(test_gpu_reductions<void>()); + CALL_SUBTEST_4(test_gpu_full_reductions<void>()); + CALL_SUBTEST_5(test_gpu_forced_evals<void>()); #else - std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; + std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl; #endif } diff --git a/unsupported/test/cxx11_tensor_of_strings.cpp b/unsupported/test/cxx11_tensor_of_strings.cpp index 4ef9aed91..159656276 100644 --- a/unsupported/test/cxx11_tensor_of_strings.cpp +++ b/unsupported/test/cxx11_tensor_of_strings.cpp @@ -141,7 +141,7 @@ static void test_initialization() } -void test_cxx11_tensor_of_strings() +EIGEN_DECLARE_TEST(cxx11_tensor_of_strings) { // Beware: none of this is likely to ever work on a GPU. CALL_SUBTEST(test_assign()); diff --git a/unsupported/test/cxx11_tensor_padding.cpp b/unsupported/test/cxx11_tensor_padding.cpp index ffa19896e..b8a329deb 100644 --- a/unsupported/test/cxx11_tensor_padding.cpp +++ b/unsupported/test/cxx11_tensor_padding.cpp @@ -84,7 +84,7 @@ static void test_padded_expr() } } -void test_cxx11_tensor_padding() +EIGEN_DECLARE_TEST(cxx11_tensor_padding) { CALL_SUBTEST(test_simple_padding<ColMajor>()); CALL_SUBTEST(test_simple_padding<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_padding_sycl.cpp b/unsupported/test/cxx11_tensor_padding_sycl.cpp new file mode 100644 index 000000000..727a9ffd7 --- /dev/null +++ b/unsupported/test/cxx11_tensor_padding_sycl.cpp @@ -0,0 +1,157 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + + +template<typename DataType, int DataLayout, typename IndexType> +static void test_simple_padding(const Eigen::SyclDevice& sycl_device) +{ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + tensor.setRandom(); + + array<std::pair<IndexType, IndexType>, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + + IndexType padedSizeDim1 = 2; + IndexType padedSizeDim2 = 6; + IndexType padedSizeDim3 = 12; + IndexType padedSizeDim4 = 7; + array<IndexType, 4> padedtensorRange = {{padedSizeDim1, padedSizeDim2, padedSizeDim3, padedSizeDim4}}; + + Tensor<DataType, 4, DataLayout, IndexType> padded(padedtensorRange); + + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(padded.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu2(gpu_data2, padedtensorRange); + + VERIFY_IS_EQUAL(padded.dimension(0), 2+0); + VERIFY_IS_EQUAL(padded.dimension(1), 3+3); + VERIFY_IS_EQUAL(padded.dimension(2), 5+7); + VERIFY_IS_EQUAL(padded.dimension(3), 7+0); + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.pad(paddings); + sycl_device.memcpyDeviceToHost(padded.data(), gpu_data2,(padded.size())*sizeof(DataType)); + for (IndexType i = 0; i < padedSizeDim1; ++i) { + for (IndexType j = 0; j < padedSizeDim2; ++j) { + for (IndexType k = 0; k < padedSizeDim3; ++k) { + for (IndexType l = 0; l < padedSizeDim4; ++l) { + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f); + } + } + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + +template<typename DataType, int DataLayout, typename IndexType> +static void test_padded_expr(const Eigen::SyclDevice& sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + tensor.setRandom(); + + array<std::pair<IndexType, IndexType>, 4> paddings; + paddings[0] = std::make_pair(0, 0); + paddings[1] = std::make_pair(2, 1); + paddings[2] = std::make_pair(3, 4); + paddings[3] = std::make_pair(0, 0); + + Eigen::DSizes<IndexType, 2> reshape_dims; + reshape_dims[0] = 12; + reshape_dims[1] = 84; + + + Tensor<DataType, 2, DataLayout, IndexType> result(reshape_dims); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(result.size()*sizeof(DataType))); + TensorMap<Tensor<DataType, 4,DataLayout,IndexType>> gpu1(gpu_data1, 
tensorRange); + TensorMap<Tensor<DataType, 2,DataLayout,IndexType>> gpu2(gpu_data2, reshape_dims); + + + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.pad(paddings).reshape(reshape_dims); + sycl_device.memcpyDeviceToHost(result.data(), gpu_data2,(result.size())*sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 6; ++j) { + for (IndexType k = 0; k < 12; ++k) { + for (IndexType l = 0; l < 7; ++l) { + const float result_value = DataLayout == ColMajor ? + result(i+2*j,k+12*l) : result(j+6*i,l+7*k); + if (j >= 2 && j < 5 && k >= 3 && k < 8) { + VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l)); + } else { + VERIFY_IS_EQUAL(result_value, 0.0f); + } + } + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + +template<typename DataType, typename dev_Selector> void sycl_padding_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_padding<DataType, RowMajor, int64_t>(sycl_device); + test_simple_padding<DataType, ColMajor, int64_t>(sycl_device); + test_padded_expr<DataType, RowMajor, int64_t>(sycl_device); + test_padded_expr<DataType, ColMajor, int64_t>(sycl_device); + +} +EIGEN_DECLARE_TEST(cxx11_tensor_padding_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_padding_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_patch.cpp b/unsupported/test/cxx11_tensor_patch.cpp index 434359730..498ab8ca7 100644 --- a/unsupported/test/cxx11_tensor_patch.cpp +++ b/unsupported/test/cxx11_tensor_patch.cpp @@ -164,7 +164,7 @@ static void test_simple_patch() } } -void test_cxx11_tensor_patch() +EIGEN_DECLARE_TEST(cxx11_tensor_patch) { CALL_SUBTEST(test_simple_patch<ColMajor>()); CALL_SUBTEST(test_simple_patch<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_patch_sycl.cpp b/unsupported/test/cxx11_tensor_patch_sycl.cpp new file mode 100644 index 000000000..7f92bec78 --- /dev/null +++ b/unsupported/test/cxx11_tensor_patch_sycl.cpp @@ -0,0 +1,249 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" + +#include <Eigen/CXX11/Tensor> + +using Eigen::Tensor; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_patch_sycl(const Eigen::SyclDevice& sycl_device){ + + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + array<IndexType, 5> patchTensorRange; + if (DataLayout == ColMajor) { + patchTensorRange = {{1, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3*sizeDim4}}; + }else{ + patchTensorRange = {{sizeDim1*sizeDim2*sizeDim3*sizeDim4,1, 1, 1, 1}}; + } + + Tensor<DataType, 4, DataLayout,IndexType> tensor(tensorRange); + Tensor<DataType, 5, DataLayout,IndexType> no_patch(patchTensorRange); + + tensor.setRandom(); + + array<ptrdiff_t, 4> patch_dims; + patch_dims[0] = 1; + patch_dims[1] = 1; + patch_dims[2] = 1; + patch_dims[3] = 1; + + const size_t tensorBuffSize =tensor.size()*sizeof(DataType); + size_t patchTensorBuffSize =no_patch.size()*sizeof(DataType); + DataType* gpu_data_tensor = static_cast<DataType*>(sycl_device.allocate(tensorBuffSize)); + DataType* gpu_data_no_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + + TensorMap<Tensor<DataType, 4, DataLayout,IndexType>> gpu_tensor(gpu_data_tensor, tensorRange); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_no_patch(gpu_data_no_patch, patchTensorRange); + + sycl_device.memcpyHostToDevice(gpu_data_tensor, tensor.data(), tensorBuffSize); + gpu_no_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(no_patch.data(), gpu_data_no_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(no_patch.dimension(0), 1); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size()); + } else { + VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size()); + VERIFY_IS_EQUAL(no_patch.dimension(1), 1); + VERIFY_IS_EQUAL(no_patch.dimension(2), 1); + VERIFY_IS_EQUAL(no_patch.dimension(3), 1); + VERIFY_IS_EQUAL(no_patch.dimension(4), 1); + } + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]); + } + + patch_dims[0] = 2; + patch_dims[1] = 3; + patch_dims[2] = 5; + patch_dims[3] = 7; + + if (DataLayout == ColMajor) { + patchTensorRange = {{sizeDim1,sizeDim2,sizeDim3,sizeDim4,1}}; + }else{ + patchTensorRange = {{1,sizeDim1,sizeDim2,sizeDim3,sizeDim4}}; + } + Tensor<DataType, 5, DataLayout,IndexType> single_patch(patchTensorRange); + patchTensorBuffSize =single_patch.size()*sizeof(DataType); + DataType* gpu_data_single_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_single_patch(gpu_data_single_patch, patchTensorRange); + + gpu_single_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(single_patch.data(), gpu_data_single_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(single_patch.dimension(0), 2); + VERIFY_IS_EQUAL(single_patch.dimension(1), 3); + VERIFY_IS_EQUAL(single_patch.dimension(2), 5); + VERIFY_IS_EQUAL(single_patch.dimension(3), 7); + VERIFY_IS_EQUAL(single_patch.dimension(4), 1); + } else { + 
VERIFY_IS_EQUAL(single_patch.dimension(0), 1); + VERIFY_IS_EQUAL(single_patch.dimension(1), 2); + VERIFY_IS_EQUAL(single_patch.dimension(2), 3); + VERIFY_IS_EQUAL(single_patch.dimension(3), 5); + VERIFY_IS_EQUAL(single_patch.dimension(4), 7); + } + + for (int i = 0; i < tensor.size(); ++i) { + VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]); + } + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 2; + patch_dims[3] = 1; + + if (DataLayout == ColMajor) { + patchTensorRange = {{1,2,2,1,2*2*4*7}}; + }else{ + patchTensorRange = {{2*2*4*7, 1, 2,2,1}}; + } + Tensor<DataType, 5, DataLayout,IndexType> twod_patch(patchTensorRange); + patchTensorBuffSize =twod_patch.size()*sizeof(DataType); + DataType* gpu_data_twod_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_twod_patch(gpu_data_twod_patch, patchTensorRange); + + gpu_twod_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(twod_patch.data(), gpu_data_twod_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(twod_patch.dimension(0), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7); + } else { + VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7); + VERIFY_IS_EQUAL(twod_patch.dimension(1), 1); + VERIFY_IS_EQUAL(twod_patch.dimension(2), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(3), 2); + VERIFY_IS_EQUAL(twod_patch.dimension(4), 1); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + for (int k = 0; k < 4; ++k) { + for (int l = 0; l < 7; ++l) { + int patch_loc; + if (DataLayout == ColMajor) { + patch_loc = i + 2 * (j + 2 * (k + 4 * l)); + } else { + patch_loc = l + 7 * (k + 4 * (j + 2 * i)); + } + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 2; ++y) { + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc)); + } else { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0)); + } + } + } + } + } + } + } + + patch_dims[0] = 1; + patch_dims[1] = 2; + patch_dims[2] = 3; + patch_dims[3] = 5; + + if (DataLayout == ColMajor) { + patchTensorRange = {{1,2,3,5,2*2*3*3}}; + }else{ + patchTensorRange = {{2*2*3*3, 1, 2,3,5}}; + } + Tensor<DataType, 5, DataLayout,IndexType> threed_patch(patchTensorRange); + patchTensorBuffSize =threed_patch.size()*sizeof(DataType); + DataType* gpu_data_threed_patch = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize)); + TensorMap<Tensor<DataType, 5, DataLayout,IndexType>> gpu_threed_patch(gpu_data_threed_patch, patchTensorRange); + + gpu_threed_patch.device(sycl_device)=gpu_tensor.extract_patches(patch_dims); + sycl_device.memcpyDeviceToHost(threed_patch.data(), gpu_data_threed_patch, patchTensorBuffSize); + + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(threed_patch.dimension(0), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 5); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3); + } else { + VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3); + VERIFY_IS_EQUAL(threed_patch.dimension(1), 1); + VERIFY_IS_EQUAL(threed_patch.dimension(2), 2); + VERIFY_IS_EQUAL(threed_patch.dimension(3), 3); + VERIFY_IS_EQUAL(threed_patch.dimension(4), 5); + } + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { 
+ for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 3; ++l) { + int patch_loc; + if (DataLayout == ColMajor) { + patch_loc = i + 2 * (j + 2 * (k + 3 * l)); + } else { + patch_loc = l + 3 * (k + 3 * (j + 2 * i)); + } + for (int x = 0; x < 2; ++x) { + for (int y = 0; y < 3; ++y) { + for (int z = 0; z < 5; ++z) { + if (DataLayout == ColMajor) { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc)); + } else { + VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(patch_loc,0,x,y,z)); + } + } + } + } + } + } + } + } + sycl_device.deallocate(gpu_data_tensor); + sycl_device.deallocate(gpu_data_no_patch); + sycl_device.deallocate(gpu_data_single_patch); + sycl_device.deallocate(gpu_data_twod_patch); + sycl_device.deallocate(gpu_data_threed_patch); +} + +template<typename DataType, typename dev_Selector> void sycl_tensor_patch_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_patch_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_simple_patch_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_patch_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_tensor_patch_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_random.cpp b/unsupported/test/cxx11_tensor_random.cpp index 0f3dc5787..b9d4c5584 100644 --- a/unsupported/test/cxx11_tensor_random.cpp +++ b/unsupported/test/cxx11_tensor_random.cpp @@ -11,9 +11,10 @@ #include <Eigen/CXX11/Tensor> +template<typename Scalar> static void test_default() { - Tensor<float, 1> vec(6); + Tensor<Scalar, 1> vec(6); vec.setRandom(); // Fixme: we should check that the generated numbers follow a uniform @@ -23,10 +24,11 @@ static void test_default() } } +template<typename Scalar> static void test_normal() { - Tensor<float, 1> vec(6); - vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>(); + Tensor<Scalar, 1> vec(6); + vec.template setRandom<Eigen::internal::NormalRandomGenerator<Scalar>>(); // Fixme: we should check that the generated numbers follow a gaussian // distribution instead. 
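Aside, not part of the patch: the hunk above adds the `template` keyword to the setRandom call because, once test_normal is itself templated over Scalar, `vec` has a type that depends on a template parameter, and without `.template` the `<` in `setRandom<...>` would parse as a less-than comparison. Below is a minimal host-side sketch of the same call, assuming only that Eigen's unsupported Tensor headers are on the include path; `fill_normal` and the `main` driver are illustrative names, not part of the patch:

    #include <unsupported/Eigen/CXX11/Tensor>

    // In a dependent context the member template must be spelled
    // `.template setRandom<...>`, exactly as the hunk above does.
    template <typename Scalar>
    void fill_normal(Eigen::Tensor<Scalar, 1>& vec) {
      vec.template setRandom<Eigen::internal::NormalRandomGenerator<Scalar>>();
    }

    int main() {
      Eigen::Tensor<float, 1> v(6);
      fill_normal(v);  // v now holds 6 draws from a normal distribution
      return 0;
    }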
@@ -70,9 +72,15 @@ static void test_custom() } } -void test_cxx11_tensor_random() +EIGEN_DECLARE_TEST(cxx11_tensor_random) { - CALL_SUBTEST(test_default()); - CALL_SUBTEST(test_normal()); + CALL_SUBTEST((test_default<float>())); + CALL_SUBTEST((test_normal<float>())); + CALL_SUBTEST((test_default<double>())); + CALL_SUBTEST((test_normal<double>())); + CALL_SUBTEST((test_default<Eigen::half>())); + CALL_SUBTEST((test_normal<Eigen::half>())); + CALL_SUBTEST((test_default<Eigen::bfloat16>())); + CALL_SUBTEST((test_normal<Eigen::bfloat16>())); CALL_SUBTEST(test_custom()); } diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_gpu.cu index b3be199e1..090986ebc 100644 --- a/unsupported/test/cxx11_tensor_random_cuda.cu +++ b/unsupported/test/cxx11_tensor_random_gpu.cu @@ -9,18 +9,16 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <Eigen/CXX11/Tensor> +#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> -void test_cuda_random_uniform() +void test_gpu_random_uniform() { Tensor<float, 2> out(72,97); out.setZero(); @@ -28,24 +26,24 @@ void test_cuda_random_uniform() std::size_t out_bytes = out.size() * sizeof(float); float* d_out; - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); gpu_out.device(gpu_device) = gpu_out.random(); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - // For now we just check thes code doesn't crash. + // For now we just check this code doesn't crash. 
// TODO: come up with a valid test of randomness
}

-void test_cuda_random_normal()
+void test_gpu_random_normal()
{
Tensor<float, 2> out(72,97);
out.setZero();
@@ -53,9 +51,9 @@
std::size_t out_bytes = out.size() * sizeof(float);
float* d_out;
- cudaMalloc((void**)(&d_out), out_bytes);
+ gpuMalloc((void**)(&d_out), out_bytes);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
@@ -63,8 +61,8 @@
Eigen::internal::NormalRandomGenerator<float> gen(true);
gpu_out.device(gpu_device) = gpu_out.random(gen);
- assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
- assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+ assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+ assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
}

static void test_complex()
{
@@ -80,9 +78,9 @@
}

-void test_cxx11_tensor_random_cuda()
+EIGEN_DECLARE_TEST(cxx11_tensor_random_gpu)
{
- CALL_SUBTEST(test_cuda_random_uniform());
- CALL_SUBTEST(test_cuda_random_normal());
+ CALL_SUBTEST(test_gpu_random_uniform());
+ CALL_SUBTEST(test_gpu_random_normal());
CALL_SUBTEST(test_complex());
}
diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp
new file mode 100644
index 000000000..6c83894a3
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_random_sycl.cpp
@@ -0,0 +1,100 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli Codeplay Software Ltd.
+// Ralph Potter Codeplay Software Ltd.
+// Luke Iwanski Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+template <typename DataType, int DataLayout, typename IndexType>
+static void test_sycl_random_uniform(const Eigen::SyclDevice& sycl_device)
+{
+ Tensor<DataType, 2,DataLayout, IndexType> out(72,97);
+ out.setZero();
+
+ std::size_t out_bytes = out.size() * sizeof(DataType);
+
+ IndexType sizeDim0 = 72;
+ IndexType sizeDim1 = 97;
+
+ array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+ DataType* d_out = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+
+ gpu_out.device(sycl_device)=gpu_out.random();
+ sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+ for(IndexType i=1; i<sizeDim0; i++)
+ for(IndexType j=1; j<sizeDim1; j++)
+ {
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
+ }
+
+ // For now we just check this code doesn't crash.
+ // TODO: come up with a valid test of randomness
+ sycl_device.deallocate(d_out);
+}
+
+template <typename DataType, int DataLayout, typename IndexType>
+void test_sycl_random_normal(const Eigen::SyclDevice& sycl_device)
+{
+ Tensor<DataType, 2,DataLayout,IndexType> out(72,97);
+ out.setZero();
+ std::size_t out_bytes = out.size() * sizeof(DataType);
+
+ IndexType sizeDim0 = 72;
+ IndexType sizeDim1 = 97;
+
+ array<IndexType, 2> tensorRange = {{sizeDim0, sizeDim1}};
+
+ DataType* d_out = static_cast<DataType*>(sycl_device.allocate(out_bytes));
+ TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> gpu_out(d_out, tensorRange);
+ Eigen::internal::NormalRandomGenerator<DataType> gen(true);
+ gpu_out.device(sycl_device)=gpu_out.random(gen);
+ sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes);
+ for(IndexType i=1; i<sizeDim0; i++)
+ for(IndexType j=1; j<sizeDim1; j++)
+ {
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i,j-1));
+ VERIFY_IS_NOT_EQUAL(out(i,j), out(i-1,j-1));
+
+ }
+
+ // For now we just check this code doesn't crash.
+ // TODO: come up with a valid test of randomness
+ sycl_device.deallocate(d_out);
+}
+
+template<typename DataType, typename dev_Selector> void sycl_random_test_per_device(dev_Selector s){
+ QueueInterface queueInterface(s);
+ auto sycl_device = Eigen::SyclDevice(&queueInterface);
+ test_sycl_random_uniform<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_random_uniform<DataType, ColMajor, int64_t>(sycl_device);
+ test_sycl_random_normal<DataType, RowMajor, int64_t>(sycl_device);
+ test_sycl_random_normal<DataType, ColMajor, int64_t>(sycl_device);
+
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl)
+{
+ for (const auto& device :Eigen::get_sycl_supported_devices()) {
+ CALL_SUBTEST(sycl_random_test_per_device<float>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+ CALL_SUBTEST(sycl_random_test_per_device<double>(device));
+#endif
+ }
+}
diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp
index 1490ec3da..c46c4c91d 100644
--- a/unsupported/test/cxx11_tensor_reduction.cpp
+++ b/unsupported/test/cxx11_tensor_reduction.cpp
@@ -53,20 +53,22 @@ static void test_trivial_reductions() {
}
}
-template <int DataLayout>
+template <typename Scalar,int DataLayout>
static void test_simple_reductions() {
- Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+ Tensor<Scalar, 4, DataLayout> tensor(2, 3, 5, 7);
tensor.setRandom();
+ // Add a little offset so that the product reductions won't be close to zero.
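+ // (setRandom() can produce factors arbitrarily close to zero; a product of
+ // many such factors collapses toward zero, where the relative comparison in
+ // the checks below becomes meaningless, hence the shift applied next.)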
+ tensor += tensor.constant(Scalar(0.5f)); array<ptrdiff_t, 2> reduction_axis2; reduction_axis2[0] = 1; reduction_axis2[1] = 3; - Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis2); + Tensor<Scalar, 2, DataLayout> result = tensor.sum(reduction_axis2); VERIFY_IS_EQUAL(result.dimension(0), 2); VERIFY_IS_EQUAL(result.dimension(1), 5); for (int i = 0; i < 2; ++i) { for (int j = 0; j < 5; ++j) { - float sum = 0.0f; + Scalar sum = Scalar(0.0f); for (int k = 0; k < 3; ++k) { for (int l = 0; l < 7; ++l) { sum += tensor(i, k, j, l); @@ -77,7 +79,7 @@ static void test_simple_reductions() { } { - Tensor<float, 0, DataLayout> sum1 = tensor.sum(); + Tensor<Scalar, 0, DataLayout> sum1 = tensor.sum(); VERIFY_IS_EQUAL(sum1.rank(), 0); array<ptrdiff_t, 4> reduction_axis4; @@ -85,7 +87,7 @@ static void test_simple_reductions() { reduction_axis4[1] = 1; reduction_axis4[2] = 2; reduction_axis4[3] = 3; - Tensor<float, 0, DataLayout> sum2 = tensor.sum(reduction_axis4); + Tensor<Scalar, 0, DataLayout> sum2 = tensor.sum(reduction_axis4); VERIFY_IS_EQUAL(sum2.rank(), 0); VERIFY_IS_APPROX(sum1(), sum2()); @@ -98,7 +100,7 @@ static void test_simple_reductions() { VERIFY_IS_EQUAL(result.dimension(1), 7); for (int i = 0; i < 3; ++i) { for (int j = 0; j < 7; ++j) { - float prod = 1.0f; + Scalar prod = Scalar(1.0f); for (int k = 0; k < 2; ++k) { for (int l = 0; l < 5; ++l) { prod *= tensor(k, i, l, j); @@ -109,7 +111,7 @@ static void test_simple_reductions() { } { - Tensor<float, 0, DataLayout> prod1 = tensor.prod(); + Tensor<Scalar, 0, DataLayout> prod1 = tensor.prod(); VERIFY_IS_EQUAL(prod1.rank(), 0); array<ptrdiff_t, 4> reduction_axis4; @@ -117,7 +119,7 @@ static void test_simple_reductions() { reduction_axis4[1] = 1; reduction_axis4[2] = 2; reduction_axis4[3] = 3; - Tensor<float, 0, DataLayout> prod2 = tensor.prod(reduction_axis4); + Tensor<Scalar, 0, DataLayout> prod2 = tensor.prod(reduction_axis4); VERIFY_IS_EQUAL(prod2.rank(), 0); VERIFY_IS_APPROX(prod1(), prod2()); @@ -130,7 +132,7 @@ static void test_simple_reductions() { VERIFY_IS_EQUAL(result.dimension(1), 7); for (int i = 0; i < 3; ++i) { for (int j = 0; j < 7; ++j) { - float max_val = std::numeric_limits<float>::lowest(); + Scalar max_val = std::numeric_limits<Scalar>::lowest(); for (int k = 0; k < 2; ++k) { for (int l = 0; l < 5; ++l) { max_val = (std::max)(max_val, tensor(k, i, l, j)); @@ -141,7 +143,7 @@ static void test_simple_reductions() { } { - Tensor<float, 0, DataLayout> max1 = tensor.maximum(); + Tensor<Scalar, 0, DataLayout> max1 = tensor.maximum(); VERIFY_IS_EQUAL(max1.rank(), 0); array<ptrdiff_t, 4> reduction_axis4; @@ -149,7 +151,7 @@ static void test_simple_reductions() { reduction_axis4[1] = 1; reduction_axis4[2] = 2; reduction_axis4[3] = 3; - Tensor<float, 0, DataLayout> max2 = tensor.maximum(reduction_axis4); + Tensor<Scalar, 0, DataLayout> max2 = tensor.maximum(reduction_axis4); VERIFY_IS_EQUAL(max2.rank(), 0); VERIFY_IS_APPROX(max1(), max2()); @@ -162,7 +164,7 @@ static void test_simple_reductions() { VERIFY_IS_EQUAL(result.dimension(1), 7); for (int i = 0; i < 5; ++i) { for (int j = 0; j < 7; ++j) { - float min_val = (std::numeric_limits<float>::max)(); + Scalar min_val = (std::numeric_limits<Scalar>::max)(); for (int k = 0; k < 2; ++k) { for (int l = 0; l < 3; ++l) { min_val = (std::min)(min_val, tensor(k, l, i, j)); @@ -173,7 +175,7 @@ static void test_simple_reductions() { } { - Tensor<float, 0, DataLayout> min1 = tensor.minimum(); + Tensor<Scalar, 0, DataLayout> min1 = tensor.minimum(); VERIFY_IS_EQUAL(min1.rank(), 
0); array<ptrdiff_t, 4> reduction_axis4; @@ -181,7 +183,7 @@ static void test_simple_reductions() { reduction_axis4[1] = 1; reduction_axis4[2] = 2; reduction_axis4[3] = 3; - Tensor<float, 0, DataLayout> min2 = tensor.minimum(reduction_axis4); + Tensor<Scalar, 0, DataLayout> min2 = tensor.minimum(reduction_axis4); VERIFY_IS_EQUAL(min2.rank(), 0); VERIFY_IS_APPROX(min1(), min2()); @@ -194,7 +196,7 @@ static void test_simple_reductions() { VERIFY_IS_EQUAL(result.dimension(1), 7); for (int i = 0; i < 5; ++i) { for (int j = 0; j < 7; ++j) { - float sum = 0.0f; + Scalar sum = Scalar(0.0f); int count = 0; for (int k = 0; k < 2; ++k) { for (int l = 0; l < 3; ++l) { @@ -202,12 +204,12 @@ static void test_simple_reductions() { ++count; } } - VERIFY_IS_APPROX(result(i, j), sum / count); + VERIFY_IS_APPROX(result(i, j), sum / Scalar(count)); } } { - Tensor<float, 0, DataLayout> mean1 = tensor.mean(); + Tensor<Scalar, 0, DataLayout> mean1 = tensor.mean(); VERIFY_IS_EQUAL(mean1.rank(), 0); array<ptrdiff_t, 4> reduction_axis4; @@ -215,7 +217,7 @@ static void test_simple_reductions() { reduction_axis4[1] = 1; reduction_axis4[2] = 2; reduction_axis4[3] = 3; - Tensor<float, 0, DataLayout> mean2 = tensor.mean(reduction_axis4); + Tensor<Scalar, 0, DataLayout> mean2 = tensor.mean(reduction_axis4); VERIFY_IS_EQUAL(mean2.rank(), 0); VERIFY_IS_APPROX(mean1(), mean2()); @@ -225,11 +227,11 @@ static void test_simple_reductions() { Tensor<int, 1> ints(10); std::iota(ints.data(), ints.data() + ints.dimension(0), 0); - TensorFixedSize<bool, Sizes<> > all; - all = ints.all(); - VERIFY(!all()); - all = (ints >= ints.constant(0)).all(); - VERIFY(all()); + TensorFixedSize<bool, Sizes<> > all_; + all_ = ints.all(); + VERIFY(!all_()); + all_ = (ints >= ints.constant(0)).all(); + VERIFY(all_()); TensorFixedSize<bool, Sizes<> > any; any = (ints > ints.constant(10)).any(); @@ -368,7 +370,7 @@ static void test_static_dims() { Tensor<float, 2, DataLayout> out(72, 97); in.setRandom(); -#if !EIGEN_HAS_CONSTEXPR +#if !EIGEN_HAS_CONSTEXPR array<int, 2> reduction_axis; reduction_axis[0] = 1; reduction_axis[1] = 3; @@ -386,7 +388,7 @@ static void test_static_dims() { expected = (std::max)(expected, in(i, k, j, l)); } } - VERIFY_IS_APPROX(out(i, j), expected); + VERIFY_IS_EQUAL(out(i, j), expected); } } } @@ -417,7 +419,7 @@ static void test_innermost_last_dims() { expected = (std::max)(expected, in(l, k, i, j)); } } - VERIFY_IS_APPROX(out(i, j), expected); + VERIFY_IS_EQUAL(out(i, j), expected); } } } @@ -448,7 +450,7 @@ static void test_innermost_first_dims() { expected = (std::max)(expected, in(i, j, k, l)); } } - VERIFY_IS_APPROX(out(i, j), expected); + VERIFY_IS_EQUAL(out(i, j), expected); } } } @@ -479,16 +481,37 @@ static void test_reduce_middle_dims() { expected = (std::max)(expected, in(i, k, l, j)); } } - VERIFY_IS_APPROX(out(i, j), expected); + VERIFY_IS_EQUAL(out(i, j), expected); + } + } +} + +static void test_sum_accuracy() { + Tensor<float, 3> tensor(101, 101, 101); + for (float prescribed_mean : {1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f}) { + tensor.setRandom(); + tensor += tensor.constant(prescribed_mean); + + Tensor<float, 0> sum = tensor.sum(); + double expected_sum = 0.0; + for (int i = 0; i < 101; ++i) { + for (int j = 0; j < 101; ++j) { + for (int k = 0; k < 101; ++k) { + expected_sum += static_cast<double>(tensor(i, j, k)); + } + } } + VERIFY_IS_APPROX(sum(), static_cast<float>(expected_sum)); } } -void test_cxx11_tensor_reduction() { +EIGEN_DECLARE_TEST(cxx11_tensor_reduction) { 
CALL_SUBTEST(test_trivial_reductions<ColMajor>()); CALL_SUBTEST(test_trivial_reductions<RowMajor>()); - CALL_SUBTEST(test_simple_reductions<ColMajor>()); - CALL_SUBTEST(test_simple_reductions<RowMajor>()); + CALL_SUBTEST(( test_simple_reductions<float,ColMajor>() )); + CALL_SUBTEST(( test_simple_reductions<float,RowMajor>() )); + CALL_SUBTEST(( test_simple_reductions<Eigen::half,ColMajor>() )); + CALL_SUBTEST(( test_simple_reductions<Eigen::bfloat16,ColMajor>() )); CALL_SUBTEST(test_reductions_in_expr<ColMajor>()); CALL_SUBTEST(test_reductions_in_expr<RowMajor>()); CALL_SUBTEST(test_full_reductions<ColMajor>()); @@ -505,4 +528,5 @@ void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_innermost_first_dims<RowMajor>()); CALL_SUBTEST(test_reduce_middle_dims<ColMajor>()); CALL_SUBTEST(test_reduce_middle_dims<RowMajor>()); + CALL_SUBTEST(test_sum_accuracy()); } diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_gpu.cu index 6858b43a7..122ac946b 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_gpu.cu @@ -9,12 +9,9 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda + #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> @@ -22,7 +19,7 @@ template<typename Type, int DataLayout> static void test_full_reductions() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); const int num_rows = internal::random<int>(1024, 5*1024); @@ -70,7 +67,7 @@ static void test_first_dim_reductions() { Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); // Create device - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice dev(&stream); // Create data(T) @@ -110,7 +107,7 @@ static void test_last_dim_reductions() { Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); // Create device - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice dev(&stream); // Create data @@ -137,7 +134,7 @@ static void test_last_dim_reductions() { } -void test_cxx11_tensor_reduction_cuda() { +EIGEN_DECLARE_TEST(cxx11_tensor_reduction_gpu) { CALL_SUBTEST_1((test_full_reductions<float, ColMajor>())); CALL_SUBTEST_1((test_full_reductions<double, ColMajor>())); CALL_SUBTEST_2((test_full_reductions<float, RowMajor>())); diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp index a9ef82907..a297716e4 100644 --- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -13,38 +13,168 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL +#define EIGEN_HAS_CONSTEXPR 1 #include "main.h" + #include <unsupported/Eigen/CXX11/Tensor> +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 753; + const IndexType num_cols = 537; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; + array<IndexType, 2> outRange = {{1, 1}}; -static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { + Tensor<DataType, 2, DataLayout, 
IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange); + Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange); - const int num_rows = 452; - const int num_cols = 765; - array<int, 2> tensorRange = {{num_rows, num_cols}}; + in.setRandom(); + auto dim = DSizes<IndexType, 2>(1, 1); + full_redux = in.sum().reshape(dim); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate( + sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize())); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, + outRange); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim); + sycl_device.memcpyDeviceToHost( + full_redux_gpu.data(), gpu_out_data, + (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. + std::cout << "SYCL FULL :" << full_redux_gpu(0, 0) + << ", CPU FULL: " << full_redux(0, 0) << "\n"; + VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0)); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_sum_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); - Tensor<float, 2> in(tensorRange); - Tensor<float, 0> full_redux; - Tensor<float, 0> full_redux_gpu; + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.sum(); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - full_redux = in.sum(); + // Check that the CPU and GPU reductions return the same result. 
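+ // (Approximate rather than exact comparison: the device is free to sum in a
+ // different association order than the sequential CPU loop, so the two
+ // floating-point results may differ in the last few bits.)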
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); - float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float))); - float* gpu_out_data =(float*)sycl_device.allocate(sizeof(float)); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} - TensorMap<Tensor<float, 2> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<float, 0> > out_gpu(gpu_out_data); +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; + + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 0, DataLayout, IndexType> full_redux; + Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; + + in.setRandom(); + + full_redux = in.maximum(); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the max. + // As we don't include this in the reduction the result should not be 2. 
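+ // (The offset map created below skips the first 64 elements, so in(0) lies
+ // outside the reduced range; a result of 2 would therefore mean the offset
+ // view read past its bounds.)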
+ in(0) = static_cast<DataType>(2); + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.maximum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float)); - out_gpu.device(sycl_device) = in_gpu.sum(); - sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(float)); // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); @@ -52,87 +182,833 @@ static void test_full_reductions_sycl(const Eigen::SyclDevice& sycl_device) { sycl_device.deallocate(gpu_out_data); } -static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) { +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; + array<IndexType, 1> argRange = {{num_cols}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + // red_axis[1]=1; + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange); + Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange); + Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange); + Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange); + Tensor<DataType, 0, DataLayout, IndexType> full_redux; + Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; + + in.setRandom(); + in_arg1.setRandom(); + in_arg2.setRandom(); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg1_data = static_cast<DataType*>(sycl_device.allocate( + in_arg1.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg2_data = static_cast<DataType*>(sycl_device.allocate( + in_arg2.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg__gpu_helper_data = static_cast<bool*>(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg_data = static_cast<bool*>(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu( + gpu_in_arg1_data, tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu( + gpu_in_arg2_data, tensorRange); + TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu( + gpu_out_arg_data, argRange); + TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper( 
+ gpu_out_arg__gpu_helper_data, argRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); + + // CPU VERSION + out_arg_cpu = + (in_arg1.argmax(1) == in_arg2.argmax(1)) + .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false)); + full_redux = (out_arg_cpu.template cast<float>()) + .reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + + // GPU VERSION + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_arg1_data, in_arg1.data(), + (in_arg1.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_arg2_data, in_arg2.data(), + (in_arg2.dimensions().TotalSize()) * sizeof(DataType)); + out_Argout_gpu_helper.device(sycl_device) = + (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1)); + out_Argout_gpu.device(sycl_device) = + (out_Argout_gpu_helper) + .select(out_Argout_gpu.constant(true), + out_Argout_gpu.constant(false)); + out_gpu.device(sycl_device) = + (out_Argout_gpu.template cast<float>()) + .reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. + std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux() + << '\n'; + VERIFY_IS_EQUAL(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_in_arg1_data); + sycl_device.deallocate(gpu_in_arg2_data); + sycl_device.deallocate(gpu_out_arg__gpu_helper_data); + sycl_device.deallocate(gpu_out_arg_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_mean_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.mean(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_mean_with_odd_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + // This is a particular case which illustrates a possible problem when the + // number of local threads in a workgroup is even, but is not a power of two. + using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + // 2177 = (17 * 128) + 1 gives rise to 18 local threads. + // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads. + const IndexType n_elems = 8707; + array<IndexType, 1> tensor_range = {{n_elems}}; + + data_tensor in(tensor_range); + DataType full_redux; + DataType full_redux_gpu; + TensorMap<scalar_tensor> red_cpu(&full_redux); + TensorMap<scalar_tensor> red_gpu(&full_redux_gpu); + + const DataType const_val = static_cast<DataType>(0.6391); + in = in.constant(const_val); + + Eigen::IndexList<Eigen::type2index<0>> red_axis; + red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + VERIFY_IS_APPROX(const_val, red_cpu()); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = + in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>()); + sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu, full_redux); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_min_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 876; + const IndexType num_cols = 953; + array<IndexType, 2> tensorRange = {{num_rows, num_cols}}; + + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 0, DataLayout, IndexType> full_redux; + Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu; + + in.setRandom(); + + full_redux = in.minimum(); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.minimum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
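+ // (Unlike the sums, minimum and maximum return one of the input elements
+ // unchanged, so they are insensitive to the accumulation order on the
+ // device.)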
+ VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_full_reductions_min_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the min. + // As we don't include this in the reduction the result should not be -2. + in(0) = static_cast<DataType>(-2); + + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.minimum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast<DataType*>(sycl_device.allocate(sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<scalar_tensor> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.minimum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 145; + IndexType dim_y = 1; + IndexType dim_z = 67; + + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}}; + + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.maximum(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + array<IndexType, 1> reduced_range = {{num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = num_cols; - int dim_x = 145; - int dim_y = 1; - int dim_z = 67; + data_tensor in(tensor_range); + reduced_tensor redux; + reduced_tensor redux_gpu(reduced_range); - array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array<int, 1> red_axis; + in.setRandom(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. + for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast<DataType>(2); + } + + Eigen::array<IndexType, 1> red_axis; red_axis[0] = 0; - array<int, 2> reduced_tensorRange = {{dim_y, dim_z}}; - Tensor<float, 3> in(tensorRange); - Tensor<float, 2> redux(reduced_tensorRange); - Tensor<float, 2> redux_gpu(reduced_tensorRange); + const IndexType offset = 64; + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + redux = in_offset.maximum(red_axis); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(redux(i), in(i)); + } + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>( + sycl_device.allocate(n_reduced * sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, + n_reduced * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_APPROX(redux_gpu(i), redux(i)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>; + using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array<IndexType, 2> tensor_range = {{num_rows, num_cols}}; + array<IndexType, 1> full_reduced_range = {{num_rows}}; + array<IndexType, 1> reduced_range = {{num_rows - 1}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = reduced_range[0]; + + data_tensor in(tensor_range); + reduced_tensor redux(full_reduced_range); + reduced_tensor redux_gpu(reduced_range); in.setRandom(); + redux.setZero(); + array<IndexType, 2> tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. + for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast<DataType>(2); + } + + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 1; + + const IndexType offset = 64; + // Introduce an offset in both the input and the output. + TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range); + TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range); + red_offset = in_offset.maximum(red_axis); + + // Check that the first value hasn't been changed and that the reduced values + // are not equal to the previously set maximum in the input outside the range. + VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0)); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(red_offset(i), in(i)); + } + + DataType* gpu_in_data = + static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>( + sycl_device.allocate((n_reduced + 1) * sizeof(DataType))); + + TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(), + n_reduced * sizeof(DataType)); - redux= in.sum(red_axis); + // Check that the CPU and GPU reductions return the same result. 
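+ // (The copy above starts at out_gpu.data(), i.e. gpu_out_data + 1, so the
+ // deliberate one-element output offset is skipped and redux_gpu(i) lines up
+ // with red_offset(i) on the host.)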
+ for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_APPROX(redux_gpu(i), red_offset(i)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} - float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float))); - float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float))); +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) { + array<IndexType, 2> tensorRange = {{dim_x, dim_y}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + array<IndexType, 1> reduced_tensorRange = {{dim_y}}; - TensorMap<Tensor<float, 3> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<float, 2> > out_gpu(gpu_out_data, reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float)); + in.setRandom(); + redux = in.sum(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float)); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. 
- for(int j=0; j<reduced_tensorRange[0]; j++ ) - for(int k=0; k<reduced_tensorRange[1]; k++ ) - VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); + for (IndexType i = 0; i < redux.size(); i++) { + VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]); + } + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_first_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 145; + IndexType dim_y = 1; + IndexType dim_z = 67; + + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 0; + array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}}; + + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
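+ // (The mean over the first dimension equals the corresponding sum divided
+ // by its extent, dim_x, so it carries the same accumulation-order tolerance
+ // as the sum reductions above.)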
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } -static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) { +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; + IndexType dim_y = 1; + IndexType dim_z = 32; + + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; + red_axis[0] = 2; + array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}}; + + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); - int dim_x = 567; - int dim_y = 1; - int dim_z = 47; + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); - array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}}; - Eigen::array<int, 1> red_axis; + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; + IndexType dim_y = 1; + IndexType dim_z = 32; + + array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array<IndexType, 1> red_axis; red_axis[0] = 2; - array<int, 2> reduced_tensorRange = {{dim_x, dim_y}}; + array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}}; - Tensor<float, 3> in(tensorRange); - Tensor<float, 2> redux(reduced_tensorRange); - Tensor<float, 2> redux_gpu(reduced_tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange); + Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange); in.setRandom(); - redux= in.sum(red_axis); + redux = in.sum(red_axis); - float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float))); - float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float))); + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); - TensorMap<Tensor<float, 3> > in_gpu(gpu_in_data, tensorRange); - TensorMap<Tensor<float, 2> > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu( + gpu_out_data, reduced_tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float)); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float)); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. 
- for(int j=0; j<reduced_tensorRange[0]; j++ ) - for(int k=0; k<reduced_tensorRange[1]; k++ ) - VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k)); + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + // auto red_axis = Sizes<0,1>(0,1); + Eigen::IndexList<Eigen::type2index<1>> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + + redux_fix = in_fix.sum(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) { + VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j)); + } + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_last_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + Eigen::IndexList<Eigen::type2index<1>> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix; + TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + redux_fix = in_fix.mean(red_axis); + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + sycl_device.synchronize(); + // Check that the CPU and GPU reductions return the same result. 
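+ // (With Sizes<64, 32> and a type2index<1> axis every extent is known at
+ // compile time, so the reduced result is a fixed-size rank-1 tensor of
+ // length 64.)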
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
+ VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
+ }
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+// SYCL supports a generic case of reduction where the accumulator is a
+// different type than the input data. This is an example of how to check
+// whether a Tensor contains nan and/or inf in one reduction.
+template <typename InT, typename OutT>
+struct CustomReducer {
+ static const bool PacketAccess = false;
+ static const bool IsStateful = false;
+
+ static constexpr OutT InfBit = 1;
+ static constexpr OutT NanBit = 2;
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x,
+ OutT* accum) const {
+ if (Eigen::numext::isinf(x))
+ *accum |= InfBit;
+ else if (Eigen::numext::isnan(x))
+ *accum |= NanBit;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x,
+ OutT* accum) const {
+ *accum |= x;
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const {
+ return OutT(0);
+ }
+
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const {
+ return accum;
+ }
+};
+
+template <typename DataType, typename AccumType, int DataLayout,
+ typename IndexType>
+static void test_full_reductions_custom_sycl(
+ const Eigen::SyclDevice& sycl_device) {
+ constexpr IndexType InSize = 64;
+ auto tensorRange = Sizes<InSize>(InSize);
+ Eigen::IndexList<Eigen::type2index<0>> dims;
+ auto reduced_tensorRange = Sizes<>();
+ TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix;
+ TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix;
+
+ CustomReducer<DataType, AccumType> reducer;
+
+ in_fix.setRandom();
+
+ size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
+ DataType* gpu_in_data =
+ static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
+ AccumType* gpu_out_data =
+ static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
+
+ TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(
+ gpu_in_data, tensorRange);
+ TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(
+ gpu_out_data, reduced_tensorRange);
+
+ sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
+ out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
+ sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
+ sizeof(AccumType));
+ VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
+
+ sycl_device.deallocate(gpu_in_data);
+ sycl_device.deallocate(gpu_out_data);
+}
+
+template <typename DataType, typename Dev>
+void sycl_reduction_test_full_per_device(const Dev& sycl_device) {
+ test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
+
+ test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
+ test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
+ test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(
+ sycl_device);
+ test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(
+ sycl_device);
+ sycl_device.synchronize();
}
-void test_cxx11_tensor_reduction_sycl() {
- cl::sycl::gpu_selector s;
- Eigen::SyclDevice
sycl_device(s); - CALL_SUBTEST((test_full_reductions_sycl(sycl_device))); - CALL_SUBTEST((test_first_dim_reductions_sycl(sycl_device))); - CALL_SUBTEST((test_last_dim_reductions_sycl(sycl_device))); +template <typename DataType, typename Dev> +void sycl_reduction_full_offset_per_device(const Dev& sycl_device) { + test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>( + sycl_device); + test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>( + sycl_device); + test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>( + sycl_device); + test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>( + sycl_device); + test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + sycl_device.synchronize(); +} + +template <typename DataType, typename Dev> +void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) { + test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device, + 4197, 4097); + test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device, + 4197, 4097); + test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device, + 129, 8); + test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + sycl_device.synchronize(); +} + +template <typename DataType, typename Dev> +void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) { + test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>( + sycl_device); + test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device); + test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device); + sycl_device.synchronize(); +} +EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info<cl::sycl::info::device::name>() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device)); + CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device)); + CALL_SUBTEST_3( + sycl_reduction_test_first_dim_per_device<float>(sycl_device)); + CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device)); + } } diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index c8f105e3d..7dbd0478c 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -235,7 +235,7 @@ static void test_nested_ops_with_ref() } -void test_cxx11_tensor_ref() +EIGEN_DECLARE_TEST(cxx11_tensor_ref) { CALL_SUBTEST(test_simple_lvalue_ref()); CALL_SUBTEST(test_simple_rvalue_ref()); diff --git a/unsupported/test/cxx11_tensor_reverse.cpp 
diff --git a/unsupported/test/cxx11_tensor_ref.cpp b/unsupported/test/cxx11_tensor_ref.cpp index c8f105e3d..7dbd0478c 100644 --- a/unsupported/test/cxx11_tensor_ref.cpp +++ b/unsupported/test/cxx11_tensor_ref.cpp @@ -235,7 +235,7 @@ static void test_nested_ops_with_ref() } -void test_cxx11_tensor_ref() +EIGEN_DECLARE_TEST(cxx11_tensor_ref) { CALL_SUBTEST(test_simple_lvalue_ref()); CALL_SUBTEST(test_simple_rvalue_ref()); diff --git a/unsupported/test/cxx11_tensor_reverse.cpp b/unsupported/test/cxx11_tensor_reverse.cpp index b35b8d29e..5e44ec007 100644 --- a/unsupported/test/cxx11_tensor_reverse.cpp +++ b/unsupported/test/cxx11_tensor_reverse.cpp @@ -179,7 +179,7 @@ static void test_expr_reverse(bool LValue) } -void test_cxx11_tensor_reverse() +EIGEN_DECLARE_TEST(cxx11_tensor_reverse) { CALL_SUBTEST(test_simple_reverse<ColMajor>()); CALL_SUBTEST(test_simple_reverse<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp new file mode 100644 index 000000000..dd30c235d --- /dev/null +++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp @@ -0,0 +1,253 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { + IndexType dim1 = 2; + IndexType dim2 = 3; + IndexType dim3 = 5; + IndexType dim4 = 7; + + array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> reversed_tensor(tensorRange); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = true; + dim_rev[3] = false; + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate( + reversed_tensor.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data, + tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, tensor.data(), + (tensor.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); + // Check that the GPU reverse produces the expected element mapping.
+ for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i, j, k, l), + reversed_tensor(i, 2 - j, 4 - k, l)); + } + } + } + } + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = false; + + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i, j, k, l), reversed_tensor(1 - i, j, k, l)); + } + } + } + } + + dim_rev[0] = true; + dim_rev[1] = false; + dim_rev[2] = false; + dim_rev[3] = true; + out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i, j, k, l), + reversed_tensor(1 - i, j, k, 6 - l)); + } + } + } + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, + bool LValue) { + IndexType dim1 = 2; + IndexType dim2 = 3; + IndexType dim3 = 5; + IndexType dim4 = 7; + + array<IndexType, 4> tensorRange = {{dim1, dim2, dim3, dim4}}; + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> expected(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> result(tensorRange); + tensor.setRandom(); + + array<bool, 4> dim_rev; + dim_rev[0] = false; + dim_rev[1] = true; + dim_rev[2] = false; + dim_rev[3] = true; + + DataType* gpu_in_data = static_cast<DataType*>( + sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data_expected = static_cast<DataType*>(sycl_device.allocate( + expected.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data_result = static_cast<DataType*>( + sycl_device.allocate(result.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, + tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected( + gpu_out_data_expected, tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result( + gpu_out_data_result, tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, tensor.data(), + (tensor.dimensions().TotalSize()) * sizeof(DataType)); + + if (LValue) { + out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu; + } else { + out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev); + } + sycl_device.memcpyDeviceToHost( + expected.data(), gpu_out_data_expected, + expected.dimensions().TotalSize() * sizeof(DataType)); + + array<IndexType, 4> src_slice_dim; + src_slice_dim[0] = 2; + src_slice_dim[1] = 3; + src_slice_dim[2] = 1; + src_slice_dim[3] = 7; + array<IndexType, 4> src_slice_start; + src_slice_start[0] = 0; + src_slice_start[1] = 0; + src_slice_start[2] = 0; + src_slice_start[3] = 0; + array<IndexType, 4> dst_slice_dim = src_slice_dim; + array<IndexType, 4> dst_slice_start = 
src_slice_start; + + for (IndexType i = 0; i < 5; ++i) { + if (LValue) { + out_gpu_result.slice(dst_slice_start, dst_slice_dim) + .reverse(dim_rev) + .device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim); + } else { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = + in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev); + } + src_slice_start[2] += 1; + dst_slice_start[2] += 1; + } + sycl_device.memcpyDeviceToHost( + result.data(), gpu_out_data_result, + result.dimensions().TotalSize() * sizeof(DataType)); + + for (IndexType i = 0; i < expected.dimension(0); ++i) { + for (IndexType j = 0; j < expected.dimension(1); ++j) { + for (IndexType k = 0; k < expected.dimension(2); ++k) { + for (IndexType l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l)); + } + } + } + } + + dst_slice_start[2] = 0; + result.setRandom(); + sycl_device.memcpyHostToDevice( + gpu_out_data_result, result.data(), + (result.dimensions().TotalSize()) * sizeof(DataType)); + for (IndexType i = 0; i < 5; ++i) { + if (LValue) { + out_gpu_result.slice(dst_slice_start, dst_slice_dim) + .reverse(dim_rev) + .device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim); + } else { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = + in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + } + dst_slice_start[2] += 1; + } + sycl_device.memcpyDeviceToHost( + result.data(), gpu_out_data_result, + result.dimensions().TotalSize() * sizeof(DataType)); + + for (IndexType i = 0; i < expected.dimension(0); ++i) { + for (IndexType j = 0; j < expected.dimension(1); ++j) { + for (IndexType k = 0; k < expected.dimension(2); ++k) { + for (IndexType l = 0; l < expected.dimension(3); ++l) { + VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l)); + } + } + } + } +} + +template <typename DataType> +void sycl_reverse_test_per_device(const cl::sycl::device& d) { + QueueInterface queueInterface(d); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device); + test_simple_reverse<DataType, ColMajor, int64_t>(sycl_device); + test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, false); + test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, false); + test_expr_reverse<DataType, RowMajor, int64_t>(sycl_device, true); + test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true); +} +EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.get_info<cl::sycl::info::device::name>() << std::endl; + CALL_SUBTEST_1(sycl_reverse_test_per_device<short>(device)); + CALL_SUBTEST_2(sycl_reverse_test_per_device<int>(device)); + CALL_SUBTEST_3(sycl_reverse_test_per_device<unsigned int>(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device)); +#endif + CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_roundings.cpp b/unsupported/test/cxx11_tensor_roundings.cpp index 2c26151ab..83b592384 100644 --- a/unsupported/test/cxx11_tensor_roundings.cpp +++ b/unsupported/test/cxx11_tensor_roundings.cpp @@ -54,7 +54,7 @@ static void test_float_ceiling() } } -void test_cxx11_tensor_roundings() +EIGEN_DECLARE_TEST(cxx11_tensor_roundings) { CALL_SUBTEST(test_float_rounding()); CALL_SUBTEST(test_float_ceiling()); diff --git 
a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp index af59aa3ef..dccee9e84 100644 --- a/unsupported/test/cxx11_tensor_scan.cpp +++ b/unsupported/test/cxx11_tensor_scan.cpp @@ -98,7 +98,7 @@ static void test_tensor_maps() { } } -void test_cxx11_tensor_scan() { +EIGEN_DECLARE_TEST(cxx11_tensor_scan) { CALL_SUBTEST((test_1d_scan<ColMajor, float, true>())); CALL_SUBTEST((test_1d_scan<ColMajor, float, false>())); CALL_SUBTEST((test_1d_scan<RowMajor, float, true>())); diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_gpu.cu index 5f146f3c9..770a144f1 100644 --- a/unsupported/test/cxx11_tensor_scan_cuda.cu +++ b/unsupported/test/cxx11_tensor_scan_gpu.cu @@ -9,21 +9,20 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda + #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500 -#include <cuda_fp16.h> -#endif #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> + using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; template<int DataLayout> -void test_cuda_cumsum(int m_size, int k_size, int n_size) +void test_gpu_cumsum(int m_size, int k_size, int n_size) { std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size); @@ -38,12 +37,12 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size) float* d_t_input; float* d_t_result; - cudaMalloc((void**)(&d_t_input), t_input_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_input), t_input_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_input, t_input.data(), t_input_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > @@ -54,7 +53,7 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size) gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1); t_result = t_input.cumsum(1); - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); for (DenseIndex i = 0; i < t_result.size(); i++) { if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { continue; @@ -67,13 +66,13 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size) assert(false); } - cudaFree((void*)d_t_input); - cudaFree((void*)d_t_result); + gpuFree((void*)d_t_input); + gpuFree((void*)d_t_result); } -void test_cxx11_tensor_scan_cuda() +EIGEN_DECLARE_TEST(cxx11_tensor_scan_gpu) { - CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128)); - CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128)); + CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128)); } diff --git a/unsupported/test/cxx11_tensor_scan_sycl.cpp b/unsupported/test/cxx11_tensor_scan_sycl.cpp new file mode 100644 index 000000000..09c45fce5 --- /dev/null +++ b/unsupported/test/cxx11_tensor_scan_sycl.cpp @@ -0,0 +1,141 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. 
+// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +typedef Tensor<float, 1>::DimensionPair DimPair; + +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_cumsum(const Eigen::SyclDevice& sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size, int consume_dim, + bool exclusive) { + static const DataType error_threshold = 1e-4f; + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size + << " consume_dim : " << consume_dim << ")" << std::endl; + Tensor<DataType, 3, DataLayout, IndexType> t_input(m_size, k_size, n_size); + Tensor<DataType, 3, DataLayout, IndexType> t_result(m_size, k_size, n_size); + Tensor<DataType, 3, DataLayout, IndexType> t_result_gpu(m_size, k_size, + n_size); + + t_input.setRandom(); + std::size_t t_input_bytes = t_input.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType* gpu_data_in = + static_cast<DataType*>(sycl_device.allocate(t_input_bytes)); + DataType* gpu_data_out = + static_cast<DataType*>(sycl_device.allocate(t_result_bytes)); + + array<IndexType, 3> tensorRange = {{m_size, k_size, n_size}}; + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_input( + gpu_data_in, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_t_result( + gpu_data_out, tensorRange); + sycl_device.memcpyHostToDevice(gpu_data_in, t_input.data(), t_input_bytes); + sycl_device.memcpyHostToDevice(gpu_data_out, t_input.data(), t_input_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_input.cumsum(consume_dim, exclusive); + + t_result = t_input.cumsum(consume_dim, exclusive); + + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), gpu_data_out, + t_result_bytes); + sycl_device.synchronize(); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast<DataType>(std::fabs(static_cast<DataType>( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "mismatch detected at index " << i << " CPU : " << t_result(i) + << " vs SYCL : " << t_result_gpu(i) << std::endl; + assert(false); + } + sycl_device.deallocate(gpu_data_in); + sycl_device.deallocate(gpu_data_out); +} + +template <typename DataType, typename Dev> +void sycl_scan_test_exclusive_dim0_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, + true); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, + true); +} +template <typename DataType, typename Dev> +void sycl_scan_test_exclusive_dim1_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, + true); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, + true); +} +template <typename DataType, typename Dev> +void sycl_scan_test_exclusive_dim2_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, 
int64_t>(sycl_device, 1023, 127, 2049, 2, + true); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2, + true); +} +template <typename DataType, typename Dev> +void sycl_scan_test_inclusive_dim0_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, + false); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 2049, 1023, 127, 0, + false); +} +template <typename DataType, typename Dev> +void sycl_scan_test_inclusive_dim1_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, + false); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 2049, 127, 1, + false); +} +template <typename DataType, typename Dev> +void sycl_scan_test_inclusive_dim2_per_device(const Dev& sycl_device) { + test_sycl_cumsum<DataType, ColMajor, int64_t>(sycl_device, 1023, 127, 2049, 2, + false); + test_sycl_cumsum<DataType, RowMajor, int64_t>(sycl_device, 1023, 127, 2049, 2, + false); +} +EIGEN_DECLARE_TEST(cxx11_tensor_scan_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info<cl::sycl::info::device::name>() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1( + sycl_scan_test_exclusive_dim0_per_device<float>(sycl_device)); + CALL_SUBTEST_2( + sycl_scan_test_exclusive_dim1_per_device<float>(sycl_device)); + CALL_SUBTEST_3( + sycl_scan_test_exclusive_dim2_per_device<float>(sycl_device)); + CALL_SUBTEST_4( + sycl_scan_test_inclusive_dim0_per_device<float>(sycl_device)); + CALL_SUBTEST_5( + sycl_scan_test_inclusive_dim1_per_device<float>(sycl_device)); + CALL_SUBTEST_6( + sycl_scan_test_inclusive_dim2_per_device<float>(sycl_device)); + } +} diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index d11444a14..89a64c021 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -81,12 +81,12 @@ static void test_expr_shuffling() Tensor<float, 4, DataLayout> expected; expected = tensor.shuffle(shuffles); - Tensor<float, 4, DataLayout> result(5,7,3,2); + Tensor<float, 4, DataLayout> result(5, 7, 3, 2); - array<int, 4> src_slice_dim{{2,3,1,7}}; - array<int, 4> src_slice_start{{0,0,0,0}}; - array<int, 4> dst_slice_dim{{1,7,3,2}}; - array<int, 4> dst_slice_start{{0,0,0,0}}; + array<ptrdiff_t, 4> src_slice_dim{{2, 3, 1, 7}}; + array<ptrdiff_t, 4> src_slice_start{{0, 0, 0, 0}}; + array<ptrdiff_t, 4> dst_slice_dim{{1, 7, 3, 2}}; + array<ptrdiff_t, 4> dst_slice_start{{0, 0, 0, 0}}; for (int i = 0; i < 5; ++i) { result.slice(dst_slice_start, dst_slice_dim) = @@ -215,7 +215,60 @@ static void test_shuffle_unshuffle() } -void test_cxx11_tensor_shuffling() +template <int DataLayout> +static void test_empty_shuffling() +{ + Tensor<float, 4, DataLayout> tensor(2,3,0,7); + tensor.setRandom(); + array<ptrdiff_t, 4> shuffles; + shuffles[0] = 0; + shuffles[1] = 1; + shuffles[2] = 2; + shuffles[3] = 3; + + Tensor<float, 4, DataLayout> no_shuffle; + no_shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2); + VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3); + VERIFY_IS_EQUAL(no_shuffle.dimension(2), 0); + VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 0; ++k) { + for (int l = 0; l < 7; ++l) { + 
VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); + } + } + } + } + + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor<float, 4, DataLayout> shuffle; + shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(shuffle.dimension(0), 0); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 0; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } +} + + +EIGEN_DECLARE_TEST(cxx11_tensor_shuffling) { CALL_SUBTEST(test_simple_shuffling<ColMajor>()); CALL_SUBTEST(test_simple_shuffling<RowMajor>()); @@ -225,4 +278,6 @@ void test_cxx11_tensor_shuffling() CALL_SUBTEST(test_shuffling_as_value<RowMajor>()); CALL_SUBTEST(test_shuffle_unshuffle<ColMajor>()); CALL_SUBTEST(test_shuffle_unshuffle<RowMajor>()); + CALL_SUBTEST(test_empty_shuffling<ColMajor>()); + CALL_SUBTEST(test_empty_shuffling<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp new file mode 100644 index 000000000..ca4e8b5ef --- /dev/null +++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp @@ -0,0 +1,117 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
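Since the shuffling tests in this new file, like the empty-shuffle test above, verify results with hand-written index arithmetic, it may help to restate the semantics of Tensor::shuffle: output dimension i takes its extent from input dimension shuffles[i], and element indices permute the same way. A minimal host-side sketch (a 3-D analogue of the 4-D cases; not part of this patch):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> t(2, 3, 5);
  t.setRandom();

  // Output dim i gets its extent from input dim perm[i].
  Eigen::array<ptrdiff_t, 3> perm = {{2, 0, 1}};
  Eigen::Tensor<float, 3> s = t.shuffle(perm);  // s is 5 x 2 x 3

  // Element mapping: s(k, i, j) == t(i, j, k), the same rule behind the
  // shuffle(k, l, j, i) checks in the 4-D tests.
  return (s(4, 1, 2) == t(1, 2, 4)) ? 0 : 1;
}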
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) { + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + array<IndexType, 4> tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensorRange); + Tensor<DataType, 4, DataLayout, IndexType> no_shuffle(tensorRange); + tensor.setRandom(); + + const size_t buffSize = tensor.size() * sizeof(DataType); + array<IndexType, 4> shuffles; + shuffles[0] = 0; + shuffles[1] = 1; + shuffles[2] = 2; + shuffles[3] = 3; + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(buffSize)); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(buffSize)); + + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu1(gpu_data1, + tensorRange); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu2(gpu_data2, + tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize); + + gpu2.device(sycl_device) = gpu1.shuffle(shuffles); + sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize); + sycl_device.synchronize(); + + VERIFY_IS_EQUAL(no_shuffle.dimension(0), sizeDim1); + VERIFY_IS_EQUAL(no_shuffle.dimension(1), sizeDim2); + VERIFY_IS_EQUAL(no_shuffle.dimension(2), sizeDim3); + VERIFY_IS_EQUAL(no_shuffle.dimension(3), sizeDim4); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(tensor(i, j, k, l), no_shuffle(i, j, k, l)); + } + } + } + } + + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + array<IndexType, 4> tensorrangeShuffle = { + {sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; + Tensor<DataType, 4, DataLayout, IndexType> shuffle(tensorrangeShuffle); + DataType* gpu_data3 = static_cast<DataType*>(sycl_device.allocate(buffSize)); + TensorMap<Tensor<DataType, 4, DataLayout, IndexType>> gpu3( + gpu_data3, tensorrangeShuffle); + + gpu3.device(sycl_device) = gpu1.shuffle(shuffles); + sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize); + sycl_device.synchronize(); + + VERIFY_IS_EQUAL(shuffle.dimension(0), sizeDim3); + VERIFY_IS_EQUAL(shuffle.dimension(1), sizeDim4); + VERIFY_IS_EQUAL(shuffle.dimension(2), sizeDim2); + VERIFY_IS_EQUAL(shuffle.dimension(3), sizeDim1); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { + for (IndexType l = 0; l < sizeDim4; ++l) { + VERIFY_IS_EQUAL(tensor(i, j, k, l), shuffle(k, l, j, i)); + } + } + } + } +} + +template <typename DataType, typename dev_Selector> +void sycl_shuffling_test_per_device(dev_Selector s) { + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_simple_shuffling_sycl<DataType, RowMajor, int64_t>(sycl_device); + test_simple_shuffling_sycl<DataType, ColMajor, int64_t>(sycl_device); +} +EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + 
CALL_SUBTEST(sycl_shuffling_test_per_device<float>(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_simple.cpp b/unsupported/test/cxx11_tensor_simple.cpp index 5a0d339ef..6d70f5435 100644 --- a/unsupported/test/cxx11_tensor_simple.cpp +++ b/unsupported/test/cxx11_tensor_simple.cpp @@ -316,7 +316,7 @@ static void test_resize() VERIFY_IS_EQUAL(epsilon.size(), 3*5*7); } -void test_cxx11_tensor_simple() +EIGEN_DECLARE_TEST(cxx11_tensor_simple) { CALL_SUBTEST(test_0d()); CALL_SUBTEST(test_1d()); diff --git a/unsupported/test/cxx11_tensor_striding.cpp b/unsupported/test/cxx11_tensor_striding.cpp index 935b908cc..aefdfa9b4 100644 --- a/unsupported/test/cxx11_tensor_striding.cpp +++ b/unsupported/test/cxx11_tensor_striding.cpp @@ -110,7 +110,7 @@ static void test_striding_as_lvalue() } -void test_cxx11_tensor_striding() +EIGEN_DECLARE_TEST(cxx11_tensor_striding) { CALL_SUBTEST(test_simple_striding<ColMajor>()); CALL_SUBTEST(test_simple_striding<RowMajor>()); diff --git a/unsupported/test/cxx11_tensor_striding_sycl.cpp b/unsupported/test/cxx11_tensor_striding_sycl.cpp new file mode 100644 index 000000000..d3b1fa77c --- /dev/null +++ b/unsupported/test/cxx11_tensor_striding_sycl.cpp @@ -0,0 +1,203 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: <eigen@codeplay.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include <iostream> +#include <chrono> +#include <ctime> + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + + +template <typename DataType, int DataLayout, typename IndexType> +static void test_simple_striding(const Eigen::SyclDevice& sycl_device) +{ + + Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}}; + Eigen::array<IndexType, 4> stride_dims = {{1,1,3,3}}; + + + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims); + Tensor<DataType, 4, DataLayout,IndexType> no_stride(tensor_dims); + Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims); + + + std::size_t tensor_bytes = tensor.size() * sizeof(DataType); + std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType); + std::size_t stride_bytes = stride.size() * sizeof(DataType); + DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes)); + DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes)); + DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, tensor_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims); + + + tensor.setRandom(); + array<IndexType, 4> strides; + strides[0] = 1; + strides[1] = 1; + strides[2] = 1; + strides[3] = 1; + sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes); + 
gpu_no_stride.device(sycl_device)=gpu_tensor.stride(strides); + sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes); + + //no_stride = tensor.stride(strides); + + VERIFY_IS_EQUAL(no_stride.dimension(0), 2); + VERIFY_IS_EQUAL(no_stride.dimension(1), 3); + VERIFY_IS_EQUAL(no_stride.dimension(2), 5); + VERIFY_IS_EQUAL(no_stride.dimension(3), 7); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l)); + } + } + } + } + + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; +//Tensor<float, 4, DataLayout> stride; +// stride = tensor.stride(strides); + + gpu_stride.device(sycl_device)=gpu_tensor.stride(strides); + sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); + + VERIFY_IS_EQUAL(stride.dimension(0), 1); + VERIFY_IS_EQUAL(stride.dimension(1), 1); + VERIFY_IS_EQUAL(stride.dimension(2), 3); + VERIFY_IS_EQUAL(stride.dimension(3), 3); + + for (IndexType i = 0; i < 1; ++i) { + for (IndexType j = 0; j < 1; ++j) { + for (IndexType k = 0; k < 3; ++k) { + for (IndexType l = 0; l < 3; ++l) { + VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l)); + } + } + } + } + + sycl_device.deallocate(d_tensor); + sycl_device.deallocate(d_no_stride); + sycl_device.deallocate(d_stride); +} + +template <typename DataType, int DataLayout, typename IndexType> +static void test_striding_as_lvalue(const Eigen::SyclDevice& sycl_device) +{ + + Eigen::array<IndexType, 4> tensor_dims = {{2,3,5,7}}; + Eigen::array<IndexType, 4> stride_dims = {{3,12,10,21}}; + + + Tensor<DataType, 4, DataLayout, IndexType> tensor(tensor_dims); + Tensor<DataType, 4, DataLayout,IndexType> no_stride(stride_dims); + Tensor<DataType, 4, DataLayout,IndexType> stride(stride_dims); + + + std::size_t tensor_bytes = tensor.size() * sizeof(DataType); + std::size_t no_stride_bytes = no_stride.size() * sizeof(DataType); + std::size_t stride_bytes = stride.size() * sizeof(DataType); + + DataType * d_tensor = static_cast<DataType*>(sycl_device.allocate(tensor_bytes)); + DataType * d_no_stride = static_cast<DataType*>(sycl_device.allocate(no_stride_bytes)); + DataType * d_stride = static_cast<DataType*>(sycl_device.allocate(stride_bytes)); + + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_tensor(d_tensor, tensor_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_no_stride(d_no_stride, stride_dims); + Eigen::TensorMap<Eigen::Tensor<DataType, 4, DataLayout, IndexType> > gpu_stride(d_stride, stride_dims); + + //Tensor<float, 4, DataLayout> tensor(2,3,5,7); + tensor.setRandom(); + array<IndexType, 4> strides; + strides[0] = 2; + strides[1] = 4; + strides[2] = 2; + strides[3] = 3; + +// Tensor<float, 4, DataLayout> result(3, 12, 10, 21); +// result.stride(strides) = tensor; + sycl_device.memcpyHostToDevice(d_tensor, tensor.data(), tensor_bytes); + gpu_stride.stride(strides).device(sycl_device)=gpu_tensor; + sycl_device.memcpyDeviceToHost(stride.data(), d_stride, stride_bytes); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), stride(2*i,4*j,2*k,3*l)); + } + } + } + } + + array<IndexType, 4> no_strides; + no_strides[0] = 1; + no_strides[1] = 1; + no_strides[2] = 1; + no_strides[3] = 1; +// Tensor<float, 4, DataLayout> result2(3, 12, 10, 21); +// 
result2.stride(strides) = tensor.stride(no_strides); + + gpu_no_stride.stride(strides).device(sycl_device)=gpu_tensor.stride(no_strides); + sycl_device.memcpyDeviceToHost(no_stride.data(), d_no_stride, no_stride_bytes); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 3; ++j) { + for (IndexType k = 0; k < 5; ++k) { + for (IndexType l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(2*i,4*j,2*k,3*l)); + } + } + } + } + sycl_device.deallocate(d_tensor); + sycl_device.deallocate(d_no_stride); + sycl_device.deallocate(d_stride); +} + + +template <typename Dev_selector> void tensorStridingPerDevice(Dev_selector& s){ + QueueInterface queueInterface(s); + auto sycl_device=Eigen::SyclDevice(&queueInterface); + test_simple_striding<float, ColMajor, int64_t>(sycl_device); + test_simple_striding<float, RowMajor, int64_t>(sycl_device); + test_striding_as_lvalue<float, ColMajor, int64_t>(sycl_device); + test_striding_as_lvalue<float, RowMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_striding_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(tensorStridingPerDevice(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp index 2f56eb495..2ca5c47db 100644 --- a/unsupported/test/cxx11_tensor_sugar.cpp +++ b/unsupported/test/cxx11_tensor_sugar.cpp @@ -73,7 +73,7 @@ static void test_scalar_sugar_sub_div() { } } -void test_cxx11_tensor_sugar() +EIGEN_DECLARE_TEST(cxx11_tensor_sugar) { CALL_SUBTEST(test_comparison_sugar()); CALL_SUBTEST(test_scalar_sugar_add_mul()); diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp index 6a9c33422..e6c5e2378 100644 --- a/unsupported/test/cxx11_tensor_sycl.cpp +++ b/unsupported/test/cxx11_tensor_sycl.cpp @@ -15,8 +15,8 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_sycl -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int + +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL #include "main.h" @@ -27,36 +27,188 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { + IndexType sizeDim1 = 5; + IndexType sizeDim2 = 5; + IndexType sizeDim3 = 1; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange); + Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange); + + in1 = in1.random(); + + DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in1.data(),(in1.size())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data2, in1.data(),(in1.size())*sizeof(DataType)); + gpu1.device(sycl_device) = gpu1 * 3.14f; + gpu2.device(sycl_device) = gpu2 * 2.7f; + sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1,(out1.size())*sizeof(DataType)); + 
sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1,(out2.size())*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2,(out3.size())*sizeof(DataType)); + sycl_device.synchronize(); + + for (IndexType i = 0; i < in1.size(); ++i) { + // std::cout << "SYCL DATA : " << out1(i) << " vs CPU DATA : " << in1(i) * 3.14f << "\n"; + VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f); + VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f); + VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f); + } + + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); +} + +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { + IndexType size = 20; + array<IndexType, 1> tensorRange = {{size}}; + Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange); + Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange); + + in1 = in1.random(); + in2 = in1; + + DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); + + TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange); + sycl_device.memcpyHostToDevice(gpu_data, in1.data(),(in1.size())*sizeof(DataType)); + sycl_device.synchronize(); + in1.setZero(); + + sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size()*sizeof(DataType)); + sycl_device.synchronize(); + + for (IndexType i = 0; i < in1.size(); ++i) { + VERIFY_IS_APPROX(out(i), in2(i)); + } + + sycl_device.deallocate(gpu_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) { + using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>; + IndexType full_size = 32; + IndexType half_size = full_size / 2; + array<IndexType, 1> tensorRange = {{full_size}}; + tensor_type in1(tensorRange); + tensor_type out(tensorRange); + + DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap<tensor_type> gpu1(gpu_data, tensorRange); + + in1 = in1.random(); + // Copy all data to device, then permute on copy back to host + sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + in1 = in1.random(); + out.setZero(); + // Permute copies to device, then copy all back to host + sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + in1 = in1.random(); + out.setZero(); + DataType* gpu_data_out = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap<tensor_type> gpu2(gpu_data_out, tensorRange); + // Copy all to device, permute copies on device, then copy all back to host + sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType)); + sycl_device.memcpy(gpu_data_out + 
half_size, gpu_data, half_size * sizeof(DataType)); + sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + sycl_device.deallocate(gpu_data_out); + sycl_device.deallocate(gpu_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) { + using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>; + IndexType full_size = 32; + IndexType half_size = full_size / 2; + array<IndexType, 1> tensorRange = {{full_size}}; + tensor_type cpu_out(tensorRange); + tensor_type out(tensorRange); + + cpu_out.setZero(); + + std::memset(cpu_out.data(), 0, half_size * sizeof(DataType)); + std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType)); + + DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap<tensor_type> gpu1(gpu_data, tensorRange); + + sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType)); + sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < full_size; ++i) { + VERIFY_IS_APPROX(out(i), cpu_out(i)); + } + + sycl_device.deallocate(gpu_data); +} + +template <typename DataType, int DataLayout, typename IndexType> +void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { - int sizeDim1 = 100; - int sizeDim2 = 100; - int sizeDim3 = 100; - array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - Tensor<float, 3> in1(tensorRange); - Tensor<float, 3> in2(tensorRange); - Tensor<float, 3> in3(tensorRange); - Tensor<float, 3> out(tensorRange); + IndexType sizeDim1 = 100; + IndexType sizeDim2 = 10; + IndexType sizeDim3 = 20; + array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor<DataType, 3,DataLayout, IndexType> in1(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> in2(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> in3(tensorRange); + Tensor<DataType, 3,DataLayout, IndexType> out(tensorRange); in2 = in2.random(); in3 = in3.random(); - float * gpu_in1_data = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float))); - float * gpu_in2_data = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float))); - float * gpu_in3_data = static_cast<float*>(sycl_device.allocate(in3.dimensions().TotalSize()*sizeof(float))); - float * gpu_out_data = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float))); + DataType * gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.size()*sizeof(DataType))); + DataType * gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.size()*sizeof(DataType))); + DataType * gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size()*sizeof(DataType))); + DataType * gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size()*sizeof(DataType))); - TensorMap<Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange); - TensorMap<Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange); - TensorMap<Tensor<float, 3>> gpu_in3(gpu_in3_data, tensorRange); - TensorMap<Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange); + TensorMap<Tensor<DataType, 3, 
DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange); + TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); /// a=1.2f gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f); - sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.size())*sizeof(DataType)); + sycl_device.synchronize(); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(in1(i,j,k), 1.2f); } } @@ -65,10 +217,12 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { /// a=b*1.2f gpu_out.device(sycl_device) = gpu_in1 * 1.2f; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 1.2f); } @@ -77,12 +231,14 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { printf("a=b*1.2f Test Passed\n"); /// c=a*b - sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(float)); + sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.size())*sizeof(DataType)); gpu_out.device(sycl_device) = gpu_in1 * gpu_in2; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in2(i,j,k)); @@ -93,10 +249,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { /// c=a+b gpu_out.device(sycl_device) = gpu_in1 + gpu_in2; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k)); @@ -107,10 +264,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { /// c=a*a gpu_out.device(sycl_device) = gpu_in1 * gpu_in1; - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + 
sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * in1(i,j,k)); @@ -121,10 +279,11 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { //a*3.14f + b*2.7f gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f); - sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) * 3.14f + in2(i,j,k) * 2.7f); @@ -134,12 +293,13 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { printf("a*3.14f + b*2.7f Test Passed\n"); ///d= (a>0.5? b:c) - sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.dimensions().TotalSize())*sizeof(float)); + sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.size())*sizeof(DataType)); gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3); - sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float)); - for (int i = 0; i < sizeDim1; ++i) { - for (int j = 0; j < sizeDim2; ++j) { - for (int k = 0; k < sizeDim3; ++k) { + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.size())*sizeof(DataType)); + sycl_device.synchronize(); + for (IndexType i = 0; i < sizeDim1; ++i) { + for (IndexType j = 0; j < sizeDim2; ++j) { + for (IndexType k = 0; k < sizeDim3; ++k) { VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f) ? in2(i, j, k) : in3(i, j, k)); @@ -152,8 +312,50 @@ void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_in3_data); sycl_device.deallocate(gpu_out_data); } -void test_cxx11_tensor_sycl() { - cl::sycl::gpu_selector s; - Eigen::SyclDevice sycl_device(s); - CALL_SUBTEST(test_sycl_cpu(sycl_device)); +template<typename Scalar1, typename Scalar2, int DataLayout, typename IndexType> +static void test_sycl_cast(const Eigen::SyclDevice& sycl_device){ + IndexType size = 20; + array<IndexType, 1> tensorRange = {{size}}; + Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange); + Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange); + Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange); + + in = in.random(); + + Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size()*sizeof(Scalar1))); + Scalar2 * gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size()*sizeof(Scalar2))); + + TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange); + TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.size())*sizeof(Scalar1)); + gpu_out.device(sycl_device) = gpu_in. template cast<Scalar2>(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size()*sizeof(Scalar2)); + out_host = in. 
template cast<Scalar2>(); + for(IndexType i=0; i< size; i++) + { + VERIFY_IS_APPROX(out(i), out_host(i)); + } + printf("cast Test Passed\n"); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template<typename DataType, typename dev_Selector> void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_mem_sync_offsets<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_memset_offsets<DataType, RowMajor, int64_t>(sycl_device); + test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device); + test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device); + test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device<float>(device)); + } } diff --git a/unsupported/test/cxx11_tensor_symmetry.cpp b/unsupported/test/cxx11_tensor_symmetry.cpp index d680e9b3b..fed269a9a 100644 --- a/unsupported/test/cxx11_tensor_symmetry.cpp +++ b/unsupported/test/cxx11_tensor_symmetry.cpp @@ -801,7 +801,7 @@ static void test_tensor_randacc() } } -void test_cxx11_tensor_symmetry() +EIGEN_DECLARE_TEST(cxx11_tensor_symmetry) { CALL_SUBTEST(test_symgroups_static()); CALL_SUBTEST(test_symgroups_dynamic()); diff --git a/unsupported/test/cxx11_tensor_thread_local.cpp b/unsupported/test/cxx11_tensor_thread_local.cpp new file mode 100644 index 000000000..7e866f6d1 --- /dev/null +++ b/unsupported/test/cxx11_tensor_thread_local.cpp @@ -0,0 +1,149 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_USE_THREADS + +#include <iostream> +#include <unordered_set> + +#include "main.h" +#include <Eigen/CXX11/ThreadPool> + +struct Counter { + Counter() = default; + + void inc() { + // Check that mutation happens only in a thread that created this counter. 
+ VERIFY_IS_EQUAL(std::this_thread::get_id(), created_by); + counter_value++; + } + int value() { return counter_value; } + + std::thread::id created_by; + int counter_value = 0; +}; + +struct InitCounter { + void operator()(Counter& counter) { + counter.created_by = std::this_thread::get_id(); + } +}; + +void test_simple_thread_local() { + int num_threads = internal::random<int>(4, 32); + Eigen::ThreadPool thread_pool(num_threads); + Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter()); + + int num_tasks = 3 * num_threads; + Eigen::Barrier barrier(num_tasks); + + for (int i = 0; i < num_tasks; ++i) { + thread_pool.Schedule([&counter, &barrier]() { + Counter& local = counter.local(); + local.inc(); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + barrier.Notify(); + }); + } + + barrier.Wait(); + + counter.ForEach( + [](std::thread::id, Counter& cnt) { VERIFY_IS_EQUAL(cnt.value(), 3); }); +} + +void test_zero_sized_thread_local() { + Eigen::ThreadLocal<Counter, InitCounter> counter(0, InitCounter()); + + Counter& local = counter.local(); + local.inc(); + + int total = 0; + counter.ForEach([&total](std::thread::id, Counter& cnt) { + total += cnt.value(); + VERIFY_IS_EQUAL(cnt.value(), 1); + }); + + VERIFY_IS_EQUAL(total, 1); +} + +// All thread local values fit into the lock-free storage. +void test_large_number_of_tasks_no_spill() { + int num_threads = internal::random<int>(4, 32); + Eigen::ThreadPool thread_pool(num_threads); + Eigen::ThreadLocal<Counter, InitCounter> counter(num_threads, InitCounter()); + + int num_tasks = 10000; + Eigen::Barrier barrier(num_tasks); + + for (int i = 0; i < num_tasks; ++i) { + thread_pool.Schedule([&counter, &barrier]() { + Counter& local = counter.local(); + local.inc(); + barrier.Notify(); + }); + } + + barrier.Wait(); + + int total = 0; + std::unordered_set<std::thread::id> unique_threads; + + counter.ForEach([&](std::thread::id id, Counter& cnt) { + total += cnt.value(); + unique_threads.insert(id); + }); + + VERIFY_IS_EQUAL(total, num_tasks); + // Not all threads in a pool might be woken up to execute submitted tasks. + // Also thread_pool.Schedule() might use current thread if queue is full. + VERIFY_IS_EQUAL( + unique_threads.size() <= (static_cast<size_t>(num_threads + 1)), true); +} + +// Lock free thread local storage is too small to fit all the unique threads, +// and it spills to a map guarded by a mutex. +void test_large_number_of_tasks_with_spill() { + int num_threads = internal::random<int>(4, 32); + Eigen::ThreadPool thread_pool(num_threads); + Eigen::ThreadLocal<Counter, InitCounter> counter(1, InitCounter()); + + int num_tasks = 10000; + Eigen::Barrier barrier(num_tasks); + + for (int i = 0; i < num_tasks; ++i) { + thread_pool.Schedule([&counter, &barrier]() { + Counter& local = counter.local(); + local.inc(); + barrier.Notify(); + }); + } + + barrier.Wait(); + + int total = 0; + std::unordered_set<std::thread::id> unique_threads; + + counter.ForEach([&](std::thread::id id, Counter& cnt) { + total += cnt.value(); + unique_threads.insert(id); + }); + + VERIFY_IS_EQUAL(total, num_tasks); + // Not all threads in a pool might be woken up to execute submitted tasks. + // Also thread_pool.Schedule() might use current thread if queue is full.
+ VERIFY_IS_EQUAL( + unique_threads.size() <= (static_cast<size_t>(num_threads + 1)), true); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_thread_local) { + CALL_SUBTEST(test_simple_thread_local()); + CALL_SUBTEST(test_zero_sized_thread_local()); + CALL_SUBTEST(test_large_number_of_tasks_no_spill()); + CALL_SUBTEST(test_large_number_of_tasks_with_spill()); +} diff --git a/unsupported/test/cxx11_tensor_thread_pool.cpp b/unsupported/test/cxx11_tensor_thread_pool.cpp index 2ef665f30..b772a1d60 100644 --- a/unsupported/test/cxx11_tensor_thread_pool.cpp +++ b/unsupported/test/cxx11_tensor_thread_pool.cpp @@ -16,29 +16,72 @@ using Eigen::Tensor; +class TestAllocator : public Allocator { + public: + ~TestAllocator() EIGEN_OVERRIDE {} + EIGEN_DEVICE_FUNC void* allocate(size_t num_bytes) const EIGEN_OVERRIDE { + const_cast<TestAllocator*>(this)->alloc_count_++; + return internal::aligned_malloc(num_bytes); + } + EIGEN_DEVICE_FUNC void deallocate(void* buffer) const EIGEN_OVERRIDE { + const_cast<TestAllocator*>(this)->dealloc_count_++; + internal::aligned_free(buffer); + } + + int alloc_count() const { return alloc_count_; } + int dealloc_count() const { return dealloc_count_; } + + private: + int alloc_count_ = 0; + int dealloc_count_ = 0; +}; void test_multithread_elementwise() { - Tensor<float, 3> in1(2,3,7); - Tensor<float, 3> in2(2,3,7); - Tensor<float, 3> out(2,3,7); + Tensor<float, 3> in1(200, 30, 70); + Tensor<float, 3> in2(200, 30, 70); + Tensor<double, 3> out(200, 30, 70); in1.setRandom(); in2.setRandom(); Eigen::ThreadPool tp(internal::random<int>(3, 11)); Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11)); - out.device(thread_pool_device) = in1 + in2 * 3.14f; + out.device(thread_pool_device) = (in1 + in2 * 3.14f).cast<double>(); - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 3; ++j) { - for (int k = 0; k < 7; ++k) { - VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f); + for (int i = 0; i < 200; ++i) { + for (int j = 0; j < 30; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f)); } } } } +void test_async_multithread_elementwise() +{ + Tensor<float, 3> in1(200, 30, 70); + Tensor<float, 3> in2(200, 30, 70); + Tensor<double, 3> out(200, 30, 70); + + in1.setRandom(); + in2.setRandom(); + + Eigen::ThreadPool tp(internal::random<int>(3, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11)); + + Eigen::Barrier b(1); + out.device(thread_pool_device, [&b]() { b.Notify(); }) = (in1 + in2 * 3.14f).cast<double>(); + b.Wait(); + + for (int i = 0; i < 200; ++i) { + for (int j = 0; j < 30; ++j) { + for (int k = 0; k < 70; ++k) { + VERIFY_IS_APPROX(out(i, j, k), static_cast<double>(in1(i, j, k) + in2(i, j, k) * 3.14f)); + } + } + } +} void test_multithread_compound_assignment() { @@ -232,6 +275,273 @@ void test_multithread_contraction_agrees_with_singlethread() { } } +// Apply Sqrt to all output elements. 
+struct SqrtOutputKernel { + template <typename Index, typename Scalar> + EIGEN_ALWAYS_INLINE void operator()( + const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper, + const TensorContractionParams&, Index, Index, Index num_rows, + Index num_cols) const { + for (int i = 0; i < num_rows; ++i) { + for (int j = 0; j < num_cols; ++j) { + output_mapper(i, j) = std::sqrt(output_mapper(i, j)); + } + } + } +}; + +template <int DataLayout> +static void test_multithread_contraction_with_output_kernel() { + typedef Tensor<float, 1>::DimensionPair DimPair; + + const int num_threads = internal::random<int>(2, 11); + ThreadPool threads(num_threads); + Eigen::ThreadPoolDevice device(&threads, num_threads); + + Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31); + Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10); + Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10); + + t_left.setRandom(); + t_right.setRandom(); + // Put trash in mat4 to verify contraction clears output memory. + t_result.setRandom(); + + // Add a little offset so that the results won't be close to zero. + t_left += t_left.constant(1.0f); + t_right += t_right.constant(1.0f); + + typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf; + MapXf m_left(t_left.data(), 1500, 248); + MapXf m_right(t_right.data(), 248, 1400); + Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400); + + // this contraction should be equivalent to a single matrix multiplication + Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}}); + + // compute results by separate methods + t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel()); + + m_result = m_left * m_right; + + for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) { + VERIFY(&t_result.data()[i] != &m_result.data()[i]); + VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i])); + } +} + +template<int DataLayout> +void test_async_multithread_contraction_agrees_with_singlethread() +{ + int contract_size = internal::random<int>(100, 500); + + Tensor<float, 3, DataLayout> left(internal::random<int>(10, 40), + contract_size, + internal::random<int>(10, 40)); + + Tensor<float, 4, DataLayout> right( + internal::random<int>(1, 20), internal::random<int>(1, 20), contract_size, + internal::random<int>(1, 20)); + + left.setRandom(); + right.setRandom(); + + // add constants to shift values away from 0 for more precision + left += left.constant(1.5f); + right += right.constant(1.5f); + + typedef Tensor<float, 1>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}}); + + Eigen::ThreadPool tp(internal::random<int>(2, 11)); + Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(8, 32)); + + Tensor<float, 5, DataLayout> st_result; + st_result = left.contract(right, dims); + + Tensor<float, 5, DataLayout> tp_result(st_result.dimensions()); + + Eigen::Barrier barrier(1); + tp_result.device(thread_pool_device, [&barrier]() { barrier.Notify(); }) = + left.contract(right, dims); + barrier.Wait(); + + VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions())); + for (ptrdiff_t i = 0; i < st_result.size(); i++) { + // if both of the values are very small, then do nothing (because the test + // will fail due to numerical precision issues when values are small) + if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) { + VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]); + } + } +} + +// We are triggering 'evalShardedByInnerDim' optimization. 
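// A rough sketch of what that strategy does (an illustration, not the actual
// implementation): when the output block is too small to parallelize (here
// 2x10) but the contracted dimension is huge (here 10000), the contraction
// C = A * B is sharded over the inner dimension instead. Each thread t
// multiplies one k-slice and the partial products are summed:
//
//   C = sum_t  A(:, k_t) * B(k_t, :)
//
// The tests below use exactly such shapes to force this code path.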
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  t_result.device(device) = t_left.contract(t_right, dims, SqrtOutputKernel());
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  Eigen::Barrier barrier(1);
+  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+      t_left.contract(t_right, dims);
+  barrier.Wait();
+
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+// We are triggering 'evalShardedByInnerDim' optimization with output kernel.
+template <int DataLayout>
+static void test_async_sharded_by_inner_dim_contraction_with_output_kernel()
+{
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+
+  const int num_threads = internal::random<int>(4, 16);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+
+  Tensor<float, 2, DataLayout> t_left(2, 10000);
+  Tensor<float, 2, DataLayout> t_right(10000, 10);
+  Tensor<float, 2, DataLayout> t_result(2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+  // Put trash in t_result to verify contraction clears output memory.
+  t_result.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 2, 10000);
+  MapXf m_right(t_right.data(), 10000, 10);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(2, 10);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 0)}});
+
+  // compute results by separate methods
+  Eigen::Barrier barrier(1);
+  t_result.device(device, [&barrier]() { barrier.Notify(); }) =
+      t_left.contract(t_right, dims, SqrtOutputKernel());
+  barrier.Wait();
+  m_result = m_left * m_right;
+
+  for (Index i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], std::sqrt(m_result.data()[i]));
+  }
+}
 
 template<int DataLayout>
 void test_full_contraction() {
@@ -320,14 +630,14 @@ void test_multithread_random()
 }
 
 template<int DataLayout>
-void test_multithread_shuffle()
+void test_multithread_shuffle(Allocator* allocator)
 {
   Tensor<float, 4, DataLayout> tensor(17,5,7,11);
   tensor.setRandom();
 
   const int num_threads = internal::random<int>(2, 11);
   ThreadPool threads(num_threads);
-  Eigen::ThreadPoolDevice device(&threads, num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
 
   Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
   array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
@@ -344,10 +654,26 @@
   }
 }
 
+void test_threadpool_allocate(TestAllocator* allocator)
+{
+  const int num_threads = internal::random<int>(2, 11);
+  const int num_allocs = internal::random<int>(2, 11);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads, allocator);
+
+  for (int a = 0; a < num_allocs; ++a) {
+    void* ptr = device.allocate(512);
+    device.deallocate(ptr);
+  }
+  VERIFY(allocator != NULL);
+  VERIFY_IS_EQUAL(allocator->alloc_count(), num_allocs);
+  VERIFY_IS_EQUAL(allocator->dealloc_count(), num_allocs);
+}
 
-void test_cxx11_tensor_thread_pool()
+EIGEN_DECLARE_TEST(cxx11_tensor_thread_pool)
 {
   CALL_SUBTEST_1(test_multithread_elementwise());
+  CALL_SUBTEST_1(test_async_multithread_elementwise());
   CALL_SUBTEST_1(test_multithread_compound_assignment());
 
   CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
@@ -355,19 +681,41 @@ void test_cxx11_tensor_thread_pool()
   CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
   CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_with_output_kernel<RowMajor>());
+
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<ColMajor>());
+  CALL_SUBTEST_4(test_async_multithread_contraction_agrees_with_singlethread<RowMajor>());
+
+  // Test EvalShardedByInnerDimContext parallelization strategy.
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_5(test_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
+
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction<RowMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<ColMajor>());
+  CALL_SUBTEST_6(test_async_sharded_by_inner_dim_contraction_with_output_kernel<RowMajor>());
 
   // Exercise various cases that have been problematic in the past.
-  CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
-  CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
+  CALL_SUBTEST_7(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST_7(test_contraction_corner_cases<RowMajor>());
+
+  CALL_SUBTEST_8(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_8(test_full_contraction<RowMajor>());
+
+  CALL_SUBTEST_9(test_multithreaded_reductions<ColMajor>());
+  CALL_SUBTEST_9(test_multithreaded_reductions<RowMajor>());
 
-  CALL_SUBTEST_4(test_full_contraction<ColMajor>());
-  CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+  CALL_SUBTEST_10(test_memcpy());
+  CALL_SUBTEST_10(test_multithread_random());
 
-  CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
-  CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
+  TestAllocator test_allocator;
+  CALL_SUBTEST_11(test_multithread_shuffle<ColMajor>(NULL));
+  CALL_SUBTEST_11(test_multithread_shuffle<RowMajor>(&test_allocator));
+  CALL_SUBTEST_11(test_threadpool_allocate(&test_allocator));
 
-  CALL_SUBTEST_6(test_memcpy());
-  CALL_SUBTEST_6(test_multithread_random());
-  CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>());
-  CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>());
+  // Force CMake to split this test.
+  // EIGEN_SUFFIXES;1;2;3;4;5;6;7;8;9;10;11
 }
diff --git a/unsupported/test/cxx11_tensor_trace.cpp b/unsupported/test/cxx11_tensor_trace.cpp
new file mode 100644
index 000000000..009722895
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_trace.cpp
@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
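// What Tensor::trace(dims) computes (an informal summary for the tests below):
// the entries of the input are summed over all index tuples that agree on
// every dimension listed in 'dims' (those dimensions must have equal sizes),
// and the remaining dimensions are kept. For a 3x5x3 tensor t with
// dims = {0, 2}, the result is a rank-1 tensor of size 5 with
//
//   result(i) = t(0, i, 0) + t(1, i, 1) + t(2, i, 2)
//
// which is exactly what test_simple_trace() checks element by element.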
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>
+static void test_0D_trace() {
+  Tensor<float, 0, DataLayout> tensor;
+  tensor.setRandom();
+  array<ptrdiff_t, 0> dims;
+  Tensor<float, 0, DataLayout> result = tensor.trace(dims);
+  VERIFY_IS_EQUAL(result(), tensor());
+}
+
+
+template <int DataLayout>
+static void test_all_dimensions_trace() {
+  Tensor<float, 3, DataLayout> tensor1(5, 5, 5);
+  tensor1.setRandom();
+  Tensor<float, 0, DataLayout> result1 = tensor1.trace();
+  VERIFY_IS_EQUAL(result1.rank(), 0);
+  float sum = 0.0f;
+  for (int i = 0; i < 5; ++i) {
+    sum += tensor1(i, i, i);
+  }
+  VERIFY_IS_EQUAL(result1(), sum);
+
+  Tensor<float, 5, DataLayout> tensor2(7, 7, 7, 7, 7);
+  tensor2.setRandom();
+  array<ptrdiff_t, 5> dims = { { 2, 1, 0, 3, 4 } };
+  Tensor<float, 0, DataLayout> result2 = tensor2.trace(dims);
+  VERIFY_IS_EQUAL(result2.rank(), 0);
+  sum = 0.0f;
+  for (int i = 0; i < 7; ++i) {
+    sum += tensor2(i, i, i, i, i);
+  }
+  VERIFY_IS_EQUAL(result2(), sum);
+}
+
+
+template <int DataLayout>
+static void test_simple_trace() {
+  Tensor<float, 3, DataLayout> tensor1(3, 5, 3);
+  tensor1.setRandom();
+  array<ptrdiff_t, 2> dims1 = { { 0, 2 } };
+  Tensor<float, 1, DataLayout> result1 = tensor1.trace(dims1);
+  VERIFY_IS_EQUAL(result1.rank(), 1);
+  VERIFY_IS_EQUAL(result1.dimension(0), 5);
+  float sum = 0.0f;
+  for (int i = 0; i < 5; ++i) {
+    sum = 0.0f;
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor1(j, i, j);
+    }
+    VERIFY_IS_EQUAL(result1(i), sum);
+  }
+
+  Tensor<float, 4, DataLayout> tensor2(5, 5, 7, 7);
+  tensor2.setRandom();
+  array<ptrdiff_t, 2> dims2 = { { 2, 3 } };
+  Tensor<float, 2, DataLayout> result2 = tensor2.trace(dims2);
+  VERIFY_IS_EQUAL(result2.rank(), 2);
+  VERIFY_IS_EQUAL(result2.dimension(0), 5);
+  VERIFY_IS_EQUAL(result2.dimension(1), 5);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 7; ++k) {
+        sum += tensor2(i, j, k, k);
+      }
+      VERIFY_IS_EQUAL(result2(i, j), sum);
+    }
+  }
+
+  array<ptrdiff_t, 2> dims3 = { { 1, 0 } };
+  Tensor<float, 2, DataLayout> result3 = tensor2.trace(dims3);
+  VERIFY_IS_EQUAL(result3.rank(), 2);
+  VERIFY_IS_EQUAL(result3.dimension(0), 7);
+  VERIFY_IS_EQUAL(result3.dimension(1), 7);
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 5; ++k) {
+        sum += tensor2(k, k, i, j);
+      }
+      VERIFY_IS_EQUAL(result3(i, j), sum);
+    }
+  }
+
+  Tensor<float, 5, DataLayout> tensor3(3, 7, 3, 7, 3);
+  tensor3.setRandom();
+  array<ptrdiff_t, 3> dims4 = { { 0, 2, 4 } };
+  Tensor<float, 2, DataLayout> result4 = tensor3.trace(dims4);
+  VERIFY_IS_EQUAL(result4.rank(), 2);
+  VERIFY_IS_EQUAL(result4.dimension(0), 7);
+  VERIFY_IS_EQUAL(result4.dimension(1), 7);
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        sum += tensor3(k, i, k, j, k);
+      }
+      VERIFY_IS_EQUAL(result4(i, j), sum);
+    }
+  }
+
+  Tensor<float, 5, DataLayout> tensor4(3, 7, 4, 7, 5);
+  tensor4.setRandom();
+  array<ptrdiff_t, 2> dims5 = { { 1, 3 } };
+  Tensor<float, 3, DataLayout> result5 = tensor4.trace(dims5);
+  VERIFY_IS_EQUAL(result5.rank(), 3);
+  VERIFY_IS_EQUAL(result5.dimension(0), 3);
+  VERIFY_IS_EQUAL(result5.dimension(1), 4);
+  VERIFY_IS_EQUAL(result5.dimension(2), 5);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        sum = 0.0f;
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor4(i, l, j, l, k);
+        }
+        VERIFY_IS_EQUAL(result5(i, j, k), sum);
+      }
+    }
+  }
+}
+
+
+template<int DataLayout>
+static void test_trace_in_expr() {
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 3);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> dims = { { 1, 3 } };
+  Tensor<float, 2, DataLayout> result(2, 5);
+  result = result.constant(1.0f) - tensor.trace(dims);
+  VERIFY_IS_EQUAL(result.rank(), 2);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  float sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        sum += tensor(i, k, j, k);
+      }
+      VERIFY_IS_EQUAL(result(i, j), 1.0f - sum);
+    }
+  }
+}
+
+
+EIGEN_DECLARE_TEST(cxx11_tensor_trace) {
+  CALL_SUBTEST(test_0D_trace<ColMajor>());
+  CALL_SUBTEST(test_0D_trace<RowMajor>());
+  CALL_SUBTEST(test_all_dimensions_trace<ColMajor>());
+  CALL_SUBTEST(test_all_dimensions_trace<RowMajor>());
+  CALL_SUBTEST(test_simple_trace<ColMajor>());
+  CALL_SUBTEST(test_simple_trace<RowMajor>());
+  CALL_SUBTEST(test_trace_in_expr<ColMajor>());
+  CALL_SUBTEST(test_trace_in_expr<RowMajor>());
+}
diff --git a/unsupported/test/cxx11_tensor_uint128.cpp b/unsupported/test/cxx11_tensor_uint128.cpp
index d2a1e8673..46fceaa19 100644
--- a/unsupported/test/cxx11_tensor_uint128.cpp
+++ b/unsupported/test/cxx11_tensor_uint128.cpp
@@ -12,7 +12,7 @@
 
 #include <Eigen/CXX11/Tensor>
 
-#if EIGEN_COMP_MSVC
+#if EIGEN_COMP_MSVC || !defined(__SIZEOF_INT128__)
 #define EIGEN_NO_INT128
 #else
 typedef __uint128_t uint128_t;
@@ -144,7 +144,7 @@ void test_misc2() {
 
 #endif
 
-void test_cxx11_tensor_uint128()
+EIGEN_DECLARE_TEST(cxx11_tensor_uint128)
 {
 #ifdef EIGEN_NO_INT128
   // Skip the test on compilers that don't support 128bit integers natively
diff --git a/unsupported/test/cxx11_tensor_volume_patch.cpp b/unsupported/test/cxx11_tensor_volume_patch.cpp
index ca6840f3b..862212e82 100644
--- a/unsupported/test/cxx11_tensor_volume_patch.cpp
+++ b/unsupported/test/cxx11_tensor_volume_patch.cpp
@@ -70,9 +70,9 @@ static void test_entire_volume_patch()
   const int dy = patch_y - 1;
   const int dx = patch_x - 1;
 
-  const int forward_pad_z = dz - dz / 2;
-  const int forward_pad_y = dy - dy / 2;
-  const int forward_pad_x = dx - dx / 2;
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
 
   for (int pz = 0; pz < patch_z; pz++) {
     for (int py = 0; py < patch_y; py++) {
@@ -105,7 +105,7 @@ static void test_entire_volume_patch()
   }
 }
 
-void test_cxx11_tensor_volume_patch()
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch)
 {
   CALL_SUBTEST(test_single_voxel_patch());
   CALL_SUBTEST(test_entire_volume_patch());
diff --git a/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
new file mode 100644
index 000000000..8d99a48ed
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_volume_patch_sycl.cpp
@@ -0,0 +1,222 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
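// A worked example of the padding convention fixed in the hunk above
// (assuming the usual SAME-padding split, with the smaller half in front):
// the total padding per dimension is patch_size - 1, of which
// floor((patch_size - 1) / 2) goes in front. For patch_x = 5, dx = 4 and
// forward_pad_x = 2, with the remaining 2 at the back; for patch_z = 2,
// dz = 1 and forward_pad_z = 0, so the single padded plane sits at the back.
// The old expectation (dz - dz / 2) put the larger half in front, which was
// off by one whenever the patch size is even.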
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+static const int DataLayout = ColMajor;
+
+template <typename DataType, typename IndexType>
+static void test_single_voxel_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  IndexType sizeDim0 = 4;
+  IndexType sizeDim1 = 2;
+  IndexType sizeDim2 = 3;
+  IndexType sizeDim3 = 5;
+  IndexType sizeDim4 = 7;
+  array<IndexType, 5> tensorColMajorRange = {{sizeDim0, sizeDim1, sizeDim2, sizeDim3, sizeDim4}};
+  array<IndexType, 5> tensorRowMajorRange = {{sizeDim4, sizeDim3, sizeDim2, sizeDim1, sizeDim0}};
+  Tensor<DataType, 5, DataLayout, IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 5, RowMajor, IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+  DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(), (tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device) = gpu_col_major.swap_layout();
+
+  // single voxel patch: ColMajor
+  array<IndexType, 6> patchColMajorTensorRange = {{sizeDim0, 1, 1, 1, sizeDim1*sizeDim2*sizeDim3, sizeDim4}};
+  Tensor<DataType, 6, DataLayout, IndexType> single_voxel_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize = single_voxel_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_voxel_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, DataLayout, IndexType>> gpu_single_voxel_patch_col_major(gpu_data_single_voxel_patch_col_major, patchColMajorTensorRange);
+  gpu_single_voxel_patch_col_major.device(sycl_device) = gpu_col_major.extract_volume_patches(1, 1, 1);
+  sycl_device.memcpyDeviceToHost(single_voxel_patch_col_major.data(), gpu_data_single_voxel_patch_col_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(0), 4);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(4), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_col_major.dimension(5), 7);
+
+  array<IndexType, 6> patchRowMajorTensorRange = {{sizeDim4, sizeDim1*sizeDim2*sizeDim3, 1, 1, 1, sizeDim0}};
+  Tensor<DataType, 6, RowMajor, IndexType> single_voxel_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize = single_voxel_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_single_voxel_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, RowMajor, IndexType>> gpu_single_voxel_patch_row_major(gpu_data_single_voxel_patch_row_major, patchRowMajorTensorRange);
+  gpu_single_voxel_patch_row_major.device(sycl_device) = gpu_row_major.extract_volume_patches(1, 1, 1);
+  sycl_device.memcpyDeviceToHost(single_voxel_patch_row_major.data(), gpu_data_single_voxel_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4);
+
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+  for (IndexType i = 0; i < tensor_col_major.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], single_voxel_patch_col_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor_col_major.data()[i], tensor_row_major.data()[i]);
+  }
+
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_single_voxel_patch_col_major);
+  sycl_device.deallocate(gpu_data_single_voxel_patch_row_major);
+}
+
+template <typename DataType, typename IndexType>
+static void test_entire_volume_patch_sycl(const Eigen::SyclDevice& sycl_device)
+{
+  const int depth = 4;
+  const int patch_z = 2;
+  const int patch_y = 3;
+  const int patch_x = 5;
+  const int batch = 7;
+
+  array<IndexType, 5> tensorColMajorRange = {{depth, patch_z, patch_y, patch_x, batch}};
+  array<IndexType, 5> tensorRowMajorRange = {{batch, patch_x, patch_y, patch_z, depth}};
+  Tensor<DataType, 5, DataLayout, IndexType> tensor_col_major(tensorColMajorRange);
+  Tensor<DataType, 5, RowMajor, IndexType> tensor_row_major(tensorRowMajorRange);
+  tensor_col_major.setRandom();
+
+  DataType* gpu_data_col_major = static_cast<DataType*>(sycl_device.allocate(tensor_col_major.size()*sizeof(DataType)));
+  DataType* gpu_data_row_major = static_cast<DataType*>(sycl_device.allocate(tensor_row_major.size()*sizeof(DataType)));
+  TensorMap<Tensor<DataType, 5, ColMajor, IndexType>> gpu_col_major(gpu_data_col_major, tensorColMajorRange);
+  TensorMap<Tensor<DataType, 5, RowMajor, IndexType>> gpu_row_major(gpu_data_row_major, tensorRowMajorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_data_col_major, tensor_col_major.data(), (tensor_col_major.size())*sizeof(DataType));
+  gpu_row_major.device(sycl_device) = gpu_col_major.swap_layout();
+  sycl_device.memcpyDeviceToHost(tensor_row_major.data(), gpu_data_row_major, (tensor_col_major.size())*sizeof(DataType));
+
+  // entire volume patch: ColMajor
+  array<IndexType, 6> patchColMajorTensorRange = {{depth, patch_z, patch_y, patch_x, patch_z*patch_y*patch_x, batch}};
+  Tensor<DataType, 6, DataLayout, IndexType> entire_volume_patch_col_major(patchColMajorTensorRange);
+  size_t patchTensorBuffSize = entire_volume_patch_col_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_volume_patch_col_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, DataLayout, IndexType>> gpu_entire_volume_patch_col_major(gpu_data_entire_volume_patch_col_major, patchColMajorTensorRange);
+  gpu_entire_volume_patch_col_major.device(sycl_device) = gpu_col_major.extract_volume_patches(patch_z, patch_y, patch_x);
+  sycl_device.memcpyDeviceToHost(entire_volume_patch_col_major.data(), gpu_data_entire_volume_patch_col_major, patchTensorBuffSize);
+
+//  Tensor<float, 5> tensor(depth, patch_z, patch_y, patch_x, batch);
+//  tensor.setRandom();
+//  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();
+
+  //Tensor<float, 6> entire_volume_patch;
+  //entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(0), depth);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(1), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(2), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(3), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(4), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_col_major.dimension(5), batch);
+
+//  Tensor<float, 6, RowMajor> entire_volume_patch_row_major;
+  //entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+
+  array<IndexType, 6> patchRowMajorTensorRange = {{batch, patch_z*patch_y*patch_x, patch_x, patch_y, patch_z, depth}};
+  Tensor<DataType, 6, RowMajor, IndexType> entire_volume_patch_row_major(patchRowMajorTensorRange);
+  patchTensorBuffSize = entire_volume_patch_row_major.size()*sizeof(DataType);
+  DataType* gpu_data_entire_volume_patch_row_major = static_cast<DataType*>(sycl_device.allocate(patchTensorBuffSize));
+  TensorMap<Tensor<DataType, 6, RowMajor, IndexType>> gpu_entire_volume_patch_row_major(gpu_data_entire_volume_patch_row_major, patchRowMajorTensorRange);
+  gpu_entire_volume_patch_row_major.device(sycl_device) = gpu_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+  sycl_device.memcpyDeviceToHost(entire_volume_patch_row_major.data(), gpu_data_entire_volume_patch_row_major, patchTensorBuffSize);
+
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);
+
+  const int dz = patch_z - 1;
+  const int dy = patch_y - 1;
+  const int dx = patch_x - 1;
+
+  const int forward_pad_z = dz / 2;
+  const int forward_pad_y = dy / 2;
+  const int forward_pad_x = dx / 2;
+
+  for (int pz = 0; pz < patch_z; pz++) {
+    for (int py = 0; py < patch_y; py++) {
+      for (int px = 0; px < patch_x; px++) {
+        const int patchId = pz + patch_z * (py + px * patch_y);
+        for (int z = 0; z < patch_z; z++) {
+          for (int y = 0; y < patch_y; y++) {
+            for (int x = 0; x < patch_x; x++) {
+              for (int b = 0; b < batch; b++) {
+                for (int d = 0; d < depth; d++) {
+                  float expected = 0.0f;
+                  float expected_row_major = 0.0f;
+                  const int eff_z = z - forward_pad_z + pz;
+                  const int eff_y = y - forward_pad_y + py;
+                  const int eff_x = x - forward_pad_x + px;
+                  if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
+                      eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
+                    expected = tensor_col_major(d, eff_z, eff_y, eff_x, b);
+                    expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
+                  }
+                  VERIFY_IS_EQUAL(entire_volume_patch_col_major(d, z, y, x, patchId, b), expected);
+                  VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  sycl_device.deallocate(gpu_data_col_major);
+  sycl_device.deallocate(gpu_data_row_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_col_major);
+  sycl_device.deallocate(gpu_data_entire_volume_patch_row_major);
+}
+
+template<typename DataType, typename dev_Selector>
+void sycl_tensor_volume_patch_test_per_device(dev_Selector s) {
+  QueueInterface queueInterface(s);
+  auto sycl_device = Eigen::SyclDevice(&queueInterface);
+  std::cout << "Running on " << s.template get_info<cl::sycl::info::device::name>() << std::endl;
+  test_single_voxel_patch_sycl<DataType, int64_t>(sycl_device);
+  test_entire_volume_patch_sycl<DataType, int64_t>(sycl_device);
+}
+EIGEN_DECLARE_TEST(cxx11_tensor_volume_patch_sycl)
+{
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    CALL_SUBTEST(sycl_tensor_volume_patch_test_per_device<float>(device));
+  }
+}
diff --git a/unsupported/test/dgmres.cpp b/unsupported/test/dgmres.cpp
index 2b11807c8..5f63161b2 100644
--- a/unsupported/test/dgmres.cpp
+++ b/unsupported/test/dgmres.cpp
@@ -9,7 +9,7 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #include "../../test/sparse_solver.h"
-#include <Eigen/src/IterativeSolvers/DGMRES.h>
+#include <unsupported/Eigen/IterativeSolvers>
 
 template<typename T> void test_dgmres_T()
 {
@@ -24,7 +24,7 @@ template<typename T> void test_dgmres_T()
   //CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_ssor)  );
 }
 
-void test_dgmres()
+EIGEN_DECLARE_TEST(dgmres)
 {
   CALL_SUBTEST_1(test_dgmres_T<double>());
   CALL_SUBTEST_2(test_dgmres_T<std::complex<double> >());
diff --git a/unsupported/test/forward_adolc.cpp b/unsupported/test/forward_adolc.cpp
index 866db8e86..14a909d3b 100644
--- a/unsupported/test/forward_adolc.cpp
+++ b/unsupported/test/forward_adolc.cpp
@@ -35,7 +35,7 @@ struct TestFunc1
   int m_inputs, m_values;
 
   TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
-  TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+  TestFunc1(int inputs_, int values_) : m_inputs(inputs_), m_values(values_) {}
 
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
@@ -119,7 +119,7 @@ template<typename Func> void adolc_forward_jacobian(const Func& f)
   VERIFY_IS_APPROX(j, jref);
 }
 
-void test_forward_adolc()
+EIGEN_DECLARE_TEST(forward_adolc)
 {
   adtl::setNumDir(NUMBER_DIRECTIONS);
 
@@ -132,7 +132,7 @@ void test_forward_adolc()
   }
 
   {
-    // simple instanciation tests
+    // simple instantiation tests
     Matrix<adtl::adouble,2,1> x;
     foo(x);
     Matrix<adtl::adouble,Dynamic,Dynamic> A(4,4);;
diff --git a/unsupported/test/gmres.cpp b/unsupported/test/gmres.cpp
index f2969116b..8d2254b5b 100644
--- a/unsupported/test/gmres.cpp
+++ b/unsupported/test/gmres.cpp
@@ -24,7 +24,7 @@ template<typename T> void test_gmres_T()
   //CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_ssor)  );
 }
 
-void test_gmres()
+EIGEN_DECLARE_TEST(gmres)
 {
   CALL_SUBTEST_1(test_gmres_T<double>());
   CALL_SUBTEST_2(test_gmres_T<std::complex<double> >());
diff --git a/unsupported/test/idrs.cpp b/unsupported/test/idrs.cpp
new file mode 100644
index 000000000..f88c01632
--- /dev/null
+++ b/unsupported/test/idrs.cpp
@@ -0,0 +1,27 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2012 Kolja Brix <brix@igpm.rwth-aaachen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
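// An informal usage sketch of the solver exercised below (A, x and b are
// placeholder names, not part of this test):
//
//   Eigen::IDRS<Eigen::SparseMatrix<double>,
//               Eigen::DiagonalPreconditioner<double> > solver;
//   solver.compute(A);    // set up the preconditioner for the sparse system
//   x = solver.solve(b);  // iterate until tolerance or max iterations
//
// check_sparse_square_solving() drives this same compute()/solve() cycle on
// randomly generated square systems.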
+
+#include "../../test/sparse_solver.h"
+#include <Eigen/IterativeSolvers>
+
+template<typename T> void test_idrs_T()
+{
+  IDRS<SparseMatrix<T>, DiagonalPreconditioner<T> > idrs_colmajor_diag;
+  IDRS<SparseMatrix<T>, IncompleteLUT<T> >          idrs_colmajor_ilut;
+
+  CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_diag) );
+  CALL_SUBTEST( check_sparse_square_solving(idrs_colmajor_ilut) );
+}
+
+EIGEN_DECLARE_TEST(idrs)
+{
+  CALL_SUBTEST_1(test_idrs_T<double>());
+  CALL_SUBTEST_2(test_idrs_T<std::complex<double> >());
+}
diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp
index e770049e5..b5b764c65 100644
--- a/unsupported/test/kronecker_product.cpp
+++ b/unsupported/test/kronecker_product.cpp
@@ -9,6 +9,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+
 #ifdef EIGEN_TEST_PART_1
 
 #include "sparse.h"
@@ -83,7 +84,7 @@ void check_sparse_kronecker_product(const MatrixType& ab)
 }
 
 
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
 {
   // DM = dense matrix; SM = sparse matrix
 
@@ -95,7 +96,7 @@
   SM_a.insert(1,0) = DM_a.coeffRef(1,0) = -0.9076572187376921;
   SM_a.insert(1,1) = DM_a.coeffRef(1,1) = 0.6469156566545853;
   SM_a.insert(1,2) = DM_a.coeffRef(1,2) = -0.3658010398782789;
-  
+
   MatrixXd DM_b(3,2);
   SparseMatrix<double> SM_b(3,2);
   SM_b.insert(0,0) = DM_b.coeffRef(0,0) = 0.9004440976767099;
@@ -165,7 +166,7 @@
   SM_a.insert(0,3) = -0.2;
   SM_a.insert(2,4) =  0.3;
   SM_a.finalize();
-  
+
   SM_b.insert(0,0) =  0.4;
  SM_b.insert(2,1) = -0.5;
   SM_b.finalize();
@@ -183,7 +184,7 @@
   DM_b2.resize(4,8);
   DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
   CALL_SUBTEST(check_dimension(DM_ab2,10*4,9*8));
-  
+
   for(int i = 0; i < g_repeat; i++)
   {
     double density = Eigen::internal::random<double>(0.01,0.5);
@@ -196,35 +197,35 @@
     MatrixXf dA(ra,ca), dB(rb,cb), dC;
     initSparse(density, dA, sA);
     initSparse(density, dB, sB);
-    
+
     sC = kroneckerProduct(sA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA.transpose(),sB);
     dC = kroneckerProduct(dA.transpose(),dB);
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA.transpose(),sB.transpose());
     dC = kroneckerProduct(dA.transpose(),dB.transpose());
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC = kroneckerProduct(sA,sB.transpose());
     dC = kroneckerProduct(dA,dB.transpose());
     VERIFY_IS_APPROX(MatrixXf(sC),dC);
-    
+
     sC2 = kroneckerProduct(sA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(dA,sB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(sA,dB);
     dC = kroneckerProduct(dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
-    
+
     sC2 = kroneckerProduct(2*sA,sB);
     dC = kroneckerProduct(2*dA,dB);
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
@@ -236,11 +237,10 @@
 
 #ifdef EIGEN_TEST_PART_2
 // simply check that for a dense kronecker product, sparse module is not needed
-
 #include "main.h"
 #include <Eigen/KroneckerProduct>
 
-void test_kronecker_product()
+EIGEN_DECLARE_TEST(kronecker_product)
 {
   MatrixXd  a(2,2), b(3,3), c;
   a.setRandom();
diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp
index 64f168c16..7f9a81cd3 100644
--- a/unsupported/test/levenberg_marquardt.cpp
+++ b/unsupported/test/levenberg_marquardt.cpp
@@ -1445,7 +1445,7 @@ void testNistEckerle4(void)
   VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
 }
 
-void test_levenberg_marquardt()
+EIGEN_DECLARE_TEST(levenberg_marquardt)
 {
   // Tests using the examples provided by (c)minpack
   CALL_SUBTEST(testLmder1());
diff --git a/unsupported/test/matrix_exponential.cpp b/unsupported/test/matrix_exponential.cpp
index 50dec083d..b032cbf1d 100644
--- a/unsupported/test/matrix_exponential.cpp
+++ b/unsupported/test/matrix_exponential.cpp
@@ -119,7 +119,7 @@ void randomTest(const MatrixType& m, double tol)
   }
 }
 
-void test_matrix_exponential()
+EIGEN_DECLARE_TEST(matrix_exponential)
 {
   CALL_SUBTEST_2(test2dRotation<double>(1e-13));
   CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
diff --git a/unsupported/test/matrix_function.cpp b/unsupported/test/matrix_function.cpp
index 7c9b68a3c..6d753737d 100644
--- a/unsupported/test/matrix_function.cpp
+++ b/unsupported/test/matrix_function.cpp
@@ -23,9 +23,8 @@ inline bool test_isApprox_abs(const Type1& a, const Type2& b)
 
 // Returns a matrix with eigenvalues clustered around 0, 1 and 2.
 template<typename MatrixType>
-MatrixType randomMatrixWithRealEivals(const typename MatrixType::Index size)
+MatrixType randomMatrixWithRealEivals(const Index size)
 {
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   MatrixType diag = MatrixType::Zero(size, size);
@@ -42,16 +41,15 @@ template <typename MatrixType, int IsComplex = NumTraits<typename internal::trai
 struct randomMatrixWithImagEivals
 {
   // Returns a matrix with eigenvalues clustered around 0 and +/- i.
-  static MatrixType run(const typename MatrixType::Index size);
+  static MatrixType run(const Index size);
 };
 
 // Partial specialization for real matrices
 template<typename MatrixType>
 struct randomMatrixWithImagEivals<MatrixType, 0>
 {
-  static MatrixType run(const typename MatrixType::Index size)
+  static MatrixType run(const Index size)
   {
-    typedef typename MatrixType::Index Index;
     typedef typename MatrixType::Scalar Scalar;
     MatrixType diag = MatrixType::Zero(size, size);
     Index i = 0;
@@ -77,9 +75,8 @@ struct randomMatrixWithImagEivals<MatrixType, 0>
 template<typename MatrixType>
 struct randomMatrixWithImagEivals<MatrixType, 1>
 {
-  static MatrixType run(const typename MatrixType::Index size)
+  static MatrixType run(const Index size)
   {
-    typedef typename MatrixType::Index Index;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
     const Scalar imagUnit(0, 1);
@@ -171,7 +168,6 @@ void testMatrixType(const MatrixType& m)
 {
   // Matrices with clustered eigenvalue lead to different code paths
   // in MatrixFunction.h and are thus useful for testing.
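// Background on why clustering matters (paraphrased, not from this patch):
// the evaluator reduces the matrix to triangular Schur form, groups nearby
// eigenvalues into clusters, evaluates the function on each diagonal block
// by a Taylor series, and recovers the off-diagonal blocks by solving
// Sylvester equations. Eigenvalues packed around 0, 1 and 2 exercise both
// the single-cluster and the multi-cluster paths of that algorithm.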
-  typedef typename MatrixType::Index Index;
   const Index size = m.rows();
 
   for (int i = 0; i < g_repeat; i++) {
@@ -181,7 +177,40 @@ void testMatrixType(const MatrixType& m)
   }
 }
 
-void test_matrix_function()
+template<typename MatrixType>
+void testMapRef(const MatrixType& A)
+{
+  // Test if passing Ref and Map objects is possible
+  // (Regression test for Bug #1796)
+  Index size = A.rows();
+  MatrixType X; X.setRandom(size, size);
+  MatrixType Y(size,size);
+  Ref<      MatrixType> R(Y);
+  Ref<const MatrixType> Rc(X);
+  Map<      MatrixType> M(Y.data(), size, size);
+  Map<const MatrixType> Mc(X.data(), size, size);
+
+  X = X*X; // make sure sqrt is possible
+  Y = X.sqrt();
+  R = Rc.sqrt();
+  M = Mc.sqrt();
+  Y = X.exp();
+  R = Rc.exp();
+  M = Mc.exp();
+  X = Y; // make sure log is possible
+  Y = X.log();
+  R = Rc.log();
+  M = Mc.log();
+
+  Y = X.cos() + Rc.cos() + Mc.cos();
+  Y = X.sin() + Rc.sin() + Mc.sin();
+
+  Y = X.cosh() + Rc.cosh() + Mc.cosh();
+  Y = X.sinh() + Rc.sinh() + Mc.sinh();
+}
+
+
+EIGEN_DECLARE_TEST(matrix_function)
 {
   CALL_SUBTEST_1(testMatrixType(Matrix<float,1,1>()));
   CALL_SUBTEST_2(testMatrixType(Matrix3cf()));
@@ -190,4 +219,9 @@ void test_matrix_function()
   CALL_SUBTEST_5(testMatrixType(Matrix<double,5,5,RowMajor>()));
   CALL_SUBTEST_6(testMatrixType(Matrix4cd()));
   CALL_SUBTEST_7(testMatrixType(MatrixXd(13,13)));
+
+  CALL_SUBTEST_1(testMapRef(Matrix<float,1,1>()));
+  CALL_SUBTEST_2(testMapRef(Matrix3cf()));
+  CALL_SUBTEST_3(testMapRef(MatrixXf(8,8)));
+  CALL_SUBTEST_7(testMapRef(MatrixXd(13,13)));
 }
diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp
index 7ccfacfdf..dbaf9dbdf 100644
--- a/unsupported/test/matrix_power.cpp
+++ b/unsupported/test/matrix_power.cpp
@@ -19,7 +19,7 @@ void test2dRotation(const T& tol)
   MatrixPower<Matrix<T,2,2> > Apow(A);
 
   for (int i=0; i<=20; ++i) {
-    angle = std::pow(T(10), (i-10) / T(5.));
+    angle = std::pow(T(10), T(i-10) / T(5.));
     c = std::cos(angle);
     s = std::sin(angle);
     B << c, s, -s, c;
@@ -61,7 +61,7 @@ void test3dRotation(const T& tol)
   for (int i=0; i<=20; ++i) {
     v = Matrix<T,3,1>::Random();
     v.normalize();
-    angle = std::pow(T(10), (i-10) / T(5.));
+    angle = std::pow(T(10), T(i-10) / T(5.));
     VERIFY(AngleAxis<T>(angle, v).matrix().isApprox(AngleAxis<T>(1,v).matrix().pow(angle), tol));
   }
 }
@@ -150,55 +150,55 @@ typedef Matrix<double,3,3,RowMajor> Matrix3dRowMajor;
 typedef Matrix<long double,3,3> Matrix3e;
 typedef Matrix<long double,Dynamic,Dynamic> MatrixXe;
 
-void test_matrix_power()
+EIGEN_DECLARE_TEST(matrix_power)
 {
   CALL_SUBTEST_2(test2dRotation<double>(1e-13));
-  CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
+  CALL_SUBTEST_1(test2dRotation<float>(2e-5f));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
   CALL_SUBTEST_9(test2dRotation<long double>(1e-13L));
   CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14));
-  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5));
+  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5f));
   CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14L));
 
   CALL_SUBTEST_10(test3dRotation<double>(1e-13));
-  CALL_SUBTEST_11(test3dRotation<float>(1e-5));
+  CALL_SUBTEST_11(test3dRotation<float>(1e-5f));
   CALL_SUBTEST_12(test3dRotation<long double>(1e-13L));
 
   CALL_SUBTEST_2(testGeneral(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testGeneral(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testGeneral(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testGeneral(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testGeneral(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3)); // see bug 614
+  CALL_SUBTEST_1(testGeneral(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3f)); // see bug 614
   CALL_SUBTEST_9(testGeneral(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testGeneral(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4f));
  CALL_SUBTEST_12(testGeneral(Matrix3e(),        1e-13L));
 
   CALL_SUBTEST_2(testSingular(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testSingular(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testSingular(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testSingular(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testSingular(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_1(testSingular(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3f));
   CALL_SUBTEST_9(testSingular(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testSingular(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4f));
   CALL_SUBTEST_12(testSingular(Matrix3e(),        1e-13L));
 
   CALL_SUBTEST_2(testLogThenExp(Matrix2d(),         1e-13));
   CALL_SUBTEST_7(testLogThenExp(Matrix3dRowMajor(), 1e-13));
   CALL_SUBTEST_3(testLogThenExp(Matrix4cd(),        1e-13));
   CALL_SUBTEST_4(testLogThenExp(MatrixXd(8,8),      2e-12));
-  CALL_SUBTEST_1(testLogThenExp(Matrix2f(),         1e-4));
-  CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4));
-  CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4));
-  CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_1(testLogThenExp(Matrix2f(),         1e-4f));
+  CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4f));
+  CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4f));
+  CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3f));
   CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7),      1e-13L));
   CALL_SUBTEST_10(testLogThenExp(Matrix3d(),        1e-13));
-  CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4));
+  CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4f));
   CALL_SUBTEST_12(testLogThenExp(Matrix3e(),        1e-13L));
 }
diff --git a/unsupported/test/matrix_square_root.cpp b/unsupported/test/matrix_square_root.cpp
index ea541e1ea..034f29217 100644
--- a/unsupported/test/matrix_square_root.cpp
+++ b/unsupported/test/matrix_square_root.cpp
@@ -18,7 +18,7 @@ void testMatrixSqrt(const MatrixType& m)
   VERIFY_IS_APPROX(sqrtA * sqrtA, A);
 }
 
-void test_matrix_square_root()
+EIGEN_DECLARE_TEST(matrix_square_root)
 {
   for (int i = 0; i < g_repeat; i++) {
     CALL_SUBTEST_1(testMatrixSqrt(Matrix3cf()));
diff --git a/unsupported/test/minres.cpp b/unsupported/test/minres.cpp
index 8b300b78a..2eb40fef6 100644
--- a/unsupported/test/minres.cpp
+++ b/unsupported/test/minres.cpp
@@ -36,7 +36,7 @@ template<typename T> void test_minres_T()
 }
 
-void test_minres()
+EIGEN_DECLARE_TEST(minres)
 {
   CALL_SUBTEST_1(test_minres_T<double>());
//  CALL_SUBTEST_2(test_minres_T<std::compex<double> >());
diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h
deleted file mode 100644
index 8404f1ff8..000000000
--- a/unsupported/test/mpreal/mpreal.h
+++ /dev/null
@@ -1,3104 +0,0 @@
-/*
- MPFR C++: Multi-precision floating point number class for C++.
- Based on MPFR library: http://mpfr.org
-
- Project homepage: http://www.holoborodko.com/pavel/mpfr
- Contact e-mail: pavel@holoborodko.com
-
- Copyright (c) 2008-2015 Pavel Holoborodko
-
- Contributors:
- Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman,
- Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen,
- Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng,
- Alexei Zubanov, Jauhien Piatlicki, Victor Berger, John Westwood,
- Petr Aleksandrov, Orion Poplawski, Charles Karney, Arash Partow,
- Rodney James, Jorge Leitao.
-
- Licensing:
- (A) MPFR C++ is under GNU General Public License ("GPL").
-
- (B) Non-free licenses may also be purchased from the author, for users who
- do not want their programs protected by the GPL.
-
- The non-free licenses are for users that wish to use MPFR C++ in
- their products but are unwilling to release their software
- under the GPL (which would require them to release source code
- and allow free redistribution).
-
- Such users can purchase an unlimited-use license from the author.
- Contact us for more details.
-
- GNU General Public License ("GPL") copyright permissions statement:
- **************************************************************************
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __MPREAL_H__
-#define __MPREAL_H__
-
-#include <string>
-#include <iostream>
-#include <sstream>
-#include <stdexcept>
-#include <cfloat>
-#include <cmath>
-#include <cstring>
-#include <limits>
-#include <complex>
-#include <algorithm>
-
-// Options
-#define MPREAL_HAVE_MSVC_DEBUGVIEW // Enable Debugger Visualizer for "Debug" builds in MSVC.
-#define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS // Enable extended std::numeric_limits<mpfr::mpreal> specialization.
- // Meaning that "digits", "round_style" and similar members are defined as functions, not constants.
- // See std::numeric_limits<mpfr::mpreal> at the end of the file for more information.
-
-// Library version
-#define MPREAL_VERSION_MAJOR 3
-#define MPREAL_VERSION_MINOR 6
-#define MPREAL_VERSION_PATCHLEVEL 2
-#define MPREAL_VERSION_STRING "3.6.2"
-
-// Detect compiler using signatures from http://predef.sourceforge.net/
-#if defined(__GNUC__)
- #define IsInf(x) (isinf)(x) // GNU C++/Intel ICC compiler on Linux
-#elif defined(_MSC_VER) // Microsoft Visual C++
- #define IsInf(x) (!_finite(x))
-#else
- #define IsInf(x) (std::isinf)(x) // GNU C/C++ (and/or other compilers), just hope for C99 conformance
-#endif
-
-// A Clang feature extension to determine compiler features.
-#ifndef __has_feature
- #define __has_feature(x) 0
-#endif
-
-// Detect support for r-value references (move semantic). Borrowed from Eigen.
-#if (__has_feature(cxx_rvalue_references) || \
- defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
- (defined(_MSC_VER) && _MSC_VER >= 1600))
-
- #define MPREAL_HAVE_MOVE_SUPPORT
-
- // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization
- #define mpfr_is_initialized(x) (0 != (x)->_mpfr_d)
- #define mpfr_set_uninitialized(x) ((x)->_mpfr_d = 0 )
-#endif
-
-// Detect support for explicit converters.
-#if (__has_feature(cxx_explicit_conversions) || \
- (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
- (defined(_MSC_VER) && _MSC_VER >= 1800))
-
- #define MPREAL_HAVE_EXPLICIT_CONVERTERS
-#endif
-
-#define MPFR_USE_INTMAX_T // Enable 64-bit integer types - should be defined before mpfr.h
-
-#if defined(MPREAL_HAVE_MSVC_DEBUGVIEW) && defined(_MSC_VER) && defined(_DEBUG)
- #define MPREAL_MSVC_DEBUGVIEW_CODE DebugView = toString();
- #define MPREAL_MSVC_DEBUGVIEW_DATA std::string DebugView;
-#else
- #define MPREAL_MSVC_DEBUGVIEW_CODE
- #define MPREAL_MSVC_DEBUGVIEW_DATA
-#endif
-
-#include <mpfr.h>
-
-#if (MPFR_VERSION < MPFR_VERSION_NUM(3,0,0))
- #include <cstdlib> // Needed for random()
-#endif
-
-// Less important options
-#define MPREAL_DOUBLE_BITS_OVERFLOW -1 // Triggers overflow exception during conversion to double if mpreal
- // cannot fit in MPREAL_DOUBLE_BITS_OVERFLOW bits
- // = -1 disables overflow checks (default)
-
-// Fast replacement for mpfr_set_zero(x, +1):
-// (a) uses low-level data members, might not be compatible with new versions of MPFR
-// (b) sign is not set, add (x)->_mpfr_sign = 1;
-#define mpfr_set_zero_fast(x) ((x)->_mpfr_exp = __MPFR_EXP_ZERO)
-
-#if defined(__GNUC__)
- #define MPREAL_PERMISSIVE_EXPR __extension__
-#else
- #define MPREAL_PERMISSIVE_EXPR
-#endif
-
-namespace mpfr {
-
-class mpreal {
-private:
- mpfr_t mp;
-
-public:
-
- // Get default rounding mode & precision
- inline static mp_rnd_t get_default_rnd() { return (mp_rnd_t)(mpfr_get_default_rounding_mode()); }
- inline static mp_prec_t get_default_prec() { return mpfr_get_default_prec(); }
-
- // Constructors && type conversions
- mpreal();
- mpreal(const mpreal& u);
- mpreal(const mpf_t u);
- mpreal(const mpz_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const mpq_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const long double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const unsigned long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const unsigned long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const unsigned int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-
- // Construct mpreal from mpfr_t structure.
- // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers.
- mpreal(const mpfr_t u, bool shared = false);
-
- mpreal(const char* s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
- mpreal(const std::string& s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
-
- ~mpreal();
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
- mpreal& operator=(mpreal&& v);
- mpreal(mpreal&& u);
-#endif
-
- // Operations
- // =
- // +, -, *, /, ++, --, <<, >>
- // *=, +=, -=, /=,
- // <, >, ==, <=, >=
-
- // =
- mpreal& operator=(const mpreal& v);
- mpreal& operator=(const mpf_t v);
- mpreal& operator=(const mpz_t v);
- mpreal& operator=(const mpq_t v);
- mpreal& operator=(const long double v);
- mpreal& operator=(const double v);
- mpreal& operator=(const unsigned long int v);
- mpreal& operator=(const unsigned long long int v);
- mpreal& operator=(const long long int v);
- mpreal& operator=(const unsigned int v);
- mpreal& operator=(const long int v);
- mpreal& operator=(const int v);
- mpreal& operator=(const char* s);
- mpreal& operator=(const std::string& s);
- template <typename real_t> mpreal& operator= (const std::complex<real_t>& z);
-
- // +
- mpreal& operator+=(const mpreal& v);
- mpreal& operator+=(const mpf_t v);
- mpreal& operator+=(const mpz_t v);
- mpreal& operator+=(const mpq_t v);
- mpreal& operator+=(const long double u);
- mpreal& operator+=(const double u);
- mpreal& operator+=(const unsigned long int u);
- mpreal& operator+=(const unsigned int u);
- mpreal& operator+=(const long int u);
- mpreal& operator+=(const int u);
-
- mpreal& operator+=(const long long int u);
- mpreal& operator+=(const unsigned long long int u);
- mpreal& operator-=(const long long int u);
- mpreal& operator-=(const unsigned long long int u);
- mpreal& operator*=(const long long int u);
- mpreal& operator*=(const unsigned long long int u);
- mpreal& operator/=(const long long int u);
- mpreal& operator/=(const unsigned long long int u);
-
- const mpreal operator+() const;
- mpreal& operator++ ();
- const mpreal operator++ (int);
-
- // -
- mpreal& operator-=(const mpreal& v);
- mpreal& operator-=(const mpz_t v);
- mpreal& operator-=(const mpq_t v);
- mpreal& operator-=(const long double u);
- mpreal& operator-=(const double u);
- mpreal& operator-=(const unsigned long int u);
- mpreal& operator-=(const unsigned int u);
- mpreal& operator-=(const long int u);
- mpreal& operator-=(const int u);
- const mpreal operator-() const;
- friend const mpreal operator-(const unsigned long int b, const mpreal& a);
- friend const mpreal operator-(const unsigned int b, const mpreal& a);
- friend const mpreal operator-(const long int b, const mpreal& a);
- friend const mpreal operator-(const int b, const mpreal& a);
- friend const mpreal operator-(const double b, const mpreal& a);
- mpreal& operator-- ();
- const mpreal operator-- (int);
-
- // *
- mpreal& operator*=(const mpreal& v);
- mpreal& operator*=(const mpz_t v);
- mpreal& operator*=(const mpq_t v);
- mpreal& operator*=(const long double v);
- mpreal& operator*=(const double v);
- mpreal& operator*=(const unsigned long int v);
- mpreal& operator*=(const unsigned int v);
- mpreal& operator*=(const long int v);
- mpreal& operator*=(const int v);
-
- // /
- mpreal& operator/=(const mpreal& v);
- mpreal& operator/=(const mpz_t v);
- mpreal& operator/=(const mpq_t v);
- mpreal& operator/=(const long double v);
- mpreal& operator/=(const double v);
- mpreal& operator/=(const unsigned long int v);
- mpreal& operator/=(const unsigned int v);
- mpreal& operator/=(const long int v);
- mpreal& operator/=(const int v);
- friend const mpreal operator/(const unsigned long int b, const mpreal& a);
- friend const mpreal operator/(const unsigned int b, const mpreal& a);
- friend const mpreal operator/(const long int b, const mpreal& a);
- friend const mpreal operator/(const int b, const mpreal& a);
- friend const mpreal operator/(const double b, const mpreal& a);
-
- //<<= Fast Multiplication by 2^u
- mpreal& operator<<=(const unsigned long int u);
- mpreal& operator<<=(const unsigned int u);
- mpreal& operator<<=(const long int u);
- mpreal& operator<<=(const int u);
-
- //>>= Fast Division by 2^u
- mpreal& operator>>=(const unsigned long int u);
- mpreal& operator>>=(const unsigned int u);
- mpreal& operator>>=(const long int u);
- mpreal& operator>>=(const int u);
-
- // Type Conversion operators
- bool toBool ( ) const;
- long toLong (mp_rnd_t mode = GMP_RNDZ) const;
- unsigned long toULong (mp_rnd_t mode = GMP_RNDZ) const;
- long long toLLong (mp_rnd_t mode = GMP_RNDZ) const;
- unsigned long long toULLong (mp_rnd_t mode = GMP_RNDZ) const;
- float toFloat (mp_rnd_t mode = GMP_RNDN) const;
- double toDouble (mp_rnd_t mode = GMP_RNDN) const;
- long double toLDouble (mp_rnd_t mode = GMP_RNDN) const;
-
-#if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)
- explicit operator bool () const { return toBool(); }
- explicit operator int () const { return int(toLong()); }
- explicit operator long () const { return toLong(); }
- explicit operator long long () const { return toLLong(); }
- explicit operator unsigned () const { return unsigned(toULong()); }
- explicit operator unsigned long () const { return toULong(); }
- explicit operator unsigned long long () const { return toULLong(); }
- explicit operator float () const { return toFloat(); }
- explicit operator double () const { return toDouble(); }
- explicit operator long double () const { return toLDouble(); }
-#endif
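-
- // Illustrative sketch (not part of the original header): with explicit
- // converters enabled, narrowing conversions must be spelled out, e.g.:
- //
- //   mpfr::mpreal x("3.141592653589793238462643");
- //   double d = static_cast<double>(x); // explicit conversion, may round
- //   // double e = x;                   // would not compile: converters are explicit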
-
- // Get raw pointers so that mpreal can be directly used in raw mpfr_* functions
- ::mpfr_ptr mpfr_ptr();
- ::mpfr_srcptr mpfr_ptr() const;
- ::mpfr_srcptr mpfr_srcptr() const;
-
- // Convert mpreal to string with n significant digits in base b
- // n = -1 -> convert with the maximum available digits
- std::string toString(int n = -1, int b = 10, mp_rnd_t mode = mpreal::get_default_rnd()) const;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- std::string toString(const std::string& format) const;
-#endif
-
- std::ostream& output(std::ostream& os) const;
-
- // Math Functions
- friend const mpreal sqr (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sqrt(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sqrt(const unsigned long int v, mp_rnd_t rnd_mode);
- friend const mpreal cbrt(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal root(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const mpreal& a, const long int b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode);
- friend const mpreal pow (const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode);
- friend const mpreal fabs(const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal abs(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
- friend inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
- friend inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
- friend inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
- friend inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
- friend int cmpabs(const mpreal& a,const mpreal& b);
-
- friend const mpreal log (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal log2 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal logb (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal log10(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal exp (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal exp2 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal exp10(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal log1p(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal expm1(const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal cos(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sin(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal tan(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sec(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal csc(const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal cot(const mpreal& v, mp_rnd_t rnd_mode);
- friend int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal acos (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asin (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal atan (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode);
- friend const mpreal acot (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asec (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acsc (const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal cosh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sinh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal tanh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal sech (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal csch (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal coth (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acosh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asinh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal atanh (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acoth (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal asech (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal acsch (const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
- friend const mpreal fac_ui (unsigned long int v, mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal eint (const mpreal& v, mp_rnd_t rnd_mode);
-
- friend const mpreal gamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal tgamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal lngamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal lgamma (const mpreal& v, int *signp, mp_rnd_t rnd_mode);
- friend const mpreal zeta (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal erf (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal erfc (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besseljn (long n, const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal bessely0 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal bessely1 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
- friend const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
- friend const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode);
- friend const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t rnd_mode);
- friend int sgn(const mpreal& v); // returns -1 or +1
-
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- friend int sinh_cosh (mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal li2 (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
- friend const mpreal rec_sqrt (const mpreal& v, mp_rnd_t rnd_mode);
-
- // MATLAB's semantic equivalents
- friend const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Remainder after division
- friend const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Modulus after division
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- friend const mpreal digamma (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal ai (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
- friend const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear
- friend const mpreal grandom (unsigned int seed);
-#endif
-
- // Uniformly distributed random number generation in [0,1] using the
- // Mersenne-Twister algorithm by default.
- // Use the parameter to set the seed, e.g.: random((unsigned)time(NULL))
- // See urandom() for more precise control.
- friend const mpreal random(unsigned int seed);
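-
- // Usage sketch (illustrative, assumes <ctime> for time()):
- //
- //   mpfr::mpreal r = mpfr::random((unsigned)time(NULL)); // r in [0,1]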
-
- // Splits mpreal value into fractional and integer parts.
- // Returns fractional part and stores integer part in n.
- friend const mpreal modf(const mpreal& v, mpreal& n);
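-
- // Example (illustrative): modf(mpreal("2.75"), n) returns 0.75 and sets n = 2.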
-
- // Constants
- // Don't forget to call mpfr_free_cache() in every thread that uses these const_* functions.
- friend const mpreal const_log2 (mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal const_pi (mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal const_euler (mp_prec_t prec, mp_rnd_t rnd_mode);
- friend const mpreal const_catalan (mp_prec_t prec, mp_rnd_t rnd_mode);
-
- // returns +inf if sign >= 0, otherwise -inf
- friend const mpreal const_infinity(int sign, mp_prec_t prec);
-
- // Output/ Input
- friend std::ostream& operator<<(std::ostream& os, const mpreal& v);
- friend std::istream& operator>>(std::istream& is, mpreal& v);
-
- // Integer Related Functions
- friend const mpreal rint (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal ceil (const mpreal& v);
- friend const mpreal floor(const mpreal& v);
- friend const mpreal round(const mpreal& v);
- friend const mpreal trunc(const mpreal& v);
- friend const mpreal rint_ceil (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal rint_floor (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal rint_round (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal rint_trunc (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal frac (const mpreal& v, mp_rnd_t rnd_mode);
- friend const mpreal remainder ( const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
- friend const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
- // Miscellaneous Functions
- friend const mpreal nexttoward (const mpreal& x, const mpreal& y);
- friend const mpreal nextabove (const mpreal& x);
- friend const mpreal nextbelow (const mpreal& x);
-
- // use gmp_randinit_default() to init state, gmp_randclear() to clear
- friend const mpreal urandomb (gmp_randstate_t& state);
-
-// MPFR < 2.4.2 Specifics
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
- friend const mpreal random2 (mp_size_t size, mp_exp_t exp);
-#endif
-
- // Instance Checkers
- friend bool (isnan) (const mpreal& v);
- friend bool (isinf) (const mpreal& v);
- friend bool (isfinite) (const mpreal& v);
-
- friend bool isnum (const mpreal& v);
- friend bool iszero (const mpreal& v);
- friend bool isint (const mpreal& v);
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- friend bool isregular(const mpreal& v);
-#endif
-
- // Set/Get instance properties
- inline mp_prec_t get_prec() const;
- inline void set_prec(mp_prec_t prec, mp_rnd_t rnd_mode = get_default_rnd()); // Change precision with rounding mode
-
- // Aliases for get_prec(), set_prec() - needed for compatibility with std::complex<mpreal> interface
- inline mpreal& setPrecision(int Precision, mp_rnd_t RoundingMode = get_default_rnd());
- inline int getPrecision() const;
-
- // Set mpreal to +/- inf, NaN, +/-0
- mpreal& setInf (int Sign = +1);
- mpreal& setNan ();
- mpreal& setZero (int Sign = +1);
- mpreal& setSign (int Sign, mp_rnd_t RoundingMode = get_default_rnd());
-
- // Exponent
- mp_exp_t get_exp();
- int set_exp(mp_exp_t e);
- int check_range (int t, mp_rnd_t rnd_mode = get_default_rnd());
- int subnormalize (int t, mp_rnd_t rnd_mode = get_default_rnd());
-
- // Inexact conversion from floating point
- inline bool fits_in_bits(double x, int n);
-
- // Set/Get global properties
- static void set_default_prec(mp_prec_t prec);
- static void set_default_rnd(mp_rnd_t rnd_mode);
-
- static mp_exp_t get_emin (void);
- static mp_exp_t get_emax (void);
- static mp_exp_t get_emin_min (void);
- static mp_exp_t get_emin_max (void);
- static mp_exp_t get_emax_min (void);
- static mp_exp_t get_emax_max (void);
- static int set_emin (mp_exp_t exp);
- static int set_emax (mp_exp_t exp);
-
- // Efficient swapping of two mpreal values - needed for std algorithms
- friend void swap(mpreal& x, mpreal& y);
-
- friend const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
- friend const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-private:
- // Human-friendly Debug Preview in Visual Studio.
- // Put one of these lines:
- //
- // mpfr::mpreal=<DebugView> ; Show value only
- // mpfr::mpreal=<DebugView>, <mp[0]._mpfr_prec,u>bits ; Show value & precision
- //
- // at the beginning of
- // [Visual Studio Installation Folder]\Common7\Packages\Debugger\autoexp.dat
- MPREAL_MSVC_DEBUGVIEW_DATA
-
- // "Smart" resources deallocation. Checks if instance initialized before deletion.
- void clear(::mpfr_ptr);
-};
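-
- // Minimal usage sketch for the class above (illustrative, not part of the
- // original header). Defaults should be set before any mpreal is constructed,
- // since existing values keep the precision they were created with:
- //
- //   using mpfr::mpreal;
- //   mpreal::set_default_prec(mpfr::digits2bits(50)); // ~50 decimal digits
- //   mpreal a = 1;
- //   mpreal b = sqrt(mpreal(2), mpreal::get_default_rnd());
- //   std::cout << a / b << std::endl;
- //   mpfr_free_cache(); // per thread, when the const_* functions are used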
-
-//////////////////////////////////////////////////////////////////////////
-// Exceptions
-class conversion_overflow : public std::exception {
-public:
- std::string why() { return "inexact conversion from floating point"; }
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Constructors & converters
-// Default constructor: creates an mp number and initializes it to 0.
-inline mpreal::mpreal()
-{
- mpfr_init2(mpfr_ptr(), mpreal::get_default_prec());
- mpfr_set_zero_fast(mpfr_ptr());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpreal& u)
-{
- mpfr_init2(mpfr_ptr(),mpfr_get_prec(u.mpfr_srcptr()));
- mpfr_set (mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-inline mpreal::mpreal(mpreal&& other)
-{
- mpfr_set_uninitialized(mpfr_ptr()); // make sure "other" holds no pointer to actual data
- mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal& mpreal::operator=(mpreal&& other)
-{
- mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-#endif
-
-inline mpreal::mpreal(const mpfr_t u, bool shared)
-{
- if(shared)
- {
- std::memcpy(mpfr_ptr(), u, sizeof(mpfr_t));
- }
- else
- {
- mpfr_init2(mpfr_ptr(), mpfr_get_prec(u));
- mpfr_set (mpfr_ptr(), u, mpreal::get_default_rnd());
- }
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
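-
- // Illustrative sketch of the two construction modes above (assumes a raw
- // mpfr_t 't' initialized by the caller):
- //
- //   mpfr_t t; mpfr_init2(t, 128); mpfr_set_ui(t, 42, GMP_RNDN);
- //   mpfr::mpreal deep(t);       // deep copy: owns its own data, caller still clears t
- //   mpfr::mpreal view(t, true); // shared: aliases t's data, so do not clear t as well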
-
-inline mpreal::mpreal(const mpf_t u)
-{
- mpfr_init2(mpfr_ptr(),(mp_prec_t) mpf_get_prec(u)); // (gmp: mp_bitcnt_t) unsigned long -> long (mpfr: mp_prec_t)
- mpfr_set_f(mpfr_ptr(),u,mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpz_t u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2(mpfr_ptr(), prec);
- mpfr_set_z(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpq_t u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2(mpfr_ptr(), prec);
- mpfr_set_q(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const double u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2(mpfr_ptr(), prec);
-
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
- if(fits_in_bits(u, MPREAL_DOUBLE_BITS_OVERFLOW))
- {
- mpfr_set_d(mpfr_ptr(), u, mode);
- }else
- throw conversion_overflow();
-#else
- mpfr_set_d(mpfr_ptr(), u, mode);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long double u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_ld(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_uj(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_sj(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_ui(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_ui(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_si(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_si(mpfr_ptr(), u, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_str(mpfr_ptr(), s, base, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
- mpfr_init2 (mpfr_ptr(), prec);
- mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode);
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline void mpreal::clear(::mpfr_ptr x)
-{
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
- if(mpfr_is_initialized(x))
-#endif
- mpfr_clear(x);
-}
-
-inline mpreal::~mpreal()
-{
- clear(mpfr_ptr());
-}
-
-// internal namespace needed for template magic
-namespace internal{
-
- // Use SFINAE to restrict instantiation of the arithmetic operators to numeric types only.
- // This is needed for smooth integration with libraries based on expression templates, like Eigen.
- // TODO: Do the same for the boolean operators.
- template <typename ArgumentType> struct result_type {};
-
- template <> struct result_type<mpreal> {typedef mpreal type;};
- template <> struct result_type<mpz_t> {typedef mpreal type;};
- template <> struct result_type<mpq_t> {typedef mpreal type;};
- template <> struct result_type<long double> {typedef mpreal type;};
- template <> struct result_type<double> {typedef mpreal type;};
- template <> struct result_type<unsigned long int> {typedef mpreal type;};
- template <> struct result_type<unsigned int> {typedef mpreal type;};
- template <> struct result_type<long int> {typedef mpreal type;};
- template <> struct result_type<int> {typedef mpreal type;};
- template <> struct result_type<long long> {typedef mpreal type;};
- template <> struct result_type<unsigned long long> {typedef mpreal type;};
-}
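-
- // Effect of the result_type trait (illustrative): the operator templates below
- // participate in overload resolution only for the listed numeric types, since
- // result_type<T>::type exists only for those specializations, e.g.:
- //
- //   mpreal x(1);
- //   x + 2;         // OK: result_type<int>::type is mpreal
- //   // x + "two";  // substitution failure: result_type<const char*> is undefined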
-
-// + Addition
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator+(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) += rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs; }
-
-// - Subtraction
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator-(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) -= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator-(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) -= rhs; }
-
-// * Multiplication
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator*(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) *= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs; }
-
-// / Division
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
- operator/(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) /= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
- operator/(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) /= rhs; }
-
-//////////////////////////////////////////////////////////////////////////
-// sqrt
-const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-// abs
-inline const mpreal abs(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// pow
-const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// Estimate machine epsilon for the given precision
-// Returns smallest eps such that 1.0 + eps != 1.0
-inline mpreal machine_epsilon(mp_prec_t prec = mpreal::get_default_prec());
-
-// Returns smallest eps such that x + eps != x (relative machine epsilon)
-inline mpreal machine_epsilon(const mpreal& x);
-
- // Gives the max & min values for the required precision;
- // minval is 'safe' in that 1 / minval does not overflow,
- // maxval is 'safe' in that 1 / maxval does not underflow.
-inline mpreal minval(mp_prec_t prec = mpreal::get_default_prec());
-inline mpreal maxval(mp_prec_t prec = mpreal::get_default_prec());
-
-// 'Dirty' equality check 1: |a-b| < min{|a|,|b|} * eps
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps);
-
-// 'Dirty' equality check 2: |a-b| < min{|a|,|b|} * eps( min{|a|,|b|} )
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b);
-
- // 'Bitwise' equality check:
- // a and b may differ by at most maxUlps units in the last place (ULPs).
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps);
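-
- // Comparison sketch (illustrative): the fuzzy checks tolerate rounding noise
- // that the exact operator== would reject, e.g.:
- //
- //   mpreal a = mpreal(1)/3, b = 1 - mpreal(2)/3;
- //   bool exact = (a == b);           // may be false due to rounding
- //   bool fuzzy = isEqualFuzzy(a, b); // true within eps(min{|a|,|b|})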
-
-//////////////////////////////////////////////////////////////////////////
-// Convert precision in 'bits' to decimal digits and vice versa.
-// bits = ceil(digits*log[2](10))
-// digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d);
-inline int bits2digits(mp_prec_t b);
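-
- // Example (illustrative): to work with roughly 100 significant decimal digits,
- //
- //   mpreal::set_default_prec(digits2bits(100)); // ceil(100*log2(10)) = 333 bits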
-
-//////////////////////////////////////////////////////////////////////////
-// min, max
-const mpreal (max)(const mpreal& x, const mpreal& y);
-const mpreal (min)(const mpreal& x, const mpreal& y);
-
-//////////////////////////////////////////////////////////////////////////
-// Implementation
-//////////////////////////////////////////////////////////////////////////
-
-//////////////////////////////////////////////////////////////////////////
-// Operators - Assignment
-inline mpreal& mpreal::operator=(const mpreal& v)
-{
- if (this != &v)
- {
- mp_prec_t tp = mpfr_get_prec( mpfr_srcptr());
- mp_prec_t vp = mpfr_get_prec(v.mpfr_srcptr());
-
- if(tp != vp){
- clear(mpfr_ptr());
- mpfr_init2(mpfr_ptr(), vp);
- }
-
- mpfr_set(mpfr_ptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- }
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpf_t v)
-{
- mpfr_set_f(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpz_t v)
-{
- mpfr_set_z(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpq_t v)
-{
- mpfr_set_q(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const long double v)
-{
- mpfr_set_ld(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const double v)
-{
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
- if(fits_in_bits(v, MPREAL_DOUBLE_BITS_OVERFLOW))
- {
- mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
- }else
- throw conversion_overflow();
-#else
- mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long int v)
-{
- mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned int v)
-{
- mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long long int v)
-{
- mpfr_set_uj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const long long int v)
-{
- mpfr_set_sj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const long int v)
-{
- mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const int v)
-{
- mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const char* s)
-{
- // Use the other converters for more precise control over base, precision & rounding:
- //
- // mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
- // mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
- //
- // Here we assume base = 10 and use the precision of the target variable.
-
- mpfr_t t;
-
- mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
- if(0 == mpfr_set_str(t, s, 10, mpreal::get_default_rnd()))
- {
- mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- }
-
- clear(t);
- return *this;
-}
-
-inline mpreal& mpreal::operator=(const std::string& s)
-{
- // Use the other converters for more precise control over base, precision & rounding:
- //
- // mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
- // mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
- //
- // Here we assume base = 10 and use the precision of the target variable.
-
- mpfr_t t;
-
- mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
- if(0 == mpfr_set_str(t, s.c_str(), 10, mpreal::get_default_rnd()))
- {
- mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- }
-
- clear(t);
- return *this;
-}
-
-template <typename real_t>
-inline mpreal& mpreal::operator= (const std::complex<real_t>& z)
-{
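-    // Note: only the real part of z is kept; the imaginary part is discarded.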
- return *this = z.real();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// + Addition
-inline mpreal& mpreal::operator+=(const mpreal& v)
-{
- mpfr_add(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpf_t u)
-{
- *this += mpreal(u);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpz_t u)
-{
- mpfr_add_z(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const mpq_t u)
-{
- mpfr_add_q(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+= (const long double u)
-{
- *this += mpreal(u);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+= (const double u)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_add_d(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
-#else
- *this += mpreal(u);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const unsigned long int u)
-{
- mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const unsigned int u)
-{
- mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const long int u)
-{
- mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const int u)
-{
- mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator+=(const long long int u) { *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator+=(const unsigned long long int u){ *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator-=(const long long int u) { *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator-=(const unsigned long long int u){ *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator*=(const long long int u) { *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator*=(const unsigned long long int u){ *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator/=(const long long int u) { *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-inline mpreal& mpreal::operator/=(const unsigned long long int u){ *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; }
-
-inline const mpreal mpreal::operator+()const { return mpreal(*this); }
-
-inline const mpreal operator+(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
- mpfr_add(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
-
-inline mpreal& mpreal::operator++()
-{
- return *this += 1;
-}
-
-inline const mpreal mpreal::operator++ (int)
-{
- mpreal x(*this);
- *this += 1;
- return x;
-}
-
-inline mpreal& mpreal::operator--()
-{
- return *this -= 1;
-}
-
-inline const mpreal mpreal::operator-- (int)
-{
- mpreal x(*this);
- *this -= 1;
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// - Subtraction
-inline mpreal& mpreal::operator-=(const mpreal& v)
-{
- mpfr_sub(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const mpz_t v)
-{
- mpfr_sub_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const mpq_t v)
-{
- mpfr_sub_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const long double v)
-{
- *this -= mpreal(v);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_sub_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
- *this -= mpreal(v);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const unsigned long int v)
-{
- mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const unsigned int v)
-{
- mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const long int v)
-{
- mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator-=(const int v)
-{
- mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal mpreal::operator-()const
-{
- mpreal u(*this);
- mpfr_neg(u.mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
- return u;
-}
-
-inline const mpreal operator-(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
- mpfr_sub(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
-
-inline const mpreal operator-(const double b, const mpreal& a)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_d_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-#else
- mpreal x(b, mpfr_get_prec(a.mpfr_ptr()));
- x -= a;
- return x;
-#endif
-}
-
-inline const mpreal operator-(const unsigned long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator-(const unsigned int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator-(const long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator-(const int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// * Multiplication
-inline mpreal& mpreal::operator*= (const mpreal& v)
-{
- mpfr_mul(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const mpz_t v)
-{
- mpfr_mul_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const mpq_t v)
-{
- mpfr_mul_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const long double v)
-{
- *this *= mpreal(v);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_mul_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
- *this *= mpreal(v);
-#endif
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const unsigned long int v)
-{
- mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const unsigned int v)
-{
- mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const long int v)
-{
- mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator*=(const int v)
-{
- mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal operator*(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr())));
- mpfr_mul(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// / Division
-inline mpreal& mpreal::operator/=(const mpreal& v)
-{
- mpfr_div(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const mpz_t v)
-{
- mpfr_div_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const mpq_t v)
-{
- mpfr_div_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const long double v)
-{
- *this /= mpreal(v);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const double v)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpfr_div_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
-#else
- *this /= mpreal(v);
-#endif
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const unsigned long int v)
-{
- mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const unsigned int v)
-{
- mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const long int v)
-{
- mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator/=(const int v)
-{
- mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal operator/(const mpreal& a, const mpreal& b)
-{
- mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr())));
- mpfr_div(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
- return c;
-}
-
-inline const mpreal operator/(const unsigned long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const unsigned int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const long int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const int b, const mpreal& a)
-{
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal operator/(const double b, const mpreal& a)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
- mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_d_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
- return x;
-#else
- mpreal x(0, mpfr_get_prec(a.mpfr_ptr()));
- x /= a;
- return x;
-#endif
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Shift operators - Multiplication/Division by a power of 2
-inline mpreal& mpreal::operator<<=(const unsigned long int u)
-{
- mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const unsigned int u)
-{
- mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast<unsigned long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const long int u)
-{
- mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator<<=(const int u)
-{
- mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),static_cast<long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const unsigned long int u)
-{
- mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const unsigned int u)
-{
- mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast<unsigned long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const long int u)
-{
- mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::operator>>=(const int u)
-{
- mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),static_cast<long int>(u),mpreal::get_default_rnd());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline const mpreal operator<<(const mpreal& v, const unsigned long int k)
-{
- return mul_2ui(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const unsigned int k)
-{
- return mul_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator<<(const mpreal& v, const long int k)
-{
- return mul_2si(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const int k)
-{
- return mul_2si(v,static_cast<long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned long int k)
-{
- return div_2ui(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const long int k)
-{
- return div_2si(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned int k)
-{
- return div_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const int k)
-{
- return div_2si(v,static_cast<long int>(k));
-}
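-
- // Shift sketch (illustrative): these operators scale by exact powers of two,
- // they are not bitwise shifts, e.g.:
- //
- //   mpreal x = 3;
- //   x <<= 2;           // x == 12 (3 * 2^2)
- //   mpreal y = x >> 1; // y == 6  (12 / 2^1); x is unchanged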
-
-// mul_2ui
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_mul_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
-
-// mul_2si
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_mul_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
-
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_div_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
-
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
- mpreal x(v);
- mpfr_div_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Relational operators
-
-// WARNING:
-//
-// Please note that the following checks for double NaN are guaranteed to work only in IEEE math mode:
-//
-// isnan(b) = (b != b)
-// isnan(b) = !(b == b) (used in the code below)
-//
-// Be cautious with compiler options that break strict IEEE compliance (e.g. -ffast-math in GCC).
-// Use std::isnan instead (C++11).
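-
-// Illustrative consequence of the rule above:
-//
-//   mpreal a; a.setNan();
-//   bool lt = (a < 0.0); // false: the isnan guard short-circuits the comparison
-//   bool ne = (a != a);  // true: mpfr_equal_p reports NaN as unequal to itself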
-
-inline bool operator > (const mpreal& a, const mpreal& b ){ return (mpfr_greater_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator > (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) > 0 ); }
-
-inline bool operator >= (const mpreal& a, const mpreal& b ){ return (mpfr_greaterequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator >= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); }
-// inline bool operator >= (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) >= 0 ); }
-
-inline bool operator < (const mpreal& a, const mpreal& b ){ return (mpfr_less_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator < (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) < 0 ); }
-
-inline bool operator <= (const mpreal& a, const mpreal& b ){ return (mpfr_lessequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator <= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) <= 0 ); }
-
-inline bool operator == (const mpreal& a, const mpreal& b ){ return (mpfr_equal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator == (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 ); }
-
-inline bool operator != (const mpreal& a, const mpreal& b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const unsigned long int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const unsigned int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const long int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const long double b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const double b ){ return !(a == b); }
-
-inline bool (isnan) (const mpreal& op){ return (mpfr_nan_p (op.mpfr_srcptr()) != 0 ); }
-inline bool (isinf) (const mpreal& op){ return (mpfr_inf_p (op.mpfr_srcptr()) != 0 ); }
-inline bool (isfinite) (const mpreal& op){ return (mpfr_number_p (op.mpfr_srcptr()) != 0 ); }
-inline bool iszero (const mpreal& op){ return (mpfr_zero_p (op.mpfr_srcptr()) != 0 ); }
-inline bool isint (const mpreal& op){ return (mpfr_integer_p(op.mpfr_srcptr()) != 0 ); }
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline bool isregular(const mpreal& op){ return (mpfr_regular_p(op.mpfr_srcptr()));}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Type Converters
-inline bool mpreal::toBool ( ) const { return mpfr_zero_p (mpfr_srcptr()) == 0; }
-inline long mpreal::toLong (mp_rnd_t mode) const { return mpfr_get_si (mpfr_srcptr(), mode); }
-inline unsigned long mpreal::toULong (mp_rnd_t mode) const { return mpfr_get_ui (mpfr_srcptr(), mode); }
-inline float mpreal::toFloat (mp_rnd_t mode) const { return mpfr_get_flt(mpfr_srcptr(), mode); }
-inline double mpreal::toDouble (mp_rnd_t mode) const { return mpfr_get_d (mpfr_srcptr(), mode); }
-inline long double mpreal::toLDouble(mp_rnd_t mode) const { return mpfr_get_ld (mpfr_srcptr(), mode); }
-inline long long mpreal::toLLong (mp_rnd_t mode) const { return mpfr_get_sj (mpfr_srcptr(), mode); }
-inline unsigned long long mpreal::toULLong (mp_rnd_t mode) const { return mpfr_get_uj (mpfr_srcptr(), mode); }
-
-inline ::mpfr_ptr mpreal::mpfr_ptr() { return mp; }
-inline ::mpfr_srcptr mpreal::mpfr_ptr() const { return mp; }
-inline ::mpfr_srcptr mpreal::mpfr_srcptr() const { return mp; }
-
-template <class T>
-inline std::string toString(T t, std::ios_base & (*f)(std::ios_base&))
-{
- std::ostringstream oss;
- oss << f << t;
- return oss.str();
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline std::string mpreal::toString(const std::string& format) const
-{
- char *s = NULL;
- std::string out;
-
- if( !format.empty() )
- {
- if(!(mpfr_asprintf(&s, format.c_str(), mpfr_srcptr()) < 0))
- {
- out = std::string(s);
-
- mpfr_free_str(s);
- }
- }
-
- return out;
-}
-
-#endif
-
-inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
-{
- // TODO: Add extended format specification (f, e, rounding mode) as is done in the output operator
- (void)b;
- (void)mode;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
- std::ostringstream format;
-
- int digits = (n >= 0) ? n : 1 + bits2digits(mpfr_get_prec(mpfr_srcptr()));
-
- format << "%." << digits << "RNg";
-
- return toString(format.str());
-
-#else
-
- char *s, *ns = NULL;
- size_t slen, nslen;
- mp_exp_t exp;
- std::string out;
-
- if(mpfr_inf_p(mp))
- {
- if(mpfr_sgn(mp)>0) return "+Inf";
- else return "-Inf";
- }
-
- if(mpfr_zero_p(mp)) return "0";
- if(mpfr_nan_p(mp)) return "NaN";
-
- s = mpfr_get_str(NULL, &exp, b, 0, mp, mode);
- ns = mpfr_get_str(NULL, &exp, b, (std::max)(0,n), mp, mode);
-
- if(s!=NULL && ns!=NULL)
- {
- slen = strlen(s);
- nslen = strlen(ns);
- if(nslen<=slen)
- {
- mpfr_free_str(s);
- s = ns;
- slen = nslen;
- }
- else {
- mpfr_free_str(ns);
- }
-
- // Use human-friendly formatting where possible
- if (exp>0 && static_cast<size_t>(exp)<slen)
- {
- if(s[0]=='-')
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s+exp) ptr--;
-
- if(ptr==s+exp) out = std::string(s,exp+1);
- else out = std::string(s,exp+1)+'.'+std::string(s+exp+1,ptr-(s+exp+1)+1);
-
- //out = string(s,exp+1)+'.'+string(s+exp+1);
- }
- else
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s+exp-1) ptr--;
-
- if(ptr==s+exp-1) out = std::string(s,exp);
- else out = std::string(s,exp)+'.'+std::string(s+exp,ptr-(s+exp)+1);
-
- //out = string(s,exp)+'.'+string(s+exp);
- }
-
- }else{ // exp<=0 || exp>=slen
- if(s[0]=='-')
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s+1) ptr--;
-
- if(ptr==s+1) out = std::string(s,2);
- else out = std::string(s,2)+'.'+std::string(s+2,ptr-(s+2)+1);
-
- //out = string(s,2)+'.'+string(s+2);
- }
- else
- {
- // Remove zeros starting from right end
- char* ptr = s+slen-1;
- while (*ptr=='0' && ptr>s) ptr--;
-
- if(ptr==s) out = std::string(s,1);
- else out = std::string(s,1)+'.'+std::string(s+1,ptr-(s+1)+1);
-
- //out = string(s,1)+'.'+string(s+1);
- }
-
- // Make final string
- if(--exp)
- {
- if(exp>0) out += "e+"+mpfr::toString<mp_exp_t>(exp,std::dec);
- else out += "e"+mpfr::toString<mp_exp_t>(exp,std::dec);
- }
- }
-
- mpfr_free_str(s);
- return out;
- }else{
- return "conversion error!";
- }
-#endif
-}
-
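Taken together, the two overloads above reduce every conversion to an mpfr_asprintf format string on modern MPFR builds. A usage sketch (assuming the default arguments declared in the class body, i.e. full precision and base 10):

    mpfr::mpreal pi = mpfr::const_pi();
    std::string s10 = pi.toString(10);  // 10 significant digits via "%.10RNg"
    std::string all = pi.toString();    // enough digits for the full precision of pi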
-
-//////////////////////////////////////////////////////////////////////////
-// I/O
-inline std::ostream& mpreal::output(std::ostream& os) const
-{
- std::ostringstream format;
- const std::ios::fmtflags flags = os.flags();
-
- format << ((flags & std::ios::showpos) ? "%+" : "%");
- if (os.precision() >= 0)
- format << '.' << os.precision() << "R*"
- << ((flags & std::ios::floatfield) == std::ios::fixed ? 'f' :
- (flags & std::ios::floatfield) == std::ios::scientific ? 'e' :
- 'g');
- else
- format << "R*e";
-
- char *s = NULL;
- if(!(mpfr_asprintf(&s, format.str().c_str(),
- mpfr::mpreal::get_default_rnd(),
- mpfr_srcptr())
- < 0))
- {
- os << std::string(s);
- mpfr_free_str(s);
- }
- return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const mpreal& v)
-{
- return v.output(os);
-}
-
-inline std::istream& operator>>(std::istream &is, mpreal& v)
-{
- // TODO: use std::hexfloat and other stream flags to set up the base
- std::string tmp;
- is >> tmp;
- mpfr_set_str(v.mpfr_ptr(), tmp.c_str(), 10, mpreal::get_default_rnd());
- return is;
-}
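Round-tripping through the stream operators above exercises both paths: output goes through mpfr_asprintf with the stream's precision and float-field flags, and input is parsed back with mpfr_set_str in base 10. A minimal sketch (assuming <sstream> and <iomanip> are included):

    std::ostringstream os;
    os << std::setprecision(50) << mpfr::const_pi();
    mpfr::mpreal x;
    std::istringstream is(os.str());
    is >> x;  // x now holds pi to ~50 digits, subject to the current default precision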
-
-//////////////////////////////////////////////////////////////////////////
-// Bits - decimal digits relation
-// bits = ceil(digits*log[2](10))
-// digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d)
-{
- const double LOG2_10 = 3.3219280948873624;
-
- return mp_prec_t(std::ceil( d * LOG2_10 ));
-}
-
-inline int bits2digits(mp_prec_t b)
-{
- const double LOG10_2 = 0.30102999566398119;
-
- return int(std::floor( b * LOG10_2 ));
-}
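A quick numeric check of the two identities above (a fragment meant to run inside any function body, with <cassert> included):

    // 53 bits, an IEEE double mantissa, carry floor(53 * log10(2)) = 15 digits.
    assert(mpfr::bits2digits(53) == 15);
    // 77 decimal digits require ceil(77 * log2(10)) = 256 bits.
    assert(mpfr::digits2bits(77) == 256);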
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get number properties
-inline int sgn(const mpreal& op)
-{
- return mpfr_sgn(op.mpfr_srcptr());
-}
-
-inline mpreal& mpreal::setSign(int sign, mp_rnd_t RoundingMode)
-{
- mpfr_setsign(mpfr_ptr(), mpfr_srcptr(), (sign < 0 ? 1 : 0), RoundingMode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline int mpreal::getPrecision() const
-{
- return int(mpfr_get_prec(mpfr_srcptr()));
-}
-
-inline mpreal& mpreal::setPrecision(int Precision, mp_rnd_t RoundingMode)
-{
- mpfr_prec_round(mpfr_ptr(), Precision, RoundingMode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::setInf(int sign)
-{
- mpfr_set_inf(mpfr_ptr(), sign);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::setNan()
-{
- mpfr_set_nan(mpfr_ptr());
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mpreal& mpreal::setZero(int sign)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- mpfr_set_zero(mpfr_ptr(), sign);
-#else
- mpfr_set_si(mpfr_ptr(), 0, (mpfr_get_default_rounding_mode)());
- setSign(sign);
-#endif
-
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return *this;
-}
-
-inline mp_prec_t mpreal::get_prec() const
-{
- return mpfr_get_prec(mpfr_srcptr());
-}
-
-inline void mpreal::set_prec(mp_prec_t prec, mp_rnd_t rnd_mode)
-{
- mpfr_prec_round(mpfr_ptr(),prec,rnd_mode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mp_exp_t mpreal::get_exp ()
-{
- return mpfr_get_exp(mpfr_srcptr());
-}
-
-inline int mpreal::set_exp (mp_exp_t e)
-{
- int x = mpfr_set_exp(mpfr_ptr(), e);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return x;
-}
-
-inline const mpreal frexp(const mpreal& x, mp_exp_t* exp, mp_rnd_t mode = mpreal::get_default_rnd())
-{
- mpreal y(x);
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
- mpfr_frexp(exp,y.mpfr_ptr(),x.mpfr_srcptr(),mode);
-#else
- *exp = mpfr_get_exp(y.mpfr_srcptr());
- mpfr_set_exp(y.mpfr_ptr(),0);
-#endif
- return y;
-}
-
-inline const mpreal ldexp(const mpreal& v, mp_exp_t exp)
-{
- mpreal x(v);
-
- // rounding is not important since we are only adjusting the exponent (an exact operation)
- mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd());
- return x;
-}
-
-inline const mpreal scalbn(const mpreal& v, mp_exp_t exp)
-{
- return ldexp(v, exp);
-}
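Since frexp and ldexp above only touch the exponent field, they are exact inverses of each other. A small illustration:

    mp_exp_t e;
    mpfr::mpreal x = 6.0;
    mpfr::mpreal m = mpfr::frexp(x, &e);  // m = 0.75, e = 3, so x = m * 2^e
    mpfr::mpreal y = mpfr::ldexp(m, e);   // reconstructs 6.0 exactly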
-
-inline mpreal machine_epsilon(mp_prec_t prec)
-{
- /* the smallest eps such that 1 + eps != 1 */
- return machine_epsilon(mpreal(1, prec));
-}
-
-inline mpreal machine_epsilon(const mpreal& x)
-{
- /* the smallest eps such that x + eps != x */
- if( x < 0)
- {
- return nextabove(-x) + x;
- }else{
- return nextabove( x) - x;
- }
-}
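With the definition above, the result adapts to the precision of the argument. At 53 bits it reproduces the IEEE double epsilon (a minimal sketch):

    mpfr::mpreal::set_default_prec(53);
    mpfr::mpreal eps = mpfr::machine_epsilon(mpfr::mpreal(1));
    // eps == 2^-52, the gap between 1 and nextabove(1)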
-
-// minval is 'safe' meaning 1 / minval does not overflow
-inline mpreal minval(mp_prec_t prec)
-{
- /* min = 1/2 * 2^emin = 2^(emin - 1) */
- return mpreal(1, prec) << mpreal::get_emin()-1;
-}
-
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal maxval(mp_prec_t prec)
-{
- /* max = (1 - eps) * 2^emax, eps is machine epsilon */
- return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax();
-}
-
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps)
-{
- return abs(a - b) <= machine_epsilon((max)(abs(a), abs(b))) * maxUlps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps)
-{
- return abs(a - b) <= eps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b)
-{
- return isEqualFuzzy(a, b, machine_epsilon((max)(1, (min)(abs(a), abs(b)))));
-}
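Usage sketch: two results of mathematically equivalent computations can be compared up to a few ulps at the scale of the larger operand (functions as defined above):

    mpfr::mpreal a = mpfr::sqrt(mpfr::mpreal(2));
    mpfr::mpreal b = mpfr::mpreal(2) / mpfr::sqrt(mpfr::mpreal(2));
    bool close = mpfr::isEqualUlps(a, b, 4);  // |a - b| <= 4 * eps(max(|a|, |b|))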
-
-//////////////////////////////////////////////////////////////////////////
-// C++11 sign functions.
-inline mpreal copysign(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal rop(0, mpfr_get_prec(x.mpfr_ptr()));
- mpfr_setsign(rop.mpfr_ptr(), x.mpfr_srcptr(), mpfr_signbit(y.mpfr_srcptr()), rnd_mode);
- return rop;
-}
-
-inline bool signbit(const mpreal& x)
-{
- return mpfr_signbit(x.mpfr_srcptr());
-}
-
-inline const mpreal modf(const mpreal& v, mpreal& n)
-{
- mpreal f(v);
-
- // rounding is not important since f has the same precision as v (exact operation)
- mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd());
- mpfr_trunc(n.mpfr_ptr(),v.mpfr_srcptr());
- return f;
-}
-
-inline int mpreal::check_range (int t, mp_rnd_t rnd_mode)
-{
- return mpfr_check_range(mpfr_ptr(),t,rnd_mode);
-}
-
-inline int mpreal::subnormalize (int t,mp_rnd_t rnd_mode)
-{
- int r = mpfr_subnormalize(mpfr_ptr(),t,rnd_mode);
- MPREAL_MSVC_DEBUGVIEW_CODE;
- return r;
-}
-
-inline mp_exp_t mpreal::get_emin (void)
-{
- return mpfr_get_emin();
-}
-
-inline int mpreal::set_emin (mp_exp_t exp)
-{
- return mpfr_set_emin(exp);
-}
-
-inline mp_exp_t mpreal::get_emax (void)
-{
- return mpfr_get_emax();
-}
-
-inline int mpreal::set_emax (mp_exp_t exp)
-{
- return mpfr_set_emax(exp);
-}
-
-inline mp_exp_t mpreal::get_emin_min (void)
-{
- return mpfr_get_emin_min();
-}
-
-inline mp_exp_t mpreal::get_emin_max (void)
-{
- return mpfr_get_emin_max();
-}
-
-inline mp_exp_t mpreal::get_emax_min (void)
-{
- return mpfr_get_emax_min();
-}
-
-inline mp_exp_t mpreal::get_emax_max (void)
-{
- return mpfr_get_emax_max();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Mathematical Functions
-//////////////////////////////////////////////////////////////////////////
-#define MPREAL_UNARY_MATH_FUNCTION_BODY(f) \
- mpreal y(0, mpfr_get_prec(x.mpfr_srcptr())); \
- mpfr_##f(y.mpfr_ptr(), x.mpfr_srcptr(), r); \
- return y;
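For reference, each one-line wrapper below is the literal expansion of this macro; e.g. MPREAL_UNARY_MATH_FUNCTION_BODY(sqr) expands to:

    mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));  // result inherits the precision of x
    mpfr_sqr(y.mpfr_ptr(), x.mpfr_srcptr(), r);   // compute into y with rounding mode r
    return y;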
-
-inline const mpreal sqr (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{ MPREAL_UNARY_MATH_FUNCTION_BODY(sqr ); }
-
-inline const mpreal sqrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{ MPREAL_UNARY_MATH_FUNCTION_BODY(sqrt); }
-
-inline const mpreal sqrt(const unsigned long int x, mp_rnd_t r)
-{
- mpreal y;
- mpfr_sqrt_ui(y.mpfr_ptr(), x, r);
- return y;
-}
-
-inline const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode)
-{
- return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-}
-
-inline const mpreal sqrt(const long int v, mp_rnd_t rnd_mode)
-{
- if (v>=0) return sqrt(static_cast<unsigned long int>(v),rnd_mode);
- else return mpreal().setNan(); // NaN
-}
-
-inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode)
-{
- if (v>=0) return sqrt(static_cast<unsigned long int>(v),rnd_mode);
- else return mpreal().setNan(); // NaN
-}
-
-inline const mpreal root(const mpreal& x, unsigned long int k, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));
- mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r);
- return y;
-}
-
-inline const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, mpfr_get_prec(a.mpfr_srcptr()));
- mpfr_dim(y.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), r);
- return y;
-}
-
-inline int cmpabs(const mpreal& a,const mpreal& b)
-{
- return mpfr_cmpabs(a.mpfr_ptr(), b.mpfr_srcptr());
-}
-
-inline int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- return mpfr_sin_cos(s.mpfr_ptr(), c.mpfr_ptr(), v.mpfr_srcptr(), rnd_mode);
-}
-
-inline const mpreal sqrt (const long double v, mp_rnd_t rnd_mode) { return sqrt(mpreal(v),rnd_mode); }
-inline const mpreal sqrt (const double v, mp_rnd_t rnd_mode) { return sqrt(mpreal(v),rnd_mode); }
-
-inline const mpreal cbrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cbrt ); }
-inline const mpreal fabs (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(abs ); }
-inline const mpreal abs (const mpreal& x, mp_rnd_t r) { MPREAL_UNARY_MATH_FUNCTION_BODY(abs ); }
-inline const mpreal log (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log ); }
-inline const mpreal log2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log2 ); }
-inline const mpreal log10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log10); }
-inline const mpreal exp (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp ); }
-inline const mpreal exp2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp2 ); }
-inline const mpreal exp10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp10); }
-inline const mpreal cos (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cos ); }
-inline const mpreal sin (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sin ); }
-inline const mpreal tan (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(tan ); }
-inline const mpreal sec (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sec ); }
-inline const mpreal csc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(csc ); }
-inline const mpreal cot (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cot ); }
-inline const mpreal acos (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(acos ); }
-inline const mpreal asin (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(asin ); }
-inline const mpreal atan (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(atan ); }
-
-inline const mpreal logb (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { return log2 (abs(x),r); }
-
-inline const mpreal acot (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return atan (1/v, r); }
-inline const mpreal asec (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return acos (1/v, r); }
-inline const mpreal acsc (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return asin (1/v, r); }
-inline const mpreal acoth (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return atanh(1/v, r); }
-inline const mpreal asech (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return acosh(1/v, r); }
-inline const mpreal acsch (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return asinh(1/v, r); }
-
-inline const mpreal cosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cosh ); }
-inline const mpreal sinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sinh ); }
-inline const mpreal tanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(tanh ); }
-inline const mpreal sech (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sech ); }
-inline const mpreal csch (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(csch ); }
-inline const mpreal coth (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(coth ); }
-inline const mpreal acosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(acosh); }
-inline const mpreal asinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(asinh); }
-inline const mpreal atanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(atanh); }
-
-inline const mpreal log1p (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log1p ); }
-inline const mpreal expm1 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(expm1 ); }
-inline const mpreal eint (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(eint ); }
-inline const mpreal gamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(gamma ); }
-inline const mpreal tgamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(gamma ); }
-inline const mpreal lngamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(lngamma); }
-inline const mpreal zeta (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(zeta ); }
-inline const mpreal erf (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(erf ); }
-inline const mpreal erfc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(erfc ); }
-inline const mpreal besselj0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(j0 ); }
-inline const mpreal besselj1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(j1 ); }
-inline const mpreal bessely0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(y0 ); }
-inline const mpreal bessely1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(y1 ); }
-
-inline const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_atan2(a.mpfr_ptr(), y.mpfr_srcptr(), x.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_hypot(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal remainder (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_remainder(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
- mpfr_remquo(a.mpfr_ptr(),q, x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
- return a;
-}
-
-inline const mpreal fac_ui (unsigned long int v, mp_prec_t prec = mpreal::get_default_prec(),
- mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(0, prec);
- mpfr_fac_ui(x.mpfr_ptr(),v,rnd_mode);
- return x;
-}
-
-
-inline const mpreal lgamma (const mpreal& v, int *signp = 0, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(v);
- int tsignp;
-
- if(signp) mpfr_lgamma(x.mpfr_ptr(), signp,v.mpfr_srcptr(),rnd_mode);
- else mpfr_lgamma(x.mpfr_ptr(),&tsignp,v.mpfr_srcptr(),rnd_mode);
-
- return x;
-}
-
-
-inline const mpreal besseljn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, x.getPrecision());
- mpfr_jn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
- return y;
-}
-
-inline const mpreal besselyn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal y(0, x.getPrecision());
- mpfr_yn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
- return y;
-}
-
-inline const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t p1, p2, p3;
-
- p1 = v1.get_prec();
- p2 = v2.get_prec();
- p3 = v3.get_prec();
-
- a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
- mpfr_fma(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t p1, p2, p3;
-
- p1 = v1.get_prec();
- p2 = v2.get_prec();
- p3 = v3.get_prec();
-
- a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
- mpfr_fms(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t p1, p2;
-
- p1 = v1.get_prec();
- p2 = v2.get_prec();
-
- a.set_prec(p1>p2?p1:p2);
-
- mpfr_agm(a.mp, v1.mp, v2.mp, rnd_mode);
-
- return a;
-}
-
-inline const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t mode = mpreal::get_default_rnd())
-{
- mpfr_srcptr *p = new mpfr_srcptr[n];
-
- for (unsigned long int i = 0; i < n; i++)
- p[i] = tab[i].mpfr_srcptr();
-
- mpreal x;
- status = mpfr_sum(x.mpfr_ptr(), (mpfr_ptr*)p, n, mode);
-
- delete [] p;
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline int sinh_cosh(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- return mpfr_sinh_cosh(s.mp,c.mp,v.mp,rnd_mode);
-}
-
-inline const mpreal li2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
- MPREAL_UNARY_MATH_FUNCTION_BODY(li2);
-}
-
-inline const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- /* R = rem(X,Y) if Y != 0, returns X - n * Y where n = trunc(X/Y). */
- return fmod(x, y, rnd_mode);
-}
-
-inline const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- (void)rnd_mode;
-
- /*
-
- m = mod(x,y) if y != 0, returns x - n*y where n = floor(x/y)
-
- The following are true by convention:
- - mod(x,0) is x
- - mod(x,x) is 0
- - mod(x,y) for x != y and y != 0 has the same sign as y.
-
- */
-
- if(iszero(y)) return x;
- if(x == y) return 0;
-
- mpreal m = x - floor(x / y) * y;
-
- m.setSign(sgn(y)); // make sure the result has the same sign as y
-
- return m;
-}
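A few concrete values for the conventions listed above, contrasted with rem (a sketch; all arguments use the default precision):

    mpfr::mod(mpfr::mpreal( 7), mpfr::mpreal( 3));  // ->  1
    mpfr::mod(mpfr::mpreal(-7), mpfr::mpreal( 3));  // ->  2   (-7 - floor(-7/3)*3 = -7 + 9)
    mpfr::mod(mpfr::mpreal( 7), mpfr::mpreal(-3));  // -> -2   (sign follows y)
    mpfr::rem(mpfr::mpreal(-7), mpfr::mpreal( 3));  // -> -1   (-7 - trunc(-7/3)*3 = -7 + 6)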
-
-inline const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mp_prec_t yp, xp;
-
- yp = y.get_prec();
- xp = x.get_prec();
-
- a.set_prec(yp>xp?yp:xp);
-
- mpfr_fmod(a.mp, x.mp, y.mp, rnd_mode);
-
- return a;
-}
-
-inline const mpreal rec_sqrt(const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(v);
- mpfr_rec_sqrt(x.mp,v.mp,rnd_mode);
- return x;
-}
-#endif // MPFR 2.4.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 3.0.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal digamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(digamma); }
-inline const mpreal ai (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(ai); }
-#endif // MPFR 3.0.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// Constants
-inline const mpreal const_log2 (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_log2(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_pi (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_pi(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_euler (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_euler(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_catalan (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
- mpreal x(0, p);
- mpfr_const_catalan(x.mpfr_ptr(), r);
- return x;
-}
-
-inline const mpreal const_infinity (int sign = 1, mp_prec_t p = mpreal::get_default_prec())
-{
- mpreal x(0, p);
- mpfr_set_inf(x.mpfr_ptr(), sign);
- return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Integer Related Functions
-inline const mpreal ceil(const mpreal& v)
-{
- mpreal x(v);
- mpfr_ceil(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal floor(const mpreal& v)
-{
- mpreal x(v);
- mpfr_floor(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal round(const mpreal& v)
-{
- mpreal x(v);
- mpfr_round(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal trunc(const mpreal& v)
-{
- mpreal x(v);
- mpfr_trunc(x.mp,v.mp);
- return x;
-}
-
-inline const mpreal rint (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint ); }
-inline const mpreal rint_ceil (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_ceil ); }
-inline const mpreal rint_floor (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_floor); }
-inline const mpreal rint_round (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_round); }
-inline const mpreal rint_trunc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_trunc); }
-inline const mpreal frac (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(frac ); }
-
-//////////////////////////////////////////////////////////////////////////
-// Miscellaneous Functions
-inline void swap (mpreal& a, mpreal& b) { mpfr_swap(a.mp,b.mp); }
-inline const mpreal (max)(const mpreal& x, const mpreal& y){ return (x>y?x:y); }
-inline const mpreal (min)(const mpreal& x, const mpreal& y){ return (x<y?x:y); }
-
-inline const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mpfr_max(a.mp,x.mp,y.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal a;
- mpfr_min(a.mp,x.mp,y.mp,rnd_mode);
- return a;
-}
-
-inline const mpreal nexttoward (const mpreal& x, const mpreal& y)
-{
- mpreal a(x);
- mpfr_nexttoward(a.mp,y.mp);
- return a;
-}
-
-inline const mpreal nextabove (const mpreal& x)
-{
- mpreal a(x);
- mpfr_nextabove(a.mp);
- return a;
-}
-
-inline const mpreal nextbelow (const mpreal& x)
-{
- mpreal a(x);
- mpfr_nextbelow(a.mp);
- return a;
-}
-
-inline const mpreal urandomb (gmp_randstate_t& state)
-{
- mpreal x;
- mpfr_urandomb(x.mpfr_ptr(),state);
- return x;
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x;
- mpfr_urandom(x.mpfr_ptr(), state, rnd_mode);
- return x;
-}
-#endif
-
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
-inline const mpreal random2 (mp_size_t size, mp_exp_t exp)
-{
- mpreal x;
- mpfr_random2(x.mpfr_ptr(),size,exp);
- return x;
-}
-#endif
-
-// Uniformly distributed random number generation
-// a = random(seed); <- seeds the generator and returns the first random number
-// a = random();     <- returns the next random numbers
-// seed != 0
-inline const mpreal random(unsigned int seed = 0)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
- static gmp_randstate_t state;
- static bool initialize = true;
-
- if(initialize)
- {
- gmp_randinit_default(state);
- gmp_randseed_ui(state,0);
- initialize = false;
- }
-
- if(seed != 0) gmp_randseed_ui(state,seed);
-
- return mpfr::urandom(state);
-#else
- if(seed != 0) std::srand(seed);
- return mpfr::mpreal(std::rand()/(double)RAND_MAX);
-#endif
-
-}
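The seeding contract described above, in code form (a minimal sketch):

    mpfr::mpreal a = mpfr::random(42);  // seeds the shared state, returns the first sample in [0, 1)
    mpfr::mpreal b = mpfr::random();    // later calls reuse the same state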
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-
-inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x;
- mpfr_grandom(x.mpfr_ptr(), NULL, state, rnd_mode);
- return x;
-}
-
-inline const mpreal grandom(unsigned int seed = 0)
-{
- static gmp_randstate_t state;
- static bool initialize = true;
-
- if(initialize)
- {
- gmp_randinit_default(state);
- gmp_randseed_ui(state,0);
- initialize = false;
- }
-
- if(seed != 0) gmp_randseed_ui(state,seed);
-
- return mpfr::grandom(state);
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get global properties
-inline void mpreal::set_default_prec(mp_prec_t prec)
-{
- mpfr_set_default_prec(prec);
-}
-
-inline void mpreal::set_default_rnd(mp_rnd_t rnd_mode)
-{
- mpfr_set_default_rounding_mode(rnd_mode);
-}
-
-inline bool mpreal::fits_in_bits(double x, int n)
-{
- int i;
- double t;
- return IsInf(x) || (std::modf ( std::ldexp ( std::frexp ( x, &i ), n ), &t ) == 0.0);
-}
-
-inline const mpreal pow(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow(x.mp,x.mp,b.mp,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow_z(x.mp,x.mp,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow_ui(x.mp,x.mp,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(a,static_cast<unsigned long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_pow_si(x.mp,x.mp,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode)
-{
- return pow(a,static_cast<long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
- mpreal x(a);
- mpfr_ui_pow(x.mp,a,b.mp,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),b,rnd_mode);
- else return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),b,rnd_mode);
- else return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode);
-}
-
-// pow unsigned long int
-inline const mpreal pow(const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- mpreal x(a);
- mpfr_ui_pow_ui(x.mp,a,b,rnd_mode);
- return x;
-}
-
-inline const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow unsigned int
-inline const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode)
-{
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow long int
-inline const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow int
-inline const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode)
-{
- if (a>0)
- {
- if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
- else return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- }else{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
- }
-}
-
-inline const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode)
-{
- if (a>=0) return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
- else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow long double
-inline const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode)
-{
- return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
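The net effect of this overload family is that each call is routed to the most specific MPFR primitive available, falling back to mpfr_pow only when a sign or type forces it. For instance (a sketch using the overloads above):

    mp_rnd_t r = mpfr::mpreal::get_default_rnd();
    mpfr::pow(2ul, 10ul, r);               // mpfr_ui_pow_ui
    mpfr::pow(mpfr::mpreal(2), -3L, r);    // mpfr_pow_si
    mpfr::pow(-2L, mpfr::mpreal(0.5), r);  // negative base: falls back to mpfr_pow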
-} // End of mpfr namespace
-
-// Explicit specialization of std::swap for mpreal numbers
-// Thus standard algorithms will use the efficient version of swap (via argument-dependent "Koenig" lookup)
-// Non-throwing swap C++ idiom: http://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Non-throwing_swap
-namespace std
-{
- // we are allowed to extend namespace std with specializations only
- template <>
- inline void swap(mpfr::mpreal& x, mpfr::mpreal& y)
- {
- return mpfr::swap(x, y);
- }
-
- template<>
- class numeric_limits<mpfr::mpreal>
- {
- public:
- static const bool is_specialized = true;
- static const bool is_signed = true;
- static const bool is_integer = false;
- static const bool is_exact = false;
- static const int radix = 2;
-
- static const bool has_infinity = true;
- static const bool has_quiet_NaN = true;
- static const bool has_signaling_NaN = true;
-
- static const bool is_iec559 = true; // = IEEE 754
- static const bool is_bounded = true;
- static const bool is_modulo = false;
- static const bool traps = true;
- static const bool tinyness_before = true;
-
- static const float_denorm_style has_denorm = denorm_absent;
-
- inline static mpfr::mpreal (min) (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::minval(precision); }
- inline static mpfr::mpreal (max) (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(precision); }
- inline static mpfr::mpreal lowest (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(precision); }
-
- // Returns smallest eps such that 1 + eps != 1 (classic machine epsilon)
- inline static mpfr::mpreal epsilon(mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(precision); }
-
- // Returns smallest eps such that x + eps != x (relative machine epsilon)
- inline static mpfr::mpreal epsilon(const mpfr::mpreal& x) { return mpfr::machine_epsilon(x); }
-
- inline static mpfr::mpreal round_error(mp_prec_t precision = mpfr::mpreal::get_default_prec())
- {
- mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
- if(r == GMP_RNDN) return mpfr::mpreal(0.5, precision);
- else return mpfr::mpreal(1.0, precision);
- }
-
- inline static const mpfr::mpreal infinity() { return mpfr::const_infinity(); }
- inline static const mpfr::mpreal quiet_NaN() { return mpfr::mpreal().setNan(); }
- inline static const mpfr::mpreal signaling_NaN() { return mpfr::mpreal().setNan(); }
- inline static const mpfr::mpreal denorm_min() { return (min)(); }
-
- // Note that the exponent range is not fixed in MPFR
- static const int min_exponent = MPFR_EMIN_DEFAULT;
- static const int max_exponent = MPFR_EMAX_DEFAULT;
- MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811);
- MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811);
-
-#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
-
- // The following members should be constants according to the standard, but they can
- // vary at run time in MPFR, so we define them as functions here.
- //
- // This is the preferable way to specialize std::numeric_limits<mpfr::mpreal>,
- // but it is incompatible with the standard interface and might not work with other
- // libraries, e.g. Boost. See below for a compatible implementation.
- inline static float_round_style round_style()
- {
- mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
- switch (r)
- {
- case GMP_RNDN: return round_to_nearest;
- case GMP_RNDZ: return round_toward_zero;
- case GMP_RNDU: return round_toward_infinity;
- case GMP_RNDD: return round_toward_neg_infinity;
- default: return round_indeterminate;
- }
- }
-
- inline static int digits() { return int(mpfr::mpreal::get_default_prec()); }
- inline static int digits(const mpfr::mpreal& x) { return x.getPrecision(); }
-
- inline static int digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
- {
- return mpfr::bits2digits(precision);
- }
-
- inline static int digits10(const mpfr::mpreal& x)
- {
- return mpfr::bits2digits(x.getPrecision());
- }
-
- inline static int max_digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
- {
- return digits10(precision);
- }
-#else
- // Digits and round_style are NOT constants when it comes to mpreal.
- // If possible, please use functions digits() and round_style() defined above.
- //
- // These default values are preserved for compatibility with existing libraries, e.g. Boost.
- // Adjust them to suit your application.
- //
- // For example, if you use 256 bits of precision uniformly in your program, then:
- // digits = 256
- // digits10 = 77
- // max_digits10 = 78
- //
- // Approximate formula for decimal digits is: digits10 = floor(log10(2) * digits). See bits2digits() for more details.
-
- static const std::float_round_style round_style = round_to_nearest;
- static const int digits = 53;
- static const int digits10 = 15;
- static const int max_digits10 = 16;
-#endif
- };
-
-}
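With MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS defined, the specialization above exposes precision-aware queries as functions (a minimal sketch):

    mpfr::mpreal::set_default_prec(256);
    int d10 = std::numeric_limits<mpfr::mpreal>::digits10();  // 77 for 256 bits
    mpfr::mpreal eps = std::numeric_limits<mpfr::mpreal>::epsilon();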
-
-#endif /* __MPREAL_H__ */
diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp index 685e7ea45..10beb0714 100644 --- a/unsupported/test/mpreal_support.cpp +++ b/unsupported/test/mpreal_support.cpp @@ -1,3 +1,4 @@ +#include <mpreal.h> // Must be included before main.h. #include "main.h" #include <Eigen/MPRealSupport> #include <Eigen/LU> @@ -7,7 +8,7 @@ using namespace mpfr; using namespace Eigen; -void test_mpreal_support() +EIGEN_DECLARE_TEST(mpreal_support) { // set precision to 256 bits (double has only 53 bits) mpreal::set_default_prec(256); diff --git a/unsupported/test/openglsupport.cpp b/unsupported/test/openglsupport.cpp index 706a816f7..1c4438134 100644 --- a/unsupported/test/openglsupport.cpp +++ b/unsupported/test/openglsupport.cpp @@ -9,15 +9,24 @@ #include <main.h> #include <iostream> +#include <string> + +#if defined(__APPLE_CC__) + // Prevent deprecation warnings caused by GLEW on MacOS. + #define GL_SILENCE_DEPRECATION 1 +#endif #include <GL/glew.h> #include <Eigen/OpenGLSupport> -#include <GL/glut.h> -using namespace Eigen; - - +#if defined(__APPLE_CC__) + #include <GLUT/glut.h> +#else + #include <GL/freeglut.h> +#endif +using namespace Eigen; #define VERIFY_MATRIX(CODE,REF) { \ + glMatrixMode(GL_MODELVIEW); \ glLoadIdentity(); \ CODE; \ Matrix<float,4,4,ColMajor> m; m.setZero(); \ @@ -40,7 +49,7 @@ using namespace Eigen; } \ VERIFY_IS_APPROX(value, data); \ } - + #define VERIFY_UNIFORMi(NAME,TYPE) { \ TYPE value = TYPE::Random().eval().cast<float>().cast<TYPE::Scalar>(); \ TYPE data; \ @@ -53,175 +62,324 @@ using namespace Eigen; } \ VERIFY_IS_APPROX(value, data); \ } - -void printInfoLog(GLuint objectID) + +void printProgramInfoLog(GLuint objectID) { int infologLength, charsWritten; GLchar *infoLog; - glGetProgramiv(objectID,GL_INFO_LOG_LENGTH, &infologLength); + glGetProgramiv(objectID, GL_INFO_LOG_LENGTH, &infologLength); if(infologLength > 0) { infoLog = new GLchar[infologLength]; glGetProgramInfoLog(objectID, infologLength, &charsWritten, infoLog); - if (charsWritten>0) + if (charsWritten > 0) + std::cerr << "Program info : \n" << infoLog << std::endl; + delete[] infoLog; + } +} + +void printShaderInfoLog(GLuint objectID) +{ + int infologLength, charsWritten; + GLchar *infoLog; + glGetShaderiv(objectID, GL_INFO_LOG_LENGTH, &infologLength); + if(infologLength > 0) + { + infoLog = new GLchar[infologLength]; + glGetShaderInfoLog(objectID, infologLength, &charsWritten, infoLog); + if (charsWritten > 0) std::cerr << "Shader info : \n" << infoLog << std::endl; delete[] infoLog; } } -GLint createShader(const char* vtx, const char* frg) +GLint createProgram(const char* vtx, const char* frg, bool print_errors = true) { GLint prg_id = glCreateProgram(); GLint vtx_id = glCreateShader(GL_VERTEX_SHADER); GLint frg_id = glCreateShader(GL_FRAGMENT_SHADER); GLint ok; - + glShaderSource(vtx_id, 1, &vtx, 0); glCompileShader(vtx_id); - glGetShaderiv(vtx_id,GL_COMPILE_STATUS,&ok); + glGetShaderiv(vtx_id, GL_COMPILE_STATUS, &ok); if(!ok) { - std::cerr << "vtx compilation failed\n"; + if (print_errors) + { + std::cerr << "vtx compilation failed\n"; + std::cerr << "Source:\n" << vtx << "\n"; + printShaderInfoLog(vtx_id); + } + glDeleteShader(vtx_id); + return GL_ZERO; } - + glShaderSource(frg_id, 1, &frg, 0); glCompileShader(frg_id); - glGetShaderiv(frg_id,GL_COMPILE_STATUS,&ok); + glGetShaderiv(frg_id, GL_COMPILE_STATUS, &ok); if(!ok) { - std::cerr << "frg compilation failed\n"; + if (print_errors) + { + std::cerr << "frg compilation failed.\n"; + std::cerr << "Source:\n" << frg << 
"\n"; + printShaderInfoLog(frg_id); + } + glDeleteShader(vtx_id); + glDeleteShader(frg_id); + return GL_ZERO; } - + glAttachShader(prg_id, vtx_id); glAttachShader(prg_id, frg_id); glLinkProgram(prg_id); - glGetProgramiv(prg_id,GL_LINK_STATUS,&ok); + + // Delete shaders once linked. + glDeleteShader(vtx_id); + glDeleteShader(frg_id); + glGetProgramiv(prg_id, GL_LINK_STATUS, &ok); if(!ok) { - std::cerr << "linking failed\n"; + if (print_errors) + { + std::cerr << "linking failed.\n"; + printProgramInfoLog(prg_id); + } + glDeleteProgram(prg_id); + return GL_ZERO; } - printInfoLog(prg_id); - + glUseProgram(prg_id); return prg_id; } -void test_openglsupport() +GLint createProgram(const std::string& vtx, const std::string& frg, bool print_errors = true) { - int argc = 0; - glutInit(&argc, 0); - glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH); - glutInitWindowPosition (0,0); - glutInitWindowSize(10, 10); + return createProgram(vtx.c_str(), frg.c_str(), print_errors); +} - if(glutCreateWindow("Eigen") <= 0) +std::string getGlslVersionString(int gl_major_version, int gl_minor_version) +{ + switch (gl_major_version) { - std::cerr << "Error: Unable to create GLUT Window.\n"; - exit(1); + case 2: + switch (gl_minor_version) + { + case 0: + return "#version 110"; + case 1: + return "#version 120"; + } + break; + case 3: + switch (gl_minor_version) + { + case 0: + return "#version 130"; + case 1: + return "#version 140"; + case 2: + return "#version 150"; + case 3: + return "#version 330"; + } + break; + case 4: + switch (gl_minor_version) + { + case 0: + return "#version 400"; + case 1: + return "#version 410"; + case 2: + return "#version 420"; + case 3: + return "#version 430"; + case 4: + return "#version 440"; + case 5: + return "#version 450"; + case 6: + return "#version 460"; + } + break; } - - glewExperimental = GL_TRUE; - if(glewInit() != GLEW_OK) - { - std::cerr << "Warning: Failed to initialize GLEW\n"; + return ""; +} + +void find_and_replace( + std::string& str, + const std::string& find, + const std::string& replace) +{ + size_t loc = 0; + size_t flen = find.length(); + size_t rlen = replace.length(); + while ( (loc = str.find(find, loc)) != std::string::npos) { + str.replace(loc, flen, replace); + loc += rlen; } +} - Vector3f v3f; - Matrix3f rot; - glBegin(GL_POINTS); - - glVertex(v3f); - glVertex(2*v3f+v3f); - glVertex(rot*v3f); - - glEnd(); - - // 4x4 matrices - Matrix4f mf44; mf44.setRandom(); - VERIFY_MATRIX(glLoadMatrix(mf44), mf44); - VERIFY_MATRIX(glMultMatrix(mf44), mf44); - Matrix4d md44; md44.setRandom(); - VERIFY_MATRIX(glLoadMatrix(md44), md44); - VERIFY_MATRIX(glMultMatrix(md44), md44); - - // Quaternion - Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random())); - VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix()); - - Quaternionf qf(AngleAxisf(internal::random<double>(), Vector3f::Random())); - VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix()); - - // 3D Transform - Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom(); - VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix()); - VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix()); - - Transform<float,3,Affine> af3(acf3); - VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix()); - VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix()); - - Transform<float,3,Projective> pf3; pf3.matrix().setRandom(); - VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix()); - VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix()); - - 
Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom(); - VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix()); - VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix()); - - Transform<double,3,Affine> ad3(acd3); - VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix()); - VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix()); - - Transform<double,3,Projective> pd3; pd3.matrix().setRandom(); - VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix()); - VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix()); - - // translations (2D and 3D) - { - Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0; - VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix()); - Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0; - VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix()); - - Vector3f vf3; vf3.setRandom(); - VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix()); - Vector3d vd3; vd3.setRandom(); - VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix()); - - Translation<float,3> tf3; tf3.vector().setRandom(); - VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix()); - - Translation<double,3> td3; td3.vector().setRandom(); - VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix()); +// Finds and replaces a set of substrings in a string. +std::string format( + const std::string& str, + const std::vector<std::string>& find, + const std::vector<std::string>& replace) +{ + std::string out = str; + for (std::size_t i=0; i<find.size(); ++i) { + find_and_replace(out, find[i], replace[i]); } - - // scaling (2D and 3D) + return out; +} + +// GLUT display function that runs test. Must be run within the display loop +// in order to properly destroy resources. +void openglsupport_test_loop() +{ + // Get context info. + const GLubyte* gl_version_string = glGetString(GL_VERSION); + std::cerr << "GL version: " << gl_version_string << std::endl; + std::cerr << "GLSL version: " << glGetString(GL_SHADING_LANGUAGE_VERSION) << std::endl; + // Parse version from string since GL_MAJOR_VERSION is only supported in GL 3.0+. + // Version string guaranteed to be <major>.<minor><vender extension>. + GLint gl_major_version = gl_version_string[0] - '0'; + GLint gl_minor_version = gl_version_string[2] - '0'; + bool legacy_gl = gl_major_version < 3 || (gl_major_version == 3 && gl_minor_version < 2); + + // Fixed-function pipeline removed in OpenGL 3.2. + if (legacy_gl) { - Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1; - VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix()); - Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1; - VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix()); - - Vector3f vf3; vf3.setRandom(); - VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix()); - Vector3d vd3; vd3.setRandom(); - VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix()); - - UniformScaling<float> usf(internal::random<float>()); - VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix()); - - UniformScaling<double> usd(internal::random<double>()); - VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix()); + // Draw a basic triangle. 
+ Vector3f v3f; + Matrix3f rot; + glBegin(GL_POINTS); + { + glVertex(v3f); + glVertex(2*v3f+v3f); + glVertex(rot*v3f); + } + glEnd(); + + // 4x4 matrices + Matrix4f mf44; mf44.setRandom(); + VERIFY_MATRIX(glLoadMatrix(mf44), mf44); + VERIFY_MATRIX(glMultMatrix(mf44), mf44); + Matrix4d md44; md44.setRandom(); + VERIFY_MATRIX(glLoadMatrix(md44), md44); + VERIFY_MATRIX(glMultMatrix(md44), md44); + + // Quaternion + Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random())); + VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix()); + + Quaternionf qf(AngleAxisf(internal::random<double>(), Vector3f::Random())); + VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix()); + + // 3D Transform + Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom(); + VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix()); + VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix()); + + Transform<float,3,Affine> af3(acf3); + VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix()); + VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix()); + + Transform<float,3,Projective> pf3; pf3.matrix().setRandom(); + VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix()); + VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix()); + + Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom(); + VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix()); + VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix()); + + Transform<double,3,Affine> ad3(acd3); + VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix()); + VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix()); + + Transform<double,3,Projective> pd3; pd3.matrix().setRandom(); + VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix()); + VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix()); + + // translations (2D and 3D) + { + Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0; + VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix()); + Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0; + VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix()); + + Vector3f vf3; vf3.setRandom(); + VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix()); + Vector3d vd3; vd3.setRandom(); + VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix()); + + Translation<float,3> tf3; tf3.vector().setRandom(); + VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix()); + + Translation<double,3> td3; td3.vector().setRandom(); + VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix()); + } + + // scaling (2D and 3D) + { + Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1; + VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix()); + Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1; + VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix()); + + Vector3f vf3; vf3.setRandom(); + VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix()); + Vector3d vd3; vd3.setRandom(); + VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix()); + + UniformScaling<float> usf(internal::random<float>()); + VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix()); + + UniformScaling<double> usd(internal::random<double>()); + VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix()); + } + } else { + std::cerr << "Warning: fixed-function pipeline was not tested.\n"; + } + + // Dynamic shader substitution variables. 
+ // Modern shaders require a version string, and newer runtimes fail to + // compile old GLSL versions. Thus, we dynamically set the GLSL version + // string based on the GL version found at runtime. Also, pre-OpenGL 3.0, the + // output gl_FragColor was built-in. This was deprecated in OpenGL 3.0, + // requiring us to explicitly define the output variable. + std::vector<std::string> glsl_vars; + glsl_vars.push_back("${GLSL_VERSION}"); + glsl_vars.push_back("${FRAG_OUTPUT_DECLARATION}"); + glsl_vars.push_back("${FRAG_OUTPUT_VARIABLE}"); + + std::vector<std::string> glsl_vals; + glsl_vals.push_back(getGlslVersionString(gl_major_version, gl_minor_version)); + if (gl_major_version >= 3) { + glsl_vals.push_back("out vec4 fragColor;"); + glsl_vals.push_back("fragColor"); + } else { + glsl_vals.push_back(""); + glsl_vals.push_back("gl_FragColor"); } - + // uniform { - const char* vtx = "void main(void) { gl_Position = gl_Vertex; }\n"; - - if(GLEW_VERSION_2_0) + // vertex shader. + std::string vtx = format( + "${GLSL_VERSION}\n" + "void main(void) {\n" + " gl_Position = vec4(0,0,0,1);\n" + "}\n", + glsl_vars, glsl_vals); + +#ifdef GL_VERSION_2_0 + if(GLEW_VERSION_2_0 && GL_VERSION_2_0) { - #ifdef GL_VERSION_2_0 - const char* frg = "" + std::string frg = format( + "${GLSL_VERSION}\n" "uniform vec2 v2f;\n" "uniform vec3 v3f;\n" "uniform vec4 v4f;\n" @@ -231,107 +389,212 @@ void test_openglsupport() "uniform mat2 m2f;\n" "uniform mat3 m3f;\n" "uniform mat4 m4f;\n" - "void main(void) { gl_FragColor = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]); }\n"; - - GLint prg_id = createShader(vtx,frg); - - VERIFY_UNIFORM(fv,v2f, Vector2f); - VERIFY_UNIFORM(fv,v3f, Vector3f); - VERIFY_UNIFORM(fv,v4f, Vector4f); + "${FRAG_OUTPUT_DECLARATION}\n" + "void main(void) { \n" + " ${FRAG_OUTPUT_VARIABLE} = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]);\n" + "}\n", + glsl_vars, glsl_vals); + + GLint prg_id = createProgram(vtx, frg); + VERIFY(prg_id > 0 && "Failed to create program."); + VERIFY_UNIFORM(fv, v2f, Vector2f); + VERIFY_UNIFORM(fv, v3f, Vector3f); + VERIFY_UNIFORM(fv, v4f, Vector4f); VERIFY_UNIFORMi(v2i, Vector2i); VERIFY_UNIFORMi(v3i, Vector3i); VERIFY_UNIFORMi(v4i, Vector4i); - VERIFY_UNIFORM(fv,m2f, Matrix2f); - VERIFY_UNIFORM(fv,m3f, Matrix3f); - VERIFY_UNIFORM(fv,m4f, Matrix4f); - #endif + VERIFY_UNIFORM(fv, m2f, Matrix2f); + VERIFY_UNIFORM(fv, m3f, Matrix3f); + VERIFY_UNIFORM(fv, m4f, Matrix4f); + glDeleteProgram(prg_id); } else - std::cerr << "Warning: opengl 2.0 was not tested\n"; - - if(GLEW_VERSION_2_1) +#endif + std::cerr << "Warning: opengl 2.0 was not tested.\n"; + +#ifdef GL_VERSION_2_1 + if(GLEW_VERSION_2_1 && GL_VERSION_2_1 && + (gl_major_version > 2 || (gl_major_version == 2 && gl_minor_version >= 1))) { - #ifdef GL_VERSION_2_1 - const char* frg = "#version 120\n" + std::string frg = format( + "${GLSL_VERSION}\n" "uniform mat2x3 m23f;\n" "uniform mat3x2 m32f;\n" "uniform mat2x4 m24f;\n" "uniform mat4x2 m42f;\n" "uniform mat3x4 m34f;\n" "uniform mat4x3 m43f;\n" - "void main(void) { gl_FragColor = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]); }\n"; - - GLint prg_id = createShader(vtx,frg); - + "${FRAG_OUTPUT_DECLARATION}\n" + "void main(void) {\n" + " ${FRAG_OUTPUT_VARIABLE} = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]);\n" + "}\n", + glsl_vars, glsl_vals); + + GLint prg_id = createProgram(vtx, frg); + VERIFY(prg_id > 0 && "Failed to create program."); typedef Matrix<float,2,3>
Matrix23f; typedef Matrix<float,3,2> Matrix32f; typedef Matrix<float,2,4> Matrix24f; typedef Matrix<float,4,2> Matrix42f; typedef Matrix<float,3,4> Matrix34f; typedef Matrix<float,4,3> Matrix43f; - - VERIFY_UNIFORM(fv,m23f, Matrix23f); - VERIFY_UNIFORM(fv,m32f, Matrix32f); - VERIFY_UNIFORM(fv,m24f, Matrix24f); - VERIFY_UNIFORM(fv,m42f, Matrix42f); - VERIFY_UNIFORM(fv,m34f, Matrix34f); - VERIFY_UNIFORM(fv,m43f, Matrix43f); - #endif + + VERIFY_UNIFORM(fv, m23f, Matrix23f); + VERIFY_UNIFORM(fv, m32f, Matrix32f); + VERIFY_UNIFORM(fv, m24f, Matrix24f); + VERIFY_UNIFORM(fv, m42f, Matrix42f); + VERIFY_UNIFORM(fv, m34f, Matrix34f); + VERIFY_UNIFORM(fv, m43f, Matrix43f); + glDeleteProgram(prg_id); } else - std::cerr << "Warning: opengl 2.1 was not tested\n"; - - if(GLEW_VERSION_3_0) +#endif + std::cerr << "Warning: opengl 2.1 was not tested.\n"; + +#ifdef GL_VERSION_3_0 + if(GLEW_VERSION_3_0 && GL_VERSION_3_0 && gl_major_version >= 3) { - #ifdef GL_VERSION_3_0 - const char* frg = "#version 150\n" + std::string frg = format( + "${GLSL_VERSION}\n" "uniform uvec2 v2ui;\n" "uniform uvec3 v3ui;\n" "uniform uvec4 v4ui;\n" - "out vec4 data;\n" - "void main(void) { data = vec4(v2ui[0]+v3ui[0]+v4ui[0]); }\n"; - - GLint prg_id = createShader(vtx,frg); - + "${FRAG_OUTPUT_DECLARATION}\n" + "void main(void) {\n" + " ${FRAG_OUTPUT_VARIABLE} = vec4(v2ui[0]+v3ui[0]+v4ui[0]);\n" + "}\n", + glsl_vars, glsl_vals); + + GLint prg_id = createProgram(vtx, frg); + VERIFY(prg_id > 0 && "Failed to create program."); typedef Matrix<unsigned int,2,1> Vector2ui; typedef Matrix<unsigned int,3,1> Vector3ui; typedef Matrix<unsigned int,4,1> Vector4ui; - + VERIFY_UNIFORMi(v2ui, Vector2ui); VERIFY_UNIFORMi(v3ui, Vector3ui); VERIFY_UNIFORMi(v4ui, Vector4ui); - #endif + glDeleteProgram(prg_id); } else - std::cerr << "Warning: opengl 3.0 was not tested\n"; - - #ifdef GLEW_ARB_gpu_shader_fp64 +#endif + std::cerr << "Warning: opengl 3.0 was not tested.\n"; + + // dvecn supported if >= 4.1 or ARB_vertex_attrib_64bit + bool has_fp64_native = (gl_major_version == 4 && gl_minor_version >= 1); + bool has_fp64_extension = false; +#ifdef GLEW_ARB_gpu_shader_fp64 if(GLEW_ARB_gpu_shader_fp64) { - #ifdef GL_ARB_gpu_shader_fp64 - const char* frg = "#version 150\n" + // Check that extension can actually be compiled. 
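+ // (Presumably some drivers advertise the extension yet still reject dvec in + // GLSL, so a one-uniform probe shader is compiled first, with print_errors + // disabled, and the extension is trusted only if that compile succeeds.)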
+ if (!has_fp64_extension) + { + std::string frg = format( + "${GLSL_VERSION}\n" + "#extension GL_ARB_gpu_shader_fp64 : enable\n" + "uniform dvec2 dv2;\n" + "${FRAG_OUTPUT_DECLARATION}\n" + "void main(void) {\n" + " ${FRAG_OUTPUT_VARIABLE} = vec4(dv2.x, dv2.y, dv2.x, dv2.y);\n" + "}\n", + glsl_vars, glsl_vals); + GLint prg_id = createProgram(vtx, frg, /*print_errors=*/false); + if (prg_id) + { + has_fp64_extension = true; + glDeleteProgram(prg_id); + } + } + } +#endif + + if( has_fp64_native || has_fp64_extension ) + { + std::vector<std::string> glsl_vars_with_extension = glsl_vars; + glsl_vars_with_extension.push_back("${GLSL_EXTENSIONS}"); + std::vector<std::string> glsl_vals_with_extension = glsl_vals; + if (has_fp64_extension) + { + glsl_vals_with_extension.push_back("#extension GL_ARB_gpu_shader_fp64 : enable"); + } + else + { + glsl_vals_with_extension.push_back(""); + } + + std::string frg = format( + "${GLSL_VERSION}\n" + "${GLSL_EXTENSIONS}\n" "uniform dvec2 v2d;\n" "uniform dvec3 v3d;\n" "uniform dvec4 v4d;\n" - "out vec4 data;\n" - "void main(void) { data = vec4(v2d[0]+v3d[0]+v4d[0]); }\n"; - - GLint prg_id = createShader(vtx,frg); - - typedef Vector2d Vector2d; - typedef Vector3d Vector3d; - typedef Vector4d Vector4d; - - VERIFY_UNIFORM(dv,v2d, Vector2d); - VERIFY_UNIFORM(dv,v3d, Vector3d); - VERIFY_UNIFORM(dv,v4d, Vector4d); - #endif + "${FRAG_OUTPUT_DECLARATION}\n" + "void main(void) {\n" + " ${FRAG_OUTPUT_VARIABLE} = vec4(v2d[0]+v3d[0]+v4d[0]);\n" + "}\n", + glsl_vars_with_extension, glsl_vals_with_extension); + + GLint prg_id = createProgram(vtx,frg); + VERIFY(prg_id > 0 && "Failed to create program."); + VERIFY_UNIFORM(dv, v2d, Vector2d); + VERIFY_UNIFORM(dv, v3d, Vector3d); + VERIFY_UNIFORM(dv, v4d, Vector4d); + glDeleteProgram(prg_id); } else - std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n"; - #else - std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n"; - #endif + std::cerr << "Warning: dvec (fp64) was not tested.\n"; } - + + // Exit the loop. Leaving the main loop is supported by freeglut; otherwise + // we are forced to exit. +#ifdef FREEGLUT + glutLeaveMainLoop(); + // Trigger another display loop iteration. Otherwise, it just hangs. + glutPostRedisplay(); +#else + exit(0); +#endif +} + +EIGEN_DECLARE_TEST(openglsupport) +{ + int argc = 0; + glutInit(&argc, 0); + + GLint glut_display_mode = GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH; + +#ifndef EIGEN_LEGACY_OPENGL + // Initialize 3.2+ OpenGL context. +#if defined(__APPLE_CC__) + glut_display_mode |= GLUT_3_2_CORE_PROFILE; +#elif defined(FREEGLUT) + glutInitContextVersion(3, 2); + glutInitContextFlags(GLUT_FORWARD_COMPATIBLE); + glutInitContextProfile(GLUT_CORE_PROFILE); +#endif +#endif + + glutInitDisplayMode(glut_display_mode); + glutInitWindowPosition(0, 0); + glutInitWindowSize(10, 10); + + int window = glutCreateWindow("Eigen"); + if(window <= 0) + { + std::cerr << "Error: Unable to create GLUT Window.\n"; + exit(1); + } + + glewExperimental = GL_TRUE; + if(glewInit() != GLEW_OK) + { + std::cerr << "Warning: Failed to initialize GLEW.\n"; + exit(1); + } + + // Run test in display, otherwise GLUT fails to clean up and leads to memory + // access errors on exit.
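+ // (glutMainLoop normally never returns, so the test finishes inside the + // display callback instead: via glutLeaveMainLoop under freeglut, or via + // exit(0) elsewhere, as coded above.)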
+ glutDisplayFunc(openglsupport_test_loop); + glutMainLoop(); + glutDestroyWindow(window); } diff --git a/unsupported/test/polynomialsolver.cpp b/unsupported/test/polynomialsolver.cpp index 0c87478dd..4ff9bda5a 100644 --- a/unsupported/test/polynomialsolver.cpp +++ b/unsupported/test/polynomialsolver.cpp @@ -26,15 +26,25 @@ struct increment_if_fixed_size } } +template<typename PolynomialType> +PolynomialType polyder(const PolynomialType& p) +{ + typedef typename PolynomialType::Scalar Scalar; + PolynomialType res(p.size()); + for(Index i=1; i<p.size(); ++i) + res[i-1] = p[i]*Scalar(i); + res[p.size()-1] = 0.; + return res; +} template<int Deg, typename POLYNOMIAL, typename SOLVER> bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) { - typedef typename POLYNOMIAL::Index Index; typedef typename POLYNOMIAL::Scalar Scalar; + typedef typename POLYNOMIAL::RealScalar RealScalar; typedef typename SOLVER::RootsType RootsType; - typedef Matrix<Scalar,Deg,1> EvalRootsType; + typedef Matrix<RealScalar,Deg,1> EvalRootsType; const Index deg = pols.size()-1; @@ -44,10 +54,17 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) psolve.compute( pols ); const RootsType& roots( psolve.roots() ); EvalRootsType evr( deg ); + POLYNOMIAL pols_der = polyder(pols); + EvalRootsType der( deg ); for( int i=0; i<roots.size(); ++i ){ - evr[i] = std::abs( poly_eval( pols, roots[i] ) ); } + evr[i] = std::abs( poly_eval( pols, roots[i] ) ); + der[i] = numext::maxi(RealScalar(1.), std::abs( poly_eval( pols_der, roots[i] ) )); + } - bool evalToZero = evr.isZero( test_precision<Scalar>() ); + // We need to divide by the magnitude of the derivative because, when the + // derivative is large, a very small error in the value of the root yields a + // very large error in the polynomial evaluation.
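+ // (First-order sketch: if the computed root r is off by e, then + // poly_eval(pols, r) ~= e * poly_eval(pols_der, r), so dividing the residual + // by max(1, |p'(r)|) turns it back into an estimate of the root error e.)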
+ bool evalToZero = (evr.cwiseQuotient(der)).isZero( test_precision<Scalar>() ); if( !evalToZero ) { cerr << "WRONG root: " << endl; @@ -57,7 +74,7 @@ bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve ) cerr << endl; } - std::vector<Scalar> rootModuli( roots.size() ); + std::vector<RealScalar> rootModuli( roots.size() ); Map< EvalRootsType > aux( &rootModuli[0], roots.size() ); aux = roots.array().abs(); std::sort( rootModuli.begin(), rootModuli.end() ); @@ -83,7 +100,7 @@ void evalSolver( const POLYNOMIAL& pols ) { typedef typename POLYNOMIAL::Scalar Scalar; - typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType; + typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType; PolynomialSolverType psolve; aux_evalSolver<Deg, POLYNOMIAL, PolynomialSolverType>( pols, psolve ); @@ -97,6 +114,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const { using std::sqrt; typedef typename POLYNOMIAL::Scalar Scalar; + typedef typename POLYNOMIAL::RealScalar RealScalar; typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType; @@ -107,15 +125,12 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const // 1) the roots found are correct // 2) the roots have distinct moduli - typedef typename POLYNOMIAL::Scalar Scalar; - typedef typename REAL_ROOTS::Scalar Real; - //Test realRoots - std::vector< Real > calc_realRoots; - psolve.realRoots( calc_realRoots ); - VERIFY( calc_realRoots.size() == (size_t)real_roots.size() ); + std::vector< RealScalar > calc_realRoots; + psolve.realRoots( calc_realRoots, test_precision<RealScalar>()); + VERIFY_IS_EQUAL( calc_realRoots.size() , (size_t)real_roots.size() ); - const Scalar psPrec = sqrt( test_precision<Scalar>() ); + const RealScalar psPrec = sqrt( test_precision<RealScalar>() ); for( size_t i=0; i<calc_realRoots.size(); ++i ) { @@ -138,7 +153,7 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const bool hasRealRoot; //Test absGreatestRealRoot - Real r = psolve.absGreatestRealRoot( hasRealRoot ); + RealScalar r = psolve.absGreatestRealRoot( hasRealRoot ); VERIFY( hasRealRoot == (real_roots.size() > 0 ) ); if( hasRealRoot ){ VERIFY( internal::isApprox( real_roots.array().abs().maxCoeff(), abs(r), psPrec ) ); } @@ -167,9 +182,11 @@ void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const template<typename _Scalar, int _Deg> void polynomialsolver(int deg) { - typedef internal::increment_if_fixed_size<_Deg> Dim; + typedef typename NumTraits<_Scalar>::Real RealScalar; + typedef internal::increment_if_fixed_size<_Deg> Dim; typedef Matrix<_Scalar,Dim::ret,1> PolynomialType; typedef Matrix<_Scalar,_Deg,1> EvalRootsType; + typedef Matrix<RealScalar,_Deg,1> RealRootsType; cout << "Standard cases" << endl; PolynomialType pols = PolynomialType::Random(deg+1); @@ -182,19 +199,15 @@ void polynomialsolver(int deg) evalSolver<_Deg,PolynomialType>( pols ); cout << "Test sugar" << endl; - EvalRootsType realRoots = EvalRootsType::Random(deg); + RealRootsType realRoots = RealRootsType::Random(deg); roots_to_monicPolynomial( realRoots, pols ); evalSolverSugarFunction<_Deg>( pols, - realRoots.template cast < - std::complex< - typename NumTraits<_Scalar>::Real - > - >(), + realRoots.template cast <std::complex<RealScalar> >().eval(), realRoots ); } -void test_polynomialsolver() +EIGEN_DECLARE_TEST(polynomialsolver) { for(int i = 0; i < g_repeat; i++) { @@ -214,5 +227,6 @@ void test_polynomialsolver() internal::random<int>(9,13) )) ); 
CALL_SUBTEST_11((polynomialsolver<float,Dynamic>(1)) ); + CALL_SUBTEST_12((polynomialsolver<std::complex<double>,Dynamic>(internal::random<int>(2,13))) ); } } diff --git a/unsupported/test/polynomialutils.cpp b/unsupported/test/polynomialutils.cpp index 5fc968402..8ff451996 100644 --- a/unsupported/test/polynomialutils.cpp +++ b/unsupported/test/polynomialutils.cpp @@ -101,7 +101,7 @@ template<typename _Scalar> void CauchyBounds_scalar() internal::random<int>(18,26) )) ); } -void test_polynomialutils() +EIGEN_DECLARE_TEST(polynomialutils) { for(int i = 0; i < g_repeat; i++) { diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp index a010ceb93..602c2cb84 100644 --- a/unsupported/test/sparse_extra.cpp +++ b/unsupported/test/sparse_extra.cpp @@ -8,10 +8,45 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// import basic and product tests for deprectaed DynamicSparseMatrix +// import basic and product tests for deprecated DynamicSparseMatrix +#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled +static long g_realloc_count = 0; +#define EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN g_realloc_count++; + +static long g_dense_op_sparse_count = 0; +#define EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN g_dense_op_sparse_count++; +#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN g_dense_op_sparse_count+=10; +#define EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN g_dense_op_sparse_count+=20; + +#define EIGEN_SPARSE_TEST_INCLUDED_FROM_SPARSE_EXTRA 1 +#endif + #define EIGEN_NO_DEPRECATED_WARNING -#include "sparse_basic.cpp" +// Disable counting of temporaries, since sparse_product(DynamicSparseMatrix) +// has an extra copy-assignment. +#define EIGEN_SPARSE_PRODUCT_IGNORE_TEMPORARY_COUNT #include "sparse_product.cpp" + +#if 0 // sparse_basic(DynamicSparseMatrix) does not compile at all -> disabled +#include "sparse_basic.cpp" +#endif + +#if EIGEN_HAS_CXX11 + +#ifdef min +#undef min +#endif + +#ifdef max +#undef max +#endif + +#include <unordered_map> +#define EIGEN_UNORDERED_MAP_SUPPORT + +#endif + + #include <Eigen/SparseExtra> template<typename SetterType,typename DenseType, typename Scalar, int Options> @@ -104,10 +139,8 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re #ifdef EIGEN_UNORDERED_MAP_SUPPORT VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdUnorderedMapTraits> >(m,refMat,nonzeroCoords) )); #endif - #ifdef _DENSE_HASH_MAP_H_ + #ifdef EIGEN_GOOGLEHASH_SUPPORT VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleDenseHashMapTraits> >(m,refMat,nonzeroCoords) )); - #endif - #ifdef _SPARSE_HASH_MAP_H_ VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleSparseHashMapTraits> >(m,refMat,nonzeroCoords) )); #endif @@ -129,7 +162,32 @@ template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& re } -void test_sparse_extra() + +template<typename SparseMatrixType> +void check_marketio() +{ + typedef Matrix<typename SparseMatrixType::Scalar, Dynamic, Dynamic> DenseMatrix; + Index rows = internal::random<Index>(1,100); + Index cols = internal::random<Index>(1,100); + SparseMatrixType m1, m2; + m1 = DenseMatrix::Random(rows, cols).sparseView(); + saveMarket(m1, "sparse_extra.mtx"); + loadMarket(m2, "sparse_extra.mtx"); + VERIFY_IS_EQUAL(DenseMatrix(m1),DenseMatrix(m2)); +} + +template<typename VectorType> +void check_marketio_vector() +{ + Index size = internal::random<Index>(1,100); + VectorType v1, v2; + 
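+ // (Round-trips a random vector through a Matrix Market file on disk; the + // exact-equality check below assumes the writer emits enough digits to + // reproduce every scalar bit-for-bit.)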
v1 = VectorType::Random(size); + saveMarketVector(v1, "vector_extra.mtx"); + loadMarketVector(v2, "vector_extra.mtx"); + VERIFY_IS_EQUAL(v1,v2); +} + +EIGEN_DECLARE_TEST(sparse_extra) { for(int i = 0; i < g_repeat; i++) { int s = Eigen::internal::random<int>(1,50); @@ -143,5 +201,26 @@ void test_sparse_extra() CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, ColMajor> >()) ); CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, RowMajor> >()) ); + + CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<float,ColMajor,long int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<double,ColMajor,long int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<float>,ColMajor,long int> >()) ); + CALL_SUBTEST_4( (check_marketio<SparseMatrix<std::complex<double>,ColMajor,long int> >()) ); + + + CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,1,Dynamic> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,1,Dynamic> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,1,Dynamic> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,1,Dynamic> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<float,Dynamic,1> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<double,Dynamic,1> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<float>,Dynamic,1> >()) ); + CALL_SUBTEST_5( (check_marketio_vector<Matrix<std::complex<double>,Dynamic,1> >()) ); + + TEST_SET_BUT_UNUSED_VARIABLE(s); } } diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp index 057fb3e92..589bb76e1 100644 --- a/unsupported/test/special_functions.cpp +++ b/unsupported/test/special_functions.cpp @@ -7,9 +7,21 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#include <limits.h> #include "main.h" #include "../Eigen/SpecialFunctions" +// Hack to allow "implicit" conversions from double to Scalar via comma-initialization. +template<typename Derived> +Eigen::CommaInitializer<Derived> operator<<(Eigen::DenseBase<Derived>& dense, double v) { + return (dense << static_cast<typename Derived::Scalar>(v)); +} + +template<typename XprType> +Eigen::CommaInitializer<XprType>& operator,(Eigen::CommaInitializer<XprType>& ci, double v) { + return (ci, static_cast<typename XprType::Scalar>(v)); +} + template<typename X, typename Y> void verify_component_wise(const X& x, const Y& y) { @@ -64,8 +76,8 @@ template<typename ArrayType> void array_special_functions() // igamma(a, x) = gamma(a, x) / Gamma(a) // where Gamma and gamma are considered the standard unnormalized // upper and lower incomplete gamma functions, respectively. 
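+ // In the same notation, igammac(a, x) = Gamma(a, x) / Gamma(a), and for + // a > 0, x >= 0 the two satisfy gamma(a, x) + Gamma(a, x) == Gamma(a), + // which is the identity the unnormalized sum check below relies on.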
- ArrayType a = m1.abs() + 2; - ArrayType x = m2.abs() + 2; + ArrayType a = m1.abs() + Scalar(2); + ArrayType x = m2.abs() + Scalar(2); ArrayType zero = ArrayType::Zero(rows, cols); ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0)); ArrayType a_m1 = a - one; @@ -74,6 +86,7 @@ template<typename ArrayType> void array_special_functions() ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp(); ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp(); + // Gamma(a, 0) == Gamma(a) VERIFY_IS_APPROX(Eigen::igammac(a, zero), one); @@ -81,10 +94,23 @@ template<typename ArrayType> void array_special_functions() VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp()); // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x) - VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp()); + VERIFY_IS_APPROX(Gamma_a_x, (a - Scalar(1)) * Gamma_a_m1_x + x.pow(a-Scalar(1)) * (-x).exp()); // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x) - VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp()); + VERIFY_IS_APPROX(gamma_a_x, (a - Scalar(1)) * gamma_a_m1_x - x.pow(a-Scalar(1)) * (-x).exp()); + } + { + // Verify for large a and x that values are between 0 and 1. + ArrayType m1 = ArrayType::Random(rows,cols); + ArrayType m2 = ArrayType::Random(rows,cols); + int max_exponent = std::numeric_limits<Scalar>::max_exponent10; + ArrayType a = m1.abs() * Scalar(pow(10., max_exponent - 1)); + ArrayType x = m2.abs() * Scalar(pow(10., max_exponent - 1)); + for (int i = 0; i < a.size(); ++i) { + Scalar igam = numext::igamma(a(i), x(i)); + VERIFY(0 <= igam); + VERIFY(igam <= 1); + } } { @@ -93,27 +119,37 @@ template<typename ArrayType> void array_special_functions() Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; // location i*6+j corresponds to a_s[i], x_s[j]. 
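+ // (The hard-coded reference values below were precomputed externally; by the + // pattern used elsewhere in this file they presumably come from SciPy's + // regularized incomplete gamma routines, gammainc and gammaincc.)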
- Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, - {0.0, 0.6321205588285578, 0.7768698398515702, - 0.9816843611112658, 9.999500016666262e-05, 1.0}, - {0.0, 0.4275932955291202, 0.608374823728911, - 0.9539882943107686, 7.522076445089201e-07, 1.0}, - {0.0, 0.01898815687615381, 0.06564245437845008, - 0.5665298796332909, 4.166333347221828e-18, 1.0}, - {0.0, 0.9999780593618628, 0.9999899967080838, - 0.9999996219837988, 0.9991370418689945, 1.0}, - {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; - Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, - {1.0, 0.36787944117144233, 0.22313016014842982, - 0.018315638888734182, 0.9999000049998333, 0.0}, - {1.0, 0.5724067044708798, 0.3916251762710878, - 0.04601170568923136, 0.9999992477923555, 0.0}, - {1.0, 0.9810118431238462, 0.9343575456215499, - 0.4334701203667089, 1.0, 0.0}, - {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, - 3.7801620118431334e-07, 0.0008629581310054535, - 0.0}, - {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + Scalar igamma_s[][6] = { + {Scalar(0.0), nan, nan, nan, nan, nan}, + {Scalar(0.0), Scalar(0.6321205588285578), Scalar(0.7768698398515702), + Scalar(0.9816843611112658), Scalar(9.999500016666262e-05), + Scalar(1.0)}, + {Scalar(0.0), Scalar(0.4275932955291202), Scalar(0.608374823728911), + Scalar(0.9539882943107686), Scalar(7.522076445089201e-07), + Scalar(1.0)}, + {Scalar(0.0), Scalar(0.01898815687615381), + Scalar(0.06564245437845008), Scalar(0.5665298796332909), + Scalar(4.166333347221828e-18), Scalar(1.0)}, + {Scalar(0.0), Scalar(0.9999780593618628), Scalar(0.9999899967080838), + Scalar(0.9999996219837988), Scalar(0.9991370418689945), Scalar(1.0)}, + {Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0), Scalar(0.0), + Scalar(0.5042041932513908)}}; + Scalar igammac_s[][6] = { + {nan, nan, nan, nan, nan, nan}, + {Scalar(1.0), Scalar(0.36787944117144233), + Scalar(0.22313016014842982), Scalar(0.018315638888734182), + Scalar(0.9999000049998333), Scalar(0.0)}, + {Scalar(1.0), Scalar(0.5724067044708798), Scalar(0.3916251762710878), + Scalar(0.04601170568923136), Scalar(0.9999992477923555), + Scalar(0.0)}, + {Scalar(1.0), Scalar(0.9810118431238462), Scalar(0.9343575456215499), + Scalar(0.4334701203667089), Scalar(1.0), Scalar(0.0)}, + {Scalar(1.0), Scalar(2.1940638138146658e-05), + Scalar(1.0003291916285e-05), Scalar(3.7801620118431334e-07), + Scalar(0.0008629581310054535), Scalar(0.0)}, + {Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0), Scalar(1.0), + Scalar(0.49579580674813944)}}; + for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { if ((std::isnan)(igamma_s[i][j])) { @@ -133,12 +169,32 @@ template<typename ArrayType> void array_special_functions() } #endif // EIGEN_HAS_C99_MATH + // Check the ndtri function against scipy.special.ndtri + { + ArrayType x(7), res(7), ref(7); + x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01; + ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408; + CALL_SUBTEST( verify_component_wise(ref, ref); ); + CALL_SUBTEST( res = x.ndtri(); verify_component_wise(res, ref); ); + CALL_SUBTEST( res = ndtri(x); verify_component_wise(res, ref); ); + + // ndtri(normal_cdf(x)) ~= x + CALL_SUBTEST( + ArrayType m1 = ArrayType::Random(32); + using std::sqrt; + + ArrayType cdf_val = (m1 / Scalar(sqrt(2.))).erf(); + cdf_val = (cdf_val + Scalar(1)) / Scalar(2); + verify_component_wise(cdf_val.ndtri(), m1);); + + } + // Check the zeta function against scipy.special.zeta { - ArrayType x(7), q(7), res(7), ref(7); - x << 
1.5, 4, 10.5, 10000.5, 3, 1, 0.9; - q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345; - ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan; + ArrayType x(10), q(10), res(10), ref(10); + x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9, 2, 3, 4; + q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3; + ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf; CALL_SUBTEST( verify_component_wise(ref, ref); ); CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); ); CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); ); @@ -146,22 +202,21 @@ template<typename ArrayType> void array_special_functions() // digamma { - ArrayType x(7), res(7), ref(7); - x << 1, 1.5, 4, -10.5, 10000.5, 0, -1; - ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf; + ArrayType x(9), res(9), ref(9); + x << 1, 1.5, 4, -10.5, 10000.5, 0, -1, -2, -3; + ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, nan, nan, nan, nan; CALL_SUBTEST( verify_component_wise(ref, ref); ); CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); ); CALL_SUBTEST( res = digamma(x); verify_component_wise(res, ref); ); } - #if EIGEN_HAS_C99_MATH { - ArrayType n(11), x(11), res(11), ref(11); - n << 1, 1, 1, 1.5, 17, 31, 28, 8, 42, 147, 170; - x << 2, 3, 25.5, 1.5, 4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64; - ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927; + ArrayType n(16), x(16), res(16), ref(16); + n << 1, 1, 1, 1.5, 17, 31, 28, 8, 42, 147, 170, -1, 0, 1, 2, 3; + x << 2, 3, 25.5, 1.5, 4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64, -1, -2, -3, -4, -5; + ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927, nan, nan, plusinf, nan, plusinf; CALL_SUBTEST( verify_component_wise(ref, ref); ); if(sizeof(RealScalar)>=8) { // double @@ -288,8 +343,8 @@ template<typename ArrayType> void array_special_functions() ArrayType m3 = ArrayType::Random(32); ArrayType one = ArrayType::Constant(32, Scalar(1.0)); const Scalar eps = std::numeric_limits<Scalar>::epsilon(); - ArrayType a = (m1 * 4.0).exp(); - ArrayType b = (m2 * 4.0).exp(); + ArrayType a = (m1 * Scalar(4)).exp(); + ArrayType b = (m2 * Scalar(4)).exp(); ArrayType x = m3.abs(); // betainc(a, 1, x) == x**a @@ -335,11 +390,108 @@ template<typename ArrayType> void array_special_functions() ArrayType test = betainc(a, b + one, x) + eps; verify_component_wise(test, expected);); } -#endif +#endif // EIGEN_HAS_C99_MATH + + /* Code to generate the data for the following two test cases. 
+ N = 5 + np.random.seed(3) + + a = np.logspace(-2, 3, 6) + a = np.ravel(np.tile(np.reshape(a, [-1, 1]), [1, N])) + x = np.random.gamma(a, 1.0) + x = np.maximum(x, np.finfo(np.float32).tiny) + + def igamma(a, x): + return mpmath.gammainc(a, 0, x, regularized=True) + + def igamma_der_a(a, x): + res = mpmath.diff(lambda a_prime: igamma(a_prime, x), a) + return np.float64(res) + + def gamma_sample_der_alpha(a, x): + igamma_x = igamma(a, x) + def igammainv_of_igamma(a_prime): + return mpmath.findroot(lambda x_prime: igamma(a_prime, x_prime) - + igamma_x, x, solver='newton') + return np.float64(mpmath.diff(igammainv_of_igamma, a)) + + v_igamma_der_a = np.vectorize(igamma_der_a)(a, x) + v_gamma_sample_der_alpha = np.vectorize(gamma_sample_der_alpha)(a, x) + */ + +#if EIGEN_HAS_C99_MATH + // Test igamma_der_a + { + ArrayType a(30); + ArrayType x(30); + ArrayType res(30); + ArrayType v(30); + + a << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, 1.0, + 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0, + 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; + + x << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, + 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, + 0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, + 0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426, + 0.786686768458, 7.63873279537, 13.1944344379, 11.896042354, + 10.5830172417, 10.5020942233, 92.8918587747, 95.003720371, + 86.3715926467, 96.0330217672, 82.6389930677, 968.702906754, + 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; + + v << -32.7256441441, -36.4394150514, -9.66467612263, -36.4394150514, + -36.4394150514, -1.0891900302, -2.66351229645, -2.48666868596, + -0.929700494428, -3.56327722764, -0.455320135314, -0.391437214323, + -0.491352055991, -0.350454834292, -0.471773162921, -0.104084440522, + -0.0723646747909, -0.0992828975532, -0.121638215446, -0.122619605294, + -0.0317670267286, -0.0359974812869, -0.0154359225363, -0.0375775365921, + -0.00794899153653, -0.00777303219211, -0.00796085782042, + -0.0125850719397, -0.00455500206958, -0.00476436993148; + + CALL_SUBTEST(res = igamma_der_a(a, x); verify_component_wise(res, v);); + } + + // Test gamma_sample_der_alpha + { + ArrayType alpha(30); + ArrayType sample(30); + ArrayType res(30); + ArrayType v(30); + + alpha << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0, + 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0, + 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0; + + sample << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05, + 1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, + 0.0132865061065, 0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, + 0.333412038288, 1.18135687766, 0.580629033777, 0.170631439426, + 0.786686768458, 7.63873279537, 13.1944344379, 11.896042354, + 10.5830172417, 10.5020942233, 92.8918587747, 95.003720371, + 86.3715926467, 96.0330217672, 82.6389930677, 968.702906754, + 969.463546828, 1001.79726022, 955.047416547, 1044.27458568; + + v << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738, + 1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243, + 0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302, + 1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534, + 0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812, + 1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061, + 0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206, + 1.00106492525, 
0.97734200649, 1.02198794179; + + CALL_SUBTEST(res = gamma_sample_der_alpha(alpha, sample); + verify_component_wise(res, v);); + } +#endif // EIGEN_HAS_C99_MATH } -void test_special_functions() +EIGEN_DECLARE_TEST(special_functions) { CALL_SUBTEST_1(array_special_functions<ArrayXf>()); CALL_SUBTEST_2(array_special_functions<ArrayXd>()); + // TODO(cantonios): half/bfloat16 don't have enough precision to reproduce results above. + // CALL_SUBTEST_3(array_special_functions<ArrayX<Eigen::half>>()); + // CALL_SUBTEST_4(array_special_functions<ArrayX<Eigen::bfloat16>>()); } diff --git a/unsupported/test/special_packetmath.cpp b/unsupported/test/special_packetmath.cpp new file mode 100644 index 000000000..31233f1b0 --- /dev/null +++ b/unsupported/test/special_packetmath.cpp @@ -0,0 +1,149 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr> +// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include <limits> +#include "packetmath_test_shared.h" +#include "../Eigen/SpecialFunctions" + +template<typename Scalar,typename Packet> void packetmath_real() +{ + using std::abs; + typedef internal::packet_traits<Scalar> PacketTraits; + const int PacketSize = internal::unpacket_traits<Packet>::size; + + const int size = PacketSize*4; + EIGEN_ALIGN_MAX Scalar data1[PacketSize*4]; + EIGEN_ALIGN_MAX Scalar data2[PacketSize*4]; + EIGEN_ALIGN_MAX Scalar ref[PacketSize*4]; + +#if EIGEN_HAS_C99_MATH + { + data1[0] = std::numeric_limits<Scalar>::quiet_NaN(); + test::packet_helper<internal::packet_traits<Scalar>::HasLGamma,Packet> h; + h.store(data2, internal::plgamma(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + } + if (internal::packet_traits<Scalar>::HasErf) { + data1[0] = std::numeric_limits<Scalar>::quiet_NaN(); + test::packet_helper<internal::packet_traits<Scalar>::HasErf,Packet> h; + h.store(data2, internal::perf(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + } + { + data1[0] = std::numeric_limits<Scalar>::quiet_NaN(); + test::packet_helper<internal::packet_traits<Scalar>::HasErfc,Packet> h; + h.store(data2, internal::perfc(h.load(data1))); + VERIFY((numext::isnan)(data2[0])); + } + { + for (int i=0; i<size; ++i) { + data1[i] = internal::random<Scalar>(Scalar(0),Scalar(1)); + } + CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasNdtri, numext::ndtri, internal::pndtri); + } +#endif // EIGEN_HAS_C99_MATH + + // For bessel_i*e and bessel_j*, the valid range includes negative reals.
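+ // (The exponentially scaled variants satisfy bessel_i0e(x) = exp(-|x|) * + // bessel_i0(x) and bessel_i1e(x) = exp(-|x|) * bessel_i1(x), which keeps them + // bounded even for the large |x| values sampled here.)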
+ { + const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 6); + for (int i=0; i<size; ++i) + { + data1[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent)))); + data2[i] = internal::random<Scalar>(Scalar(-1),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-max_exponent),Scalar(max_exponent)))); + } + + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0e, internal::pbessel_i0e); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1e, internal::pbessel_i1e); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j0, internal::pbessel_j0); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_j1, internal::pbessel_j1); + } + + // Use a smaller data range for the bessel_i* as these can become very large. + // Following #1693, we also restrict this range further to avoid infs due to + // differences in pexp and exp. + for (int i=0; i<size; ++i) { + data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + } + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i0, internal::pbessel_i0); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_i1, internal::pbessel_i1); + + + // y_i and k_i are valid only for x > 0. + { + const int max_exponent = numext::mini(std::numeric_limits<Scalar>::max_exponent10-1, 5); + for (int i=0; i<size; ++i) + { + data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent)))); + data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-2),Scalar(max_exponent)))); + } + } + + // TODO(srvasude): Re-enable this test once we have properly investigated why + // the scalar and vector paths differ. + // CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y0, internal::pbessel_y0); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_y1, internal::pbessel_y1); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0e, internal::pbessel_k0e); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1e, internal::pbessel_k1e); + + // Following #1693, we restrict the range for exp to avoid zeroing out too + // fast.
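+ // (Since bessel_k0(x) = exp(-x) * bessel_k0e(x) decays exponentially, a wide + // range would let the reference exp and the vectorized pexp round to zero at + // slightly different points, per the #1693 note above.)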
+ for (int i=0; i<size; ++i) { + data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(9), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + } + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k0, internal::pbessel_k0); + CHECK_CWISE1_IF(PacketTraits::HasBessel, numext::bessel_k1, internal::pbessel_k1); + + + for (int i=0; i<size; ++i) { + data1[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + data2[i] = internal::random<Scalar>(Scalar(0.01),Scalar(1)) * + Scalar(std::pow(Scalar(10), internal::random<Scalar>(Scalar(-1),Scalar(2)))); + } + +#if EIGEN_HAS_C99_MATH && (EIGEN_COMP_CXXVER >= 11) + CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasLGamma, std::lgamma, internal::plgamma); + CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErf, std::erf, internal::perf); + CHECK_CWISE1_IF(internal::packet_traits<Scalar>::HasErfc, std::erfc, internal::perfc); +#endif + +} + +namespace Eigen { +namespace test { + +template<typename Scalar,typename PacketType, bool IsComplex, bool IsInteger> +struct runall { + static void run() { + packetmath_real<Scalar,PacketType>(); + } +}; + +} +} + +EIGEN_DECLARE_TEST(special_packetmath) +{ + g_first_pass = true; + for(int i = 0; i < g_repeat; i++) { + + CALL_SUBTEST_1( test::runner<float>::run() ); + CALL_SUBTEST_2( test::runner<double>::run() ); + CALL_SUBTEST_3( test::runner<Eigen::half>::run() ); + CALL_SUBTEST_4( test::runner<Eigen::bfloat16>::run() ); + g_first_pass = false; + } +} diff --git a/unsupported/test/splines.cpp b/unsupported/test/splines.cpp index 3be020434..88ec87b97 100644 --- a/unsupported/test/splines.cpp +++ b/unsupported/test/splines.cpp @@ -268,7 +268,7 @@ void check_global_interpolation_with_derivatives2d() } } -void test_splines() +EIGEN_DECLARE_TEST(splines) { for (int i = 0; i < g_repeat; ++i) {