diff options
Diffstat (limited to 'bench/tensors/tensor_benchmarks.h')
-rw-r--r-- | bench/tensors/tensor_benchmarks.h | 199 |
1 file changed, 159 insertions, 40 deletions
diff --git a/bench/tensors/tensor_benchmarks.h b/bench/tensors/tensor_benchmarks.h index c2fb3dede..0825e1563 100644 --- a/bench/tensors/tensor_benchmarks.h +++ b/bench/tensors/tensor_benchmarks.h @@ -27,6 +27,11 @@ template <typename Device, typename T> class BenchmarkSuite { initialize(); } + BenchmarkSuite(const Device& device, size_t m, size_t k) + : m_(1), k_(k), n_(m), device_(device) { + initialize(); + } + ~BenchmarkSuite() { device_.deallocate(a_); device_.deallocate(b_); @@ -35,6 +40,11 @@ template <typename Device, typename T> class BenchmarkSuite { void memcpy(int num_iters) { eigen_assert(m_ == k_ && k_ == n_); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + device_.memcpy(c_, a_, m_ * m_ * sizeof(T)); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { device_.memcpy(c_, a_, m_ * m_ * sizeof(T)); @@ -55,7 +65,11 @@ template <typename Device, typename T> class BenchmarkSuite { } const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes); TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.template cast<T>(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.template cast<T>(); @@ -70,7 +84,11 @@ template <typename Device, typename T> class BenchmarkSuite { sizes[0] = m_; sizes[1] = m_; TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = C.random(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = C.random(); @@ -93,7 +111,18 @@ template <typename Device, typename T> class BenchmarkSuite { const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2); const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0); const 
Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.slice(first_quadrant, quarter_sizes).device(device_) = + A.slice(first_quadrant, quarter_sizes); + C.slice(second_quadrant, quarter_sizes).device(device_) = + B.slice(second_quadrant, quarter_sizes); + C.slice(third_quadrant, quarter_sizes).device(device_) = + A.slice(third_quadrant, quarter_sizes); + C.slice(fourth_quadrant, quarter_sizes).device(device_) = + B.slice(fourth_quadrant, quarter_sizes); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.slice(first_quadrant, quarter_sizes).device(device_) = @@ -118,7 +147,11 @@ template <typename Device, typename T> class BenchmarkSuite { Eigen::array<TensorIndex, 1> output_size; output_size[0] = n_; TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.chip(iter % k_, 0); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.chip(iter % k_, 0); @@ -135,7 +168,11 @@ template <typename Device, typename T> class BenchmarkSuite { Eigen::array<TensorIndex, 1> output_size; output_size[0] = n_; TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.chip(iter % n_, 1); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.chip(iter % n_, 1); @@ -158,7 +195,11 @@ template <typename Device, typename T> class BenchmarkSuite { Eigen::array<int, 2> shuffle; shuffle[0] = 1; shuffle[1] = 0; - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.shuffle(shuffle); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; 
++iter) { B.device(device_) = A.shuffle(shuffle); @@ -186,7 +227,11 @@ template <typename Device, typename T> class BenchmarkSuite { paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0); paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1); #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.pad(paddings); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.pad(paddings); @@ -216,6 +261,11 @@ template <typename Device, typename T> class BenchmarkSuite { Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides; #endif +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + B.device(device_) = A.stride(strides); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { B.device(device_) = A.stride(strides); @@ -224,6 +274,7 @@ template <typename Device, typename T> class BenchmarkSuite { finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters); } + void broadcasting(int num_iters) { Eigen::array<TensorIndex, 2> size_a; size_a[0] = m_; @@ -245,6 +296,11 @@ template <typename Device, typename T> class BenchmarkSuite { broadcast.set(1, n_); #endif +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.broadcast(broadcast); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.broadcast(broadcast); @@ -261,7 +317,11 @@ template <typename Device, typename T> class BenchmarkSuite { const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes); const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes); TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7)); + } +#endif StartBenchmarkTiming(); for (int iter = 0; 
iter < num_iters; ++iter) { C.device(device_) = A * A.constant(static_cast<T>(3.14)) + B * B.constant(static_cast<T>(2.7)); @@ -280,6 +340,11 @@ template <typename Device, typename T> class BenchmarkSuite { const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes); TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); +#ifdef EIGEN_USE_SYCL // warmup for sycl +for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); +} +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.rsqrt() + B.sqrt() * B.square(); @@ -297,7 +362,11 @@ template <typename Device, typename T> class BenchmarkSuite { const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes); const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes); TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.exp() + B.log(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.exp() + B.log(); @@ -325,7 +394,11 @@ template <typename Device, typename T> class BenchmarkSuite { // optimize the code. 
Eigen::IndexList<Eigen::type2index<0>> sum_along_dim; #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.sum(sum_along_dim); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.sum(sum_along_dim); @@ -344,8 +417,8 @@ template <typename Device, typename T> class BenchmarkSuite { b_, input_size); Eigen::array<TensorIndex, 1> output_size; output_size[0] = k_; - TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C( - c_, output_size); + TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> A( + a_, output_size); #ifndef EIGEN_HAS_INDEX_LIST Eigen::array<TensorIndex, 1> sum_along_dim; @@ -355,10 +428,14 @@ template <typename Device, typename T> class BenchmarkSuite { // optimize the code. Eigen::IndexList<Eigen::type2index<1>> sum_along_dim; #endif - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + A.device(device_) = B.sum(sum_along_dim); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { - C.device(device_) = B.sum(sum_along_dim); + A.device(device_) = B.sum(sum_along_dim); } // Record the number of FLOP executed per second (assuming one operation // per value) @@ -375,7 +452,11 @@ template <typename Device, typename T> class BenchmarkSuite { Eigen::array<TensorIndex, 0> output_size; TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C( c_, output_size); - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = B.sum(); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = B.sum(); @@ -385,33 +466,27 @@ template <typename Device, typename T> class BenchmarkSuite { finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters); } + + // do a contraction which is equivalent to a matrix multiplication void contraction(int num_iters) { - Eigen::array<TensorIndex, 2> sizeA; - 
sizeA[0] = m_; - sizeA[1] = k_; - Eigen::array<TensorIndex, 2> sizeB; - sizeB[0] = k_; - sizeB[1] = n_; - Eigen::array<TensorIndex, 2> sizeC; - sizeC[0] = m_; - sizeC[1] = n_; + contraction<static_cast<int>(Eigen::ColMajor)>(num_iters, false, false); + } - const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA); - const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB); - TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC); + void contractionRowMajor(int num_iters) { + contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, false); + } + + void contractionRowMajorAT(int num_iters) { + contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, false); + } - typedef typename Tensor<T, 2>::DimensionPair DimPair; - Eigen::array<DimPair, 1> dims; - dims[0] = DimPair(1, 0); + void contractionRowMajorBT(int num_iters) { + contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, false, true); + } - StartBenchmarkTiming(); - for (int iter = 0; iter < num_iters; ++iter) { - C.device(device_) = A.contract(B, dims); - } - // Record the number of FLOP executed per second (size_ multiplications and - // additions for each value in the resulting tensor) - finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters); + void contractionRowMajorABT(int num_iters) { + contraction<static_cast<int>(Eigen::RowMajor)>(num_iters, true, true); } void convolution(int num_iters, int kernel_x, int kernel_y) { @@ -430,18 +505,58 @@ template <typename Device, typename T> class BenchmarkSuite { Eigen::array<TensorIndex, 2> dims; dims[0] = 0; dims[1] = 1; - +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.convolve(B, dims); + } +#endif StartBenchmarkTiming(); for (int iter = 0; iter < num_iters; ++iter) { C.device(device_) = A.convolve(B, dims); } - // Record the number of FLOP executed per second (kernel_size + // Record the number of FLOPs executed per second (kernel_size // multiplications and additions 
for each value in the resulting tensor) finalizeBenchmark(static_cast<int64_t>(2) * (m_ - kernel_x + 1) * (n_ - kernel_y + 1) * kernel_x * kernel_y * num_iters); } private: + // do a contraction which is equivalent to a matrix multiplication + template<int Layout> + void contraction(int num_iters, bool trans_a, bool trans_b) { + Eigen::array<TensorIndex, 2> sizeA; + sizeA[0] = (trans_a ? k_: m_); + sizeA[1] = (trans_a ? m_: k_); + Eigen::array<TensorIndex, 2> sizeB; + sizeB[0] = (trans_b ? n_: k_); + sizeB[1] = (trans_b ? k_: n_); + Eigen::array<TensorIndex, 2> sizeC; + sizeC[0] = m_; + sizeC[1] = n_; + + const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> A(a_, sizeA); + const TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> B(b_, sizeB); + TensorMap<Tensor<T, 2, Layout>, Eigen::Aligned> C(c_, sizeC); + + typedef typename Tensor<T, 2, Layout>::DimensionPair DimPair; + Eigen::array<DimPair, 1> dims; + TensorIndex a_contract_dim = (trans_a ? 0 : 1); + TensorIndex b_contract_dim = (trans_b ? 
1 : 0); + dims[0] = DimPair(a_contract_dim, b_contract_dim); +#ifdef EIGEN_USE_SYCL // warmup for sycl + for (int iter = 0; iter < 10; ++iter) { + C.device(device_) = A.contract(B, dims); + } +#endif + StartBenchmarkTiming(); + for (int iter = 0; iter < num_iters; ++iter) { + C.device(device_) = A.contract(B, dims); + } + // Record the number of FLOP executed per second (size_ multiplications and + // additions for each value in the resulting tensor) + finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters); + } + void initialize() { a_ = (T *) device_.allocate(m_ * k_ * sizeof(T)); b_ = (T *) device_.allocate(k_ * n_ * sizeof(T)); @@ -453,7 +568,6 @@ template <typename Device, typename T> class BenchmarkSuite { device_.memset(b_, 23, k_ * n_ * sizeof(T)); device_.memset(c_, 31, m_ * n_ * sizeof(T)); - //BenchmarkUseRealTime(); } inline void finalizeBenchmark(int64_t num_items) { @@ -461,6 +575,11 @@ template <typename Device, typename T> class BenchmarkSuite { if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) { device_.synchronize(); } +#elif defined(EIGEN_USE_SYCL) + if (Eigen::internal::is_same<Device, Eigen::SyclDevice>::value) { + device_.synchronize(); + } + #endif StopBenchmarkTiming(); SetBenchmarkFlopsProcessed(num_items); |