From 2aab794c004027d008d6b0b64165bf1961d5d2bb Mon Sep 17 00:00:00 2001 From: Yi Kong Date: Fri, 25 Feb 2022 16:32:14 +0800 Subject: Upgrade eigen to 3.4.0 Steps: * Removed common files between Android copy and the matching upstream copy * Obtained latest upstream tarball (see README.version) * Extracted over the directory Bug: 148287349 Test: presubmit Change-Id: Iee2744719075fdf000b315e973645923da766111 --- bench/perf_monitoring/changesets.txt | 95 +++++++++++ bench/perf_monitoring/gemm.cpp | 12 ++ bench/perf_monitoring/gemm/changesets.txt | 61 -------- bench/perf_monitoring/gemm/gemm.cpp | 67 -------- bench/perf_monitoring/gemm/gemm_settings.txt | 15 -- bench/perf_monitoring/gemm/lazy_gemm.cpp | 98 ------------ bench/perf_monitoring/gemm/lazy_gemm_settings.txt | 15 -- bench/perf_monitoring/gemm/make_plot.sh | 38 ----- bench/perf_monitoring/gemm/run.sh | 156 ------------------ bench/perf_monitoring/gemm_common.h | 67 ++++++++ bench/perf_monitoring/gemm_settings.txt | 15 ++ bench/perf_monitoring/gemm_square_settings.txt | 11 ++ bench/perf_monitoring/gemv.cpp | 12 ++ bench/perf_monitoring/gemv_common.h | 69 ++++++++ bench/perf_monitoring/gemv_settings.txt | 11 ++ bench/perf_monitoring/gemv_square_settings.txt | 13 ++ bench/perf_monitoring/gemvt.cpp | 12 ++ bench/perf_monitoring/lazy_gemm.cpp | 101 ++++++++++++ bench/perf_monitoring/lazy_gemm_settings.txt | 15 ++ bench/perf_monitoring/llt.cpp | 15 ++ bench/perf_monitoring/make_plot.sh | 112 +++++++++++++ bench/perf_monitoring/resources/chart_footer.html | 41 +++++ bench/perf_monitoring/resources/chart_header.html | 45 ++++++ bench/perf_monitoring/resources/footer.html | 3 + bench/perf_monitoring/resources/header.html | 42 +++++ bench/perf_monitoring/resources/s1.js | 1 + bench/perf_monitoring/resources/s2.js | 1 + bench/perf_monitoring/run.sh | 183 ++++++++++++++++++++++ bench/perf_monitoring/runall.sh | 72 +++++++++ bench/perf_monitoring/trmv_lo.cpp | 12 ++ bench/perf_monitoring/trmv_lot.cpp | 12 ++ 
bench/perf_monitoring/trmv_up.cpp | 12 ++ bench/perf_monitoring/trmv_upt.cpp | 12 ++ 33 files changed, 996 insertions(+), 450 deletions(-) create mode 100644 bench/perf_monitoring/changesets.txt create mode 100644 bench/perf_monitoring/gemm.cpp delete mode 100644 bench/perf_monitoring/gemm/changesets.txt delete mode 100644 bench/perf_monitoring/gemm/gemm.cpp delete mode 100644 bench/perf_monitoring/gemm/gemm_settings.txt delete mode 100644 bench/perf_monitoring/gemm/lazy_gemm.cpp delete mode 100644 bench/perf_monitoring/gemm/lazy_gemm_settings.txt delete mode 100755 bench/perf_monitoring/gemm/make_plot.sh delete mode 100755 bench/perf_monitoring/gemm/run.sh create mode 100644 bench/perf_monitoring/gemm_common.h create mode 100644 bench/perf_monitoring/gemm_settings.txt create mode 100644 bench/perf_monitoring/gemm_square_settings.txt create mode 100644 bench/perf_monitoring/gemv.cpp create mode 100644 bench/perf_monitoring/gemv_common.h create mode 100644 bench/perf_monitoring/gemv_settings.txt create mode 100644 bench/perf_monitoring/gemv_square_settings.txt create mode 100644 bench/perf_monitoring/gemvt.cpp create mode 100644 bench/perf_monitoring/lazy_gemm.cpp create mode 100644 bench/perf_monitoring/lazy_gemm_settings.txt create mode 100644 bench/perf_monitoring/llt.cpp create mode 100755 bench/perf_monitoring/make_plot.sh create mode 100644 bench/perf_monitoring/resources/chart_footer.html create mode 100644 bench/perf_monitoring/resources/chart_header.html create mode 100644 bench/perf_monitoring/resources/footer.html create mode 100644 bench/perf_monitoring/resources/header.html create mode 100644 bench/perf_monitoring/resources/s1.js create mode 100644 bench/perf_monitoring/resources/s2.js create mode 100755 bench/perf_monitoring/run.sh create mode 100755 bench/perf_monitoring/runall.sh create mode 100644 bench/perf_monitoring/trmv_lo.cpp create mode 100644 bench/perf_monitoring/trmv_lot.cpp create mode 100644 bench/perf_monitoring/trmv_up.cpp create mode 
100644 bench/perf_monitoring/trmv_upt.cpp (limited to 'bench/perf_monitoring') diff --git a/bench/perf_monitoring/changesets.txt b/bench/perf_monitoring/changesets.txt new file mode 100644 index 000000000..efdd9a0ff --- /dev/null +++ b/bench/perf_monitoring/changesets.txt @@ -0,0 +1,95 @@ +Load hg-to-git hash maps from ./eigen_git/.git/ +#3.0.1 +#3.1.1 +#3.2.0 +3.2.4 +#574a7621809fe +58964a85800bd # introduce AVX +#589cbd7e98174 # merge +589db7d49efbb # introduce FMA +#590a078f442a3 # complex and AVX +590a419cea4a0 # improve packing with ptranspose +#59251e85c936d # merge +#592e497a27ddc +593d5a795f673 # New gebp kernel: up to 3 packets x 4 register-level blocks +#5942c3c95990d # merge +#596c9788d55b9 # Disable 3pX4 kernel on Altivec +#5999aa3dc4e21 # merge +6209452eb38f8 # before-evaluators +#6333eba5e1101 # Implement evaluator for sparse outer products +#663b9d314ae19 +#6655ef95fabee # Properly detect FMA support on ARM +#667fe25f3b8e3 # FMA has been wrongly disabled +#668409547a0c8 +#6694304c73542 # merge default to tensors +#67216047c8d4a # merge default to tensors +#67410a79ca3a3 # merge default to tensors +#674b7271dffb5 # Generalized the gebp apis +676bfdd9f3ac9 # Made the blocking computation aware of the l3 cache;
Also optimized the blocking parameters to take
into account the number of threads used for a computation. +6782dde63499c # generalized gemv +6799f98650d0a # ensured that contractions that can be reduced to a matrix vector product +#6840918c51e60 # merge tensor +684e972b55ec4 # change prefetching in gebp +#68598604576d1 # merge index conversion +68963eb0f6fe6 # clean blocking size computation +689db05f2d01e # rotating kernel for ARM only +#6901b7e12847d # result_of +69226275b250a # fix prefetching change for ARM +692692136350b # prefetching +693a8ad8887bf # blocking size strategy +693bcf9bb5c1f # avoid redundant pack_rhs +6987550107028 # dynamic loop swapping +69858740ce4c6 # rm dynamic loop swapping,
adjust lhs's micro panel height to fully exploit L1 cache +698cd3bbffa73 # blocking heuristic:
block on the rhs in L1 if the lhs fit in L1. +701488c15615a # organize a little our default cache sizes,
and use a saner default L1 outside of x86 (10% faster on Nexus 5) +701e56aabf205 # Refactor computeProductBlockingSizes to make room
for the possibility of using lookup tables +701ca5c12587b # Polish lookup tables generation +7013589a9c115 # actual_panel_rows computation should always be resilient
to parameters not consistent with the known L1 cache size, see comment +70102babb9c0f # Provide an empirical lookup table for blocking sizes measured on a Nexus 5.
Only for float, only for Android on ARM 32bit for now. +7088481dc21ea # Bug 986: add support for coefficient-based
product with 0 depth. +709d7f51feb07 # Bug 992: don't select a 3p GEMM path with non-SIMD scalar types. +759f9303cc7c5 # 3.3-alpha1 +765aba1eda71e # help clang inlining +770fe630c9873 # Improve numerical accuracy in LLT and triangular solve
by using true scalar divisions (instead of x * (1/y)) +#8741d23430628 # Improved the matrix multiplication blocking in the case
where mr is not a power of 2 (e.g on Haswell CPUs) +878f629fe95c8 # Made the index type a template parameter to evaluateProductBlockingSizes.
Use numext::mini and numext::maxi instead of
std::min/std::max to compute blocking sizes. +8975d51a7f12c # Don't optimize the processing of the last rows of
a matrix matrix product in cases that violate
the assumptions made by the optimized code path. +8986136f4fdd4 # Remove the rotating kernel. +898e68e165a23 # Bug 256: enable vectorization with unaligned loads/stores. +91466e99ab6a1 # Relax mixing-type constraints for binary coeff-wise operators +91776236cdea4 # merge +917101ea26f5e # Include the cost of stores in unrolling +921672076db5d # Fix perf regression introduced in changeset e56aabf205 +9210fa9e4a15c # Fix perf regression in dgemm introduced by changeset 5d51a7f12c +936f6b3cf8de9 # 3.3-beta2 +944504a4404f1 # Optimize expression matching 'd?=a-b*c' as 'd?=a; d?=b*c;' +95877e27fbeee # 3.3-rc1 +959779774f98c # Bug 1311: fix alignment logic in some cases
of (scalar*small).lazyProduct(small) +9729f9d8d2f62 # Disabled part of the matrix matrix peeling code
that's incompatible with 512 bit registers +979eeac81b8c0 # 3.3.0 +989c927af60ed # Fix a performance regression in (mat*mat)*vec
for which mat*mat was evaluated multiple times. +994fe696022ec # Operators += and -= do not resize! +99466f65ccc36 # Ease compiler generating clean and efficient code in mat*vec +9946a5fe86098 # Complete rewrite of column-major-matrix * vector product
to deliver higher performance on modern CPUs. +99591003f3b86 # Improve performance of row-major-dense-matrix * vector products
for recent CPUs. +997eb621413c1 # Revert vec/y to vec*(1/y) in row-major TRSM +10444bbc320468 # Bug 1435: fix aliasing issue in expressions like: A = C - B*A; +1073624df50945 # Adds missing EIGEN_STRONG_INLINE to support MSVC
properly inlining small vector calculations +1094d428a199ab # Bug 1562: optimize evaluation of small products
of the form s*A*B by rewriting them as: s*(A.lazyProduct(B))
to save a costly temporary.
Measured speedup from 2x to 5x. +1096de9e31a06d # Introduce the macro ei_declare_local_nested_eval to
help allocate local temporaries on the stack via alloca,
and let outer-products make good use of it. +11087b91c11207 # Bug 1578: Improve prefetching in matrix multiplication on MIPS. +1153aa110e681b # PR 526: Speed up multiplication of small, dynamically sized matrices +11544ad359237a # Vectorize row-by-row gebp loop iterations on 16 packets as well +1157a476054879 # Bug 1624: improve matrix-matrix product on ARM 64, 20% speedup +1160a4159dba08 # do not read buffers out of bounds +1163c53eececb0 # Implement AVX512 vectorization of std::complex +11644e7746fe22 # Bug 1636: fix gemm performance issue with gcc>=6 and no FMA +1164956678a4ef # Bug 1515: disable gebp's 3pX4 micro kernel
for MSVC<=19.14 because of register spilling. +1165426bce7529 # fix EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
for non vectorized type, and non x86/64 target +11660d90637838 # enable spilling workaround on architectures with SSE/AVX +1166f159cf3d75 # Artificially increase l1-blocking size for AVX512.
+10% speedup with current kernels. +11686dd93f7e3b # Make code compile again for older compilers. +1175dbfcceabf5 # Bug: 1633: refactor gebp kernel and optimize for neon +117670e133333d # Bug 1661: fix regression in GEBP and AVX512 +11760f028f61cb # GEBP: cleanup logic to choose between
a 4 packets of 1 packet (=e118ce86fd+fix) +1180de77bf5d6c # gebp: Add new ½ and ¼ packet rows per (peeling) round on the lhs diff --git a/bench/perf_monitoring/gemm.cpp b/bench/perf_monitoring/gemm.cpp new file mode 100644 index 000000000..804139db7 --- /dev/null +++ b/bench/perf_monitoring/gemm.cpp @@ -0,0 +1,12 @@ +#include "gemm_common.h" + +EIGEN_DONT_INLINE +void gemm(const Mat &A, const Mat &B, Mat &C) +{ + C.noalias() += A * B; +} + +int main(int argc, char **argv) +{ + return main_gemm(argc, argv, gemm); +} diff --git a/bench/perf_monitoring/gemm/changesets.txt b/bench/perf_monitoring/gemm/changesets.txt deleted file mode 100644 index af8eb9b8f..000000000 --- a/bench/perf_monitoring/gemm/changesets.txt +++ /dev/null @@ -1,61 +0,0 @@ -#3.0.1 -#3.1.1 -#3.2.0 -3.2.4 -#5745:37f59e65eb6c -5891:d8652709345d # introduce AVX -#5893:24b4dc92c6d3 # merge -5895:997c2ef9fc8b # introduce FMA -#5904:e1eafd14eaa1 # complex and AVX -5908:f8ee3c721251 # improve packing with ptranspose -#5921:ca808bb456b0 # merge -#5927:8b1001f9e3ac -5937:5a4ca1ad8c53 # New gebp kernel handling up to 3 packets x 4 register-level blocks -#5949:f3488f4e45b2 # merge -#5969:e09031dccfd9 # Disable 3pX4 kernel on Altivec -#5992:4a429f5e0483 # merge -before-evaluators -#6334:f6a45e5b8b7c # Implement evaluator for sparse outer products -#6639:c9121c60b5c7 -#6655:06f163b5221f # Properly detect FMA support on ARM -#6677:700e023044e7 # FMA has been wrongly disabled -#6681:11d31dafb0e3 -#6699:5e6e8e10aad1 # merge default to tensors -#6726:ff2d2388e7b9 # merge default to tensors -#6742:0cbd6195e829 # merge default to tensors -#6747:853d2bafeb8f # Generalized the gebp apis -6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation -#6781:9cc5a931b2c6 # generalized gemv -#6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product -#6844:039efd86b75c # 
merge tensor -6845:7333ed40c6ef # change prefetching in gebp -#6856:b5be5e10eb7f # merge index conversion -#6893:c3a64aba7c70 # clean blocking size computation -#6898:6fb31ebe6492 # rotating kernel for ARM -6899:877facace746 # rotating kernel for ARM only -#6904:c250623ae9fa # result_of -6921:915f1b1fc158 # fix prefetching change for ARM -6923:9ff25f6dacc6 # prefetching -6933:52572e60b5d3 # blocking size strategy -6937:c8c042f286b2 # avoid redundant pack_rhs -6981:7e5d6f78da59 # dynamic loop swapping -6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache -6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1. -7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5) -7015:8aad8f35c955 # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables -7016:a58d253e8c91 # Polish lookup tables generation -7018:9b27294a8186 # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment -7019:c758b1e2c073 # Provide a empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now. -7085:627e039fba68 # Bug 986: add support for coefficient-based product with 0 depth. -7098:b6f1db9cf9ec # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code -7591:09a8e2186610 # 3.3-alpha1 -7650:b0f3c8f43025 # help clang inlining -#8744:74b789ada92a # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs) -8789:efcb912e4356 # Made the index type a template parameter to evaluateProductBlockingSizes. 
Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes -8972:81d53c711775 # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path -8985:d935df21a082 # Remove the rotating kernel. -8988:6c2dc56e73b3 # Bug 256: enable vectorization with unaligned loads/stores. -9148:b8b8c421e36c # Relax mixing-type constraints for binary coefficient-wise operators -9174:d228bc282ac9 # merge -9212:c90098affa7b # Fix performance regression introduced in changeset 8aad8f35c955 -9213:9f1c14e4694b # Fix performance regression in dgemm introduced by changeset 81d53c711775 diff --git a/bench/perf_monitoring/gemm/gemm.cpp b/bench/perf_monitoring/gemm/gemm.cpp deleted file mode 100644 index 614bd4737..000000000 --- a/bench/perf_monitoring/gemm/gemm.cpp +++ /dev/null @@ -1,67 +0,0 @@ -#include -#include -#include -#include -#include "../../BenchTimer.h" -using namespace Eigen; - -#ifndef SCALAR -#error SCALAR must be defined -#endif - -typedef SCALAR Scalar; - -typedef Matrix Mat; - -EIGEN_DONT_INLINE -void gemm(const Mat &A, const Mat &B, Mat &C) -{ - C.noalias() += A * B; -} - -EIGEN_DONT_INLINE -double bench(long m, long n, long k) -{ - Mat A(m,k); - Mat B(k,n); - Mat C(m,n); - A.setRandom(); - B.setRandom(); - C.setZero(); - - BenchTimer t; - - double up = 1e8*4/sizeof(Scalar); - double tm0 = 4, tm1 = 10; - if(NumTraits::IsComplex) - { - up /= 4; - tm0 = 2; - tm1 = 4; - } - - double flops = 2. 
* m * n * k; - long rep = std::max(1., std::min(100., up/flops) ); - long tries = std::max(tm0, std::min(tm1, up/flops) ); - - BENCH(t, tries, rep, gemm(A,B,C)); - - return 1e-9 * rep * flops / t.best(); -} - -int main(int argc, char **argv) -{ - std::vector results; - - std::ifstream settings("gemm_settings.txt"); - long m, n, k; - while(settings >> m >> n >> k) - { - //std::cerr << " Testing " << m << " " << n << " " << k << std::endl; - results.push_back( bench(m, n, k) ); - } - - std::cout << RowVectorXd::Map(results.data(), results.size()); - - return 0; -} diff --git a/bench/perf_monitoring/gemm/gemm_settings.txt b/bench/perf_monitoring/gemm/gemm_settings.txt deleted file mode 100644 index 5c43e1c7d..000000000 --- a/bench/perf_monitoring/gemm/gemm_settings.txt +++ /dev/null @@ -1,15 +0,0 @@ -8 8 8 -9 9 9 -24 24 24 -239 239 239 -240 240 240 -2400 24 24 -24 2400 24 -24 24 2400 -24 2400 2400 -2400 24 2400 -2400 2400 24 -2400 2400 64 -4800 23 160 -23 4800 160 -2400 2400 2400 diff --git a/bench/perf_monitoring/gemm/lazy_gemm.cpp b/bench/perf_monitoring/gemm/lazy_gemm.cpp deleted file mode 100644 index 6dc370155..000000000 --- a/bench/perf_monitoring/gemm/lazy_gemm.cpp +++ /dev/null @@ -1,98 +0,0 @@ -#include -#include -#include -#include -#include "../../BenchTimer.h" -using namespace Eigen; - -#ifndef SCALAR -#error SCALAR must be defined -#endif - -typedef SCALAR Scalar; - -template -EIGEN_DONT_INLINE -void lazy_gemm(const MatA &A, const MatB &B, MatC &C) -{ -// escape((void*)A.data()); -// escape((void*)B.data()); - C.noalias() += A.lazyProduct(B); -// escape((void*)C.data()); -} - -template -EIGEN_DONT_INLINE -double bench() -{ - typedef Matrix MatA; - typedef Matrix MatB; - typedef Matrix MatC; - - MatA A(m,k); - MatB B(k,n); - MatC C(m,n); - A.setRandom(); - B.setRandom(); - C.setZero(); - - BenchTimer t; - - double up = 1e7*4/sizeof(Scalar); - double tm0 = 10, tm1 = 20; - - double flops = 2. 
* m * n * k; - long rep = std::max(10., std::min(10000., up/flops) ); - long tries = std::max(tm0, std::min(tm1, up/flops) ); - - BENCH(t, tries, rep, lazy_gemm(A,B,C)); - - return 1e-9 * rep * flops / t.best(); -} - -template -double bench_t(int t) -{ - if(t) - return bench(); - else - return bench(); -} - -EIGEN_DONT_INLINE -double bench_mnk(int m, int n, int k, int t) -{ - int id = m*10000 + n*100 + k; - switch(id) { - case 10101 : return bench_t< 1, 1, 1>(t); break; - case 20202 : return bench_t< 2, 2, 2>(t); break; - case 30303 : return bench_t< 3, 3, 3>(t); break; - case 40404 : return bench_t< 4, 4, 4>(t); break; - case 50505 : return bench_t< 5, 5, 5>(t); break; - case 60606 : return bench_t< 6, 6, 6>(t); break; - case 70707 : return bench_t< 7, 7, 7>(t); break; - case 80808 : return bench_t< 8, 8, 8>(t); break; - case 90909 : return bench_t< 9, 9, 9>(t); break; - case 101010 : return bench_t<10,10,10>(t); break; - case 111111 : return bench_t<11,11,11>(t); break; - case 121212 : return bench_t<12,12,12>(t); break; - } - return 0; -} - -int main(int argc, char **argv) -{ - std::vector results; - - std::ifstream settings("lazy_gemm_settings.txt"); - long m, n, k, t; - while(settings >> m >> n >> k >> t) - { - //std::cerr << " Testing " << m << " " << n << " " << k << std::endl; - results.push_back( bench_mnk(m, n, k, t) ); - } - - std::cout << RowVectorXd::Map(results.data(), results.size()); - - return 0; -} diff --git a/bench/perf_monitoring/gemm/lazy_gemm_settings.txt b/bench/perf_monitoring/gemm/lazy_gemm_settings.txt deleted file mode 100644 index 407d5d4fa..000000000 --- a/bench/perf_monitoring/gemm/lazy_gemm_settings.txt +++ /dev/null @@ -1,15 +0,0 @@ -1 1 1 0 -2 2 2 0 -3 3 3 0 -4 4 4 0 -4 4 4 1 -5 5 5 0 -6 6 6 0 -7 7 7 0 -7 7 7 1 -8 8 8 0 -9 9 9 0 -10 10 10 0 -11 11 11 0 -12 12 12 0 -12 12 12 1 diff --git a/bench/perf_monitoring/gemm/make_plot.sh b/bench/perf_monitoring/gemm/make_plot.sh deleted file mode 100755 index cd3214ac9..000000000 --- 
a/bench/perf_monitoring/gemm/make_plot.sh +++ /dev/null @@ -1,38 +0,0 @@ -#!/bin/bash - -# base name of the bench -# it reads $1.out -# and generates $1.pdf -WHAT=$1 -bench=$2 - -header="rev " -while read line -do - if [ ! -z '$line' ]; then - header="$header \"$line\"" - fi -done < $bench"_settings.txt" - -echo $header > $WHAT.out.header -cat $WHAT.out >> $WHAT.out.header - - -echo "set title '$WHAT'" > $WHAT.gnuplot -echo "set key autotitle columnhead outside " >> $WHAT.gnuplot -echo "set xtics rotate 1" >> $WHAT.gnuplot - -echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot -echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot - -col=`cat $bench"_settings.txt" | wc -l` -echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot -echo " " >> $WHAT.gnuplot - -gnuplot -persist < $WHAT.gnuplot - -# generate a png file -# convert -background white -density 120 -rotate 90 -resize 800 +dither -colors 256 -quality 0 $WHAT.ps -background white -flatten .$WHAT.png - -# clean -rm $WHAT.out.header $WHAT.gnuplot \ No newline at end of file diff --git a/bench/perf_monitoring/gemm/run.sh b/bench/perf_monitoring/gemm/run.sh deleted file mode 100755 index 9d6ee40bc..000000000 --- a/bench/perf_monitoring/gemm/run.sh +++ /dev/null @@ -1,156 +0,0 @@ -#!/bin/bash - -# ./run.sh gemm -# ./run.sh lazy_gemm - -# Examples of environment variables to be set: -# PREFIX="haswell-fma-" -# CXX_FLAGS="-mfma" - -# Options: -# -up : enforce the recomputation of existing data, and keep best results as a merging strategy -# -s : recompute selected changesets only and keep bests - -bench=$1 - -if echo "$*" | grep '\-up' > /dev/null; then - update=true -else - update=false -fi - -if echo "$*" | grep '\-s' > /dev/null; then - selected=true -else - selected=false -fi - -global_args="$*" - -if [ $selected == true ]; then - echo "Recompute selected changesets only and keep bests" -elif [ $update == true ]; then - echo 
"(Re-)Compute all changesets and keep bests" -else - echo "Skip previously computed changesets" -fi - - - -if [ ! -d "eigen_src" ]; then - hg clone https://bitbucket.org/eigen/eigen eigen_src -else - cd eigen_src - hg pull -u - cd .. -fi - -if [ ! -z '$CXX' ]; then - CXX=g++ -fi - -function make_backup -{ - if [ -f "$1.out" ]; then - mv "$1.out" "$1.backup" - fi -} - -function merge -{ - count1=`echo $1 | wc -w` - count2=`echo $2 | wc -w` - - if [ $count1 == $count2 ]; then - a=( $1 ); b=( $2 ) - res="" - for (( i=0 ; i<$count1 ; i++ )); do - ai=${a[$i]}; bi=${b[$i]} - tmp=`echo "if ($ai > $bi) $ai else $bi " | bc -l` - res="$res $tmp" - done - echo $res - - else - echo $1 - fi -} - -function test_current -{ - rev=$1 - scalar=$2 - name=$3 - - prev="" - if [ -e "$name.backup" ]; then - prev=`grep $rev "$name.backup" | cut -c 14-` - fi - res=$prev - count_rev=`echo $prev | wc -w` - count_ref=`cat $bench"_settings.txt" | wc -l` - if echo "$global_args" | grep "$rev" > /dev/null; then - rev_found=true - else - rev_found=false - fi -# echo $update et $selected et $rev_found because $rev et "$global_args" -# echo $count_rev et $count_ref - if [ $update == true ] || [ $count_rev != $count_ref ] || ([ $selected == true ] && [ $rev_found == true ]); then - if $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src $bench.cpp -DSCALAR=$scalar -o $name; then - curr=`./$name` - if [ $count_rev == $count_ref ]; then - echo "merge previous $prev" - echo "with new $curr" - else - echo "got $curr" - fi - res=`merge "$curr" "$prev"` -# echo $res - echo "$rev $res" >> $name.out - else - echo "Compilation failed, skip rev $rev" - fi - else - echo "Skip existing results for $rev / $name" - echo "$rev $res" >> $name.out - fi -} - -make_backup $PREFIX"s"$bench -make_backup $PREFIX"d"$bench -make_backup $PREFIX"c"$bench - -cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev -do - if [ ! 
-z '$rev' ]; then - echo "Testing rev $rev" - cd eigen_src - hg up -C $rev > /dev/null - actual_rev=`hg identify | cut -f1 -d' '` - cd .. - - test_current $actual_rev float $PREFIX"s"$bench - test_current $actual_rev double $PREFIX"d"$bench - test_current $actual_rev "std::complex" $PREFIX"c"$bench - fi - -done - -echo "Float:" -cat $PREFIX"s""$bench.out" -echo " " - -echo "Double:" -cat $PREFIX"d""$bench.out" -echo "" - -echo "Complex:" -cat $PREFIX"c""$bench.out" -echo "" - -./make_plot.sh $PREFIX"s"$bench $bench -./make_plot.sh $PREFIX"d"$bench $bench -./make_plot.sh $PREFIX"c"$bench $bench - - diff --git a/bench/perf_monitoring/gemm_common.h b/bench/perf_monitoring/gemm_common.h new file mode 100644 index 000000000..30dbc0df6 --- /dev/null +++ b/bench/perf_monitoring/gemm_common.h @@ -0,0 +1,67 @@ +#include +#include +#include +#include +#include "eigen_src/Eigen/Core" +#include "../BenchTimer.h" +using namespace Eigen; + +#ifndef SCALAR +#error SCALAR must be defined +#endif + +typedef SCALAR Scalar; + +typedef Matrix Mat; + +template +EIGEN_DONT_INLINE +double bench(long m, long n, long k, const Func& f) +{ + Mat A(m,k); + Mat B(k,n); + Mat C(m,n); + A.setRandom(); + B.setRandom(); + C.setZero(); + + BenchTimer t; + + double up = 1e8*4/sizeof(Scalar); + double tm0 = 4, tm1 = 10; + if(NumTraits::IsComplex) + { + up /= 4; + tm0 = 2; + tm1 = 4; + } + + double flops = 2. 
* m * n * k; + long rep = std::max(1., std::min(100., up/flops) ); + long tries = std::max(tm0, std::min(tm1, up/flops) ); + + BENCH(t, tries, rep, f(A,B,C)); + + return 1e-9 * rep * flops / t.best(); +} + +template +int main_gemm(int argc, char **argv, const Func& f) +{ + std::vector results; + + std::string filename = std::string("gemm_settings.txt"); + if(argc>1) + filename = std::string(argv[1]); + std::ifstream settings(filename); + long m, n, k; + while(settings >> m >> n >> k) + { + //std::cerr << " Testing " << m << " " << n << " " << k << std::endl; + results.push_back( bench(m, n, k, f) ); + } + + std::cout << RowVectorXd::Map(results.data(), results.size()); + + return 0; +} diff --git a/bench/perf_monitoring/gemm_settings.txt b/bench/perf_monitoring/gemm_settings.txt new file mode 100644 index 000000000..5c43e1c7d --- /dev/null +++ b/bench/perf_monitoring/gemm_settings.txt @@ -0,0 +1,15 @@ +8 8 8 +9 9 9 +24 24 24 +239 239 239 +240 240 240 +2400 24 24 +24 2400 24 +24 24 2400 +24 2400 2400 +2400 24 2400 +2400 2400 24 +2400 2400 64 +4800 23 160 +23 4800 160 +2400 2400 2400 diff --git a/bench/perf_monitoring/gemm_square_settings.txt b/bench/perf_monitoring/gemm_square_settings.txt new file mode 100644 index 000000000..98474d173 --- /dev/null +++ b/bench/perf_monitoring/gemm_square_settings.txt @@ -0,0 +1,11 @@ +8 8 8 +9 9 9 +12 12 12 +15 15 15 +16 16 16 +24 24 24 +102 102 102 +239 239 239 +240 240 240 +2400 2400 2400 +2463 2463 2463 diff --git a/bench/perf_monitoring/gemv.cpp b/bench/perf_monitoring/gemv.cpp new file mode 100644 index 000000000..82e5ab960 --- /dev/null +++ b/bench/perf_monitoring/gemv.cpp @@ -0,0 +1,12 @@ +#include "gemv_common.h" + +EIGEN_DONT_INLINE +void gemv(const Mat &A, const Vec &B, Vec &C) +{ + C.noalias() += A * B; +} + +int main(int argc, char **argv) +{ + return main_gemv(argc, argv, gemv); +} diff --git a/bench/perf_monitoring/gemv_common.h b/bench/perf_monitoring/gemv_common.h new file mode 100644 index 000000000..cc3257729 --- 
/dev/null +++ b/bench/perf_monitoring/gemv_common.h @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include +#include "eigen_src/Eigen/Core" +#include "../BenchTimer.h" +using namespace Eigen; + +#ifndef SCALAR +#error SCALAR must be defined +#endif + +typedef SCALAR Scalar; + +typedef Matrix Mat; +typedef Matrix Vec; + +template +EIGEN_DONT_INLINE +double bench(long m, long n, Func &f) +{ + Mat A(m,n); + Vec B(n); + Vec C(m); + A.setRandom(); + B.setRandom(); + C.setRandom(); + + BenchTimer t; + + double up = 1e8/sizeof(Scalar); + double tm0 = 4, tm1 = 10; + if(NumTraits::IsComplex) + { + up /= 4; + tm0 = 2; + tm1 = 4; + } + + double flops = 2. * m * n; + long rep = std::max(1., std::min(100., up/flops) ); + long tries = std::max(tm0, std::min(tm1, up/flops) ); + + BENCH(t, tries, rep, f(A,B,C)); + + return 1e-9 * rep * flops / t.best(); +} + +template +int main_gemv(int argc, char **argv, Func& f) +{ + std::vector results; + + std::string filename = std::string("gemv_settings.txt"); + if(argc>1) + filename = std::string(argv[1]); + std::ifstream settings(filename); + long m, n; + while(settings >> m >> n) + { + //std::cerr << " Testing " << m << " " << n << std::endl; + results.push_back( bench(m, n, f) ); + } + + std::cout << RowVectorXd::Map(results.data(), results.size()); + + return 0; +} diff --git a/bench/perf_monitoring/gemv_settings.txt b/bench/perf_monitoring/gemv_settings.txt new file mode 100644 index 000000000..21a5ee051 --- /dev/null +++ b/bench/perf_monitoring/gemv_settings.txt @@ -0,0 +1,11 @@ +8 8 +9 9 +24 24 +239 239 +240 240 +2400 24 +24 2400 +24 240 +2400 2400 +4800 23 +23 4800 diff --git a/bench/perf_monitoring/gemv_square_settings.txt b/bench/perf_monitoring/gemv_square_settings.txt new file mode 100644 index 000000000..5165759f4 --- /dev/null +++ b/bench/perf_monitoring/gemv_square_settings.txt @@ -0,0 +1,13 @@ +8 8 +9 9 +12 12 +15 15 +16 16 +24 24 +53 53 +74 74 +102 102 +239 239 +240 240 +2400 2400 +2463 2463 diff --git 
a/bench/perf_monitoring/gemvt.cpp b/bench/perf_monitoring/gemvt.cpp new file mode 100644 index 000000000..fe945767e --- /dev/null +++ b/bench/perf_monitoring/gemvt.cpp @@ -0,0 +1,12 @@ +#include "gemv_common.h" + +EIGEN_DONT_INLINE +void gemv(const Mat &A, Vec &B, const Vec &C) +{ + B.noalias() += A.transpose() * C; +} + +int main(int argc, char **argv) +{ + return main_gemv(argc, argv, gemv); +} diff --git a/bench/perf_monitoring/lazy_gemm.cpp b/bench/perf_monitoring/lazy_gemm.cpp new file mode 100644 index 000000000..773306048 --- /dev/null +++ b/bench/perf_monitoring/lazy_gemm.cpp @@ -0,0 +1,101 @@ +#include +#include +#include +#include +#include "../../BenchTimer.h" +using namespace Eigen; + +#ifndef SCALAR +#error SCALAR must be defined +#endif + +typedef SCALAR Scalar; + +template +EIGEN_DONT_INLINE +void lazy_gemm(const MatA &A, const MatB &B, MatC &C) +{ +// escape((void*)A.data()); +// escape((void*)B.data()); + C.noalias() += A.lazyProduct(B); +// escape((void*)C.data()); +} + +template +EIGEN_DONT_INLINE +double bench() +{ + typedef Matrix MatA; + typedef Matrix MatB; + typedef Matrix MatC; + + MatA A(m,k); + MatB B(k,n); + MatC C(m,n); + A.setRandom(); + B.setRandom(); + C.setZero(); + + BenchTimer t; + + double up = 1e7*4/sizeof(Scalar); + double tm0 = 10, tm1 = 20; + + double flops = 2. 
* m * n * k; + long rep = std::max(10., std::min(10000., up/flops) ); + long tries = std::max(tm0, std::min(tm1, up/flops) ); + + BENCH(t, tries, rep, lazy_gemm(A,B,C)); + + return 1e-9 * rep * flops / t.best(); +} + +template +double bench_t(int t) +{ + if(t) + return bench(); + else + return bench(); +} + +EIGEN_DONT_INLINE +double bench_mnk(int m, int n, int k, int t) +{ + int id = m*10000 + n*100 + k; + switch(id) { + case 10101 : return bench_t< 1, 1, 1>(t); break; + case 20202 : return bench_t< 2, 2, 2>(t); break; + case 30303 : return bench_t< 3, 3, 3>(t); break; + case 40404 : return bench_t< 4, 4, 4>(t); break; + case 50505 : return bench_t< 5, 5, 5>(t); break; + case 60606 : return bench_t< 6, 6, 6>(t); break; + case 70707 : return bench_t< 7, 7, 7>(t); break; + case 80808 : return bench_t< 8, 8, 8>(t); break; + case 90909 : return bench_t< 9, 9, 9>(t); break; + case 101010 : return bench_t<10,10,10>(t); break; + case 111111 : return bench_t<11,11,11>(t); break; + case 121212 : return bench_t<12,12,12>(t); break; + } + return 0; +} + +int main(int argc, char **argv) +{ + std::vector results; + + std::string filename = std::string("lazy_gemm_settings.txt"); + if(argc>1) + filename = std::string(argv[1]); + std::ifstream settings(filename); + long m, n, k, t; + while(settings >> m >> n >> k >> t) + { + //std::cerr << " Testing " << m << " " << n << " " << k << std::endl; + results.push_back( bench_mnk(m, n, k, t) ); + } + + std::cout << RowVectorXd::Map(results.data(), results.size()); + + return 0; +} diff --git a/bench/perf_monitoring/lazy_gemm_settings.txt b/bench/perf_monitoring/lazy_gemm_settings.txt new file mode 100644 index 000000000..407d5d4fa --- /dev/null +++ b/bench/perf_monitoring/lazy_gemm_settings.txt @@ -0,0 +1,15 @@ +1 1 1 0 +2 2 2 0 +3 3 3 0 +4 4 4 0 +4 4 4 1 +5 5 5 0 +6 6 6 0 +7 7 7 0 +7 7 7 1 +8 8 8 0 +9 9 9 0 +10 10 10 0 +11 11 11 0 +12 12 12 0 +12 12 12 1 diff --git a/bench/perf_monitoring/llt.cpp b/bench/perf_monitoring/llt.cpp new 
file mode 100644 index 000000000..d55b7d803 --- /dev/null +++ b/bench/perf_monitoring/llt.cpp @@ -0,0 +1,15 @@ +#include "gemm_common.h" +#include + +EIGEN_DONT_INLINE +void llt(const Mat &A, const Mat &B, Mat &C) +{ + C = A; + C.diagonal().array() += 1000; + Eigen::internal::llt_inplace::blocked(C); +} + +int main(int argc, char **argv) +{ + return main_gemm(argc, argv, llt); +} diff --git a/bench/perf_monitoring/make_plot.sh b/bench/perf_monitoring/make_plot.sh new file mode 100755 index 000000000..65aaf66f9 --- /dev/null +++ b/bench/perf_monitoring/make_plot.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +# base name of the bench +# it reads $1.out +# and generates $1.pdf +WHAT=$1 +bench=$2 +settings_file=$3 + +header="rev " +while read line +do + if [ ! -z '$line' ]; then + header="$header \"$line\"" + fi +done < $settings_file + +echo $header > $WHAT.out.header +cat $WHAT.out >> $WHAT.out.header + + +echo "set title '$WHAT'" > $WHAT.gnuplot +echo "set key autotitle columnhead outside " >> $WHAT.gnuplot +echo "set xtics rotate 1" >> $WHAT.gnuplot + +echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in" >> $WHAT.gnuplot +echo set output "'"$WHAT.pdf"'" >> $WHAT.gnuplot + +col=`cat $settings_file | wc -l` +echo "plot for [col=2:$col+1] '$WHAT.out.header' using 0:col:xticlabels(1) with lines" >> $WHAT.gnuplot +echo " " >> $WHAT.gnuplot + +gnuplot -persist < $WHAT.gnuplot + +# generate a png file (thumbnail) +convert -colors 256 -background white -density 300 -resize 300 -quality 0 $WHAT.pdf -background white -flatten $WHAT.png + +# clean +rm $WHAT.out.header $WHAT.gnuplot + + +# generate html/svg graph + +echo " " > $WHAT.html +cat resources/chart_header.html > $WHAT.html +echo 'var customSettings = {"TITLE":"","SUBTITLE":"","XLABEL":"","YLABEL":""};' >> $WHAT.html +# 'data' is an array of datasets (i.e. 
curves), each of which is an object of the form +# { +# key: , +# color: , +# values: [{ +# r: , +# v: +# }] +# } +echo 'var data = [' >> $WHAT.html + +col=2 +while read line +do + if [ ! -z '$line' ]; then + header="$header \"$line\"" + echo '{"key":"'$line'","values":[' >> $WHAT.html + i=0 + while read line2 + do + if [ ! -z "$line2" ]; then + val=`echo $line2 | cut -s -f $col -d ' '` + if [ -n "$val" ]; then # skip build failures + echo '{"r":'$i',"v":'$val'},' >> $WHAT.html + fi + fi + ((i++)) + done < $WHAT.out + echo ']},' >> $WHAT.html + fi + ((col++)) +done < $settings_file +echo '];' >> $WHAT.html + +echo 'var changesets = [' >> $WHAT.html +while read line2 +do + if [ ! -z '$line2' ]; then + echo '"'`echo $line2 | cut -f 1 -d ' '`'",' >> $WHAT.html + fi +done < $WHAT.out +echo '];' >> $WHAT.html + +echo 'var changesets_details = [' >> $WHAT.html +while read line2 +do + if [ ! -z '$line2' ]; then + num=`echo "$line2" | cut -f 1 -d ' '` + comment=`grep ":$num" changesets.txt | cut -f 2 -d '#'` + echo '"'"$comment"'",' >> $WHAT.html + fi +done < $WHAT.out +echo '];' >> $WHAT.html + +echo 'var changesets_count = [' >> $WHAT.html +i=0 +while read line2 +do + if [ ! 
-z '$line2' ]; then + echo $i ',' >> $WHAT.html + fi + ((i++)) +done < $WHAT.out +echo '];' >> $WHAT.html + +cat resources/chart_footer.html >> $WHAT.html diff --git a/bench/perf_monitoring/resources/chart_footer.html b/bench/perf_monitoring/resources/chart_footer.html new file mode 100644 index 000000000..a96cdb898 --- /dev/null +++ b/bench/perf_monitoring/resources/chart_footer.html @@ -0,0 +1,41 @@ + /* setup the chart and its options */ + var chart = nv.models.lineChart() + .color(d3.scale.category10().range()) + .margin({left: 75, bottom: 100}) + .forceX([0]).forceY([0]); + + chart.x(function(datum){ return datum.r; }) + .xAxis.options({ + axisLabel: customSettings.XLABEL || 'Changeset', + tickFormat: d3.format('.0f') + }); + chart.xAxis + .tickValues(changesets_count) + .tickFormat(function(d){return changesets[d]}) + .rotateLabels(-90); + + chart.y(function(datum){ return datum.v; }) + .yAxis.options({ + axisLabel: customSettings.YLABEL || 'GFlops'/*, + tickFormat: function(val){ return d3.format('.0f')(val) + ' GFlops'; }*/ + }); + + chart.tooltip.headerFormatter(function(d) { return changesets[d] + + '

' + + changesets_details[d] + "

"; }); + + //chart.useInteractiveGuideline(true); + d3.select('#chart').datum(data).call(chart); + var plot = d3.select('#chart > g'); + + /* setup the title */ + plot.append('text') + .style('font-size', '24px') + .attr('text-anchor', 'middle').attr('x', '50%').attr('y', '20px') + .text(customSettings.TITLE || ''); + + /* ensure the chart is responsive */ + nv.utils.windowResize(chart.update); + + + diff --git a/bench/perf_monitoring/resources/chart_header.html b/bench/perf_monitoring/resources/chart_header.html new file mode 100644 index 000000000..27eb02e54 --- /dev/null +++ b/bench/perf_monitoring/resources/chart_header.html @@ -0,0 +1,45 @@ + + + + + + + + + + + + +