aboutsummaryrefslogtreecommitdiff
path: root/bench/bench_gemm.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'bench/bench_gemm.cpp')
-rw-r--r--bench/bench_gemm.cpp117
1 files changed, 93 insertions, 24 deletions
diff --git a/bench/bench_gemm.cpp b/bench/bench_gemm.cpp
index 41ca8b3b6..8528c5587 100644
--- a/bench/bench_gemm.cpp
+++ b/bench/bench_gemm.cpp
@@ -2,6 +2,14 @@
// g++-4.4 bench_gemm.cpp -I .. -O2 -DNDEBUG -lrt -fopenmp && OMP_NUM_THREADS=2 ./a.out
// icpc bench_gemm.cpp -I .. -O3 -DNDEBUG -lrt -openmp && OMP_NUM_THREADS=2 ./a.out
+// Compilation options:
+//
+// -DSCALAR=std::complex<double>
+// -DSCALARA=double or -DSCALARB=double
+// -DHAVE_BLAS
+// -DDECOUPLED
+//
+
#include <iostream>
#include <Eigen/Core>
#include <bench/BenchTimer.h>
@@ -14,10 +22,18 @@ using namespace Eigen;
#define SCALAR float
#endif
+#ifndef SCALARA
+#define SCALARA SCALAR
+#endif
+
+#ifndef SCALARB
+#define SCALARB SCALAR
+#endif
+
typedef SCALAR Scalar;
typedef NumTraits<Scalar>::Real RealScalar;
-typedef Matrix<RealScalar,Dynamic,Dynamic> A;
-typedef Matrix</*Real*/Scalar,Dynamic,Dynamic> B;
+typedef Matrix<SCALARA,Dynamic,Dynamic> A;
+typedef Matrix<SCALARB,Dynamic,Dynamic> B;
typedef Matrix<Scalar,Dynamic,Dynamic> C;
typedef Matrix<RealScalar,Dynamic,Dynamic> M;
@@ -129,35 +145,69 @@ int main(int argc, char ** argv)
int tries = 2; // number of tries, we keep the best
int s = 2048;
- int cache_size = -1;
+ int m = s;
+ int n = s;
+ int p = s;
+ int cache_size1=-1, cache_size2=l2, cache_size3 = 0;
bool need_help = false;
- for (int i=1; i<argc; ++i)
+ for (int i=1; i<argc;)
{
- if(argv[i][0]=='s')
- s = atoi(argv[i]+1);
- else if(argv[i][0]=='c')
- cache_size = atoi(argv[i]+1);
- else if(argv[i][0]=='t')
- tries = atoi(argv[i]+1);
- else if(argv[i][0]=='p')
- rep = atoi(argv[i]+1);
+ if(argv[i][0]=='-')
+ {
+ if(argv[i][1]=='s')
+ {
+ ++i;
+ s = atoi(argv[i++]);
+ m = n = p = s;
+ if(argv[i][0]!='-')
+ {
+ n = atoi(argv[i++]);
+ p = atoi(argv[i++]);
+ }
+ }
+ else if(argv[i][1]=='c')
+ {
+ ++i;
+ cache_size1 = atoi(argv[i++]);
+ if(argv[i][0]!='-')
+ {
+ cache_size2 = atoi(argv[i++]);
+ if(argv[i][0]!='-')
+ cache_size3 = atoi(argv[i++]);
+ }
+ }
+ else if(argv[i][1]=='t')
+ {
+ ++i;
+ tries = atoi(argv[i++]);
+ }
+ else if(argv[i][1]=='p')
+ {
+ ++i;
+ rep = atoi(argv[i++]);
+ }
+ }
else
+ {
need_help = true;
+ break;
+ }
}
if(need_help)
{
- std::cout << argv[0] << " s<matrix size> c<cache size> t<nb tries> p<nb repeats>\n";
+ std::cout << argv[0] << " -s <matrix sizes> -c <cache sizes> -t <nb tries> -p <nb repeats>\n";
+ std::cout << " <matrix sizes> : size\n";
+ std::cout << " <matrix sizes> : rows columns depth\n";
return 1;
}
- if(cache_size>0)
- setCpuCacheSizes(cache_size,96*cache_size);
-
- int m = s;
- int n = s;
- int p = s;
+#if EIGEN_VERSION_AT_LEAST(3,2,90)
+ if(cache_size1>0)
+ setCpuCacheSizes(cache_size1,cache_size2,cache_size3);
+#endif
+
A a(m,p); a.setRandom();
B b(p,n); b.setRandom();
C c(m,n); c.setOnes();
@@ -172,6 +222,7 @@ int main(int argc, char ** argv)
// check the parallel product is correct
#if defined EIGEN_HAS_OPENMP
+ Eigen::initParallel();
int procs = omp_get_max_threads();
if(procs>1)
{
@@ -188,11 +239,20 @@ int main(int argc, char ** argv)
#elif defined HAVE_BLAS
blas_gemm(a,b,r);
c.noalias() += a * b;
- if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
+ if(!r.isApprox(c)) {
+ std::cout << r - c << "\n";
+ std::cerr << "Warning, your product is crap!\n\n";
+ }
#else
- gemm(a,b,c);
- r.noalias() += a.cast<Scalar>() * b.cast<Scalar>();
- if(!r.isApprox(c)) std::cerr << "Warning, your product is crap!\n\n";
+ if(1.*m*n*p<2000.*2000*2000)
+ {
+ gemm(a,b,c);
+ r.noalias() += a.cast<Scalar>() .lazyProduct( b.cast<Scalar>() );
+ if(!r.isApprox(c)) {
+ std::cout << r - c << "\n";
+ std::cerr << "Warning, your product is crap!\n\n";
+ }
+ }
#endif
#ifdef HAVE_BLAS
@@ -214,7 +274,7 @@ int main(int argc, char ** argv)
{
BenchTimer tmono;
omp_set_num_threads(1);
- Eigen::internal::setNbThreads(1);
+ Eigen::setNbThreads(1);
c = rc;
BENCH(tmono, tries, rep, gemm(a,b,c));
std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmono.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER) << "s)\n";
@@ -223,6 +283,15 @@ int main(int argc, char ** argv)
}
#endif
+ if(1.*m*n*p<30*30*30)
+ {
+ BenchTimer tmt;
+ c = rc;
+ BENCH(tmt, tries, rep, c.noalias()+=a.lazyProduct(b));
+ std::cout << "lazy cpu " << tmt.best(CPU_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(CPU_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER) << "s)\n";
+ std::cout << "lazy real " << tmt.best(REAL_TIMER)/rep << "s \t" << (double(m)*n*p*rep*2/tmt.best(REAL_TIMER))*1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER) << "s)\n";
+ }
+
#ifdef DECOUPLED
if((NumTraits<A::Scalar>::IsComplex) && (NumTraits<B::Scalar>::IsComplex))
{