Added documentation and additional benchmarking code

author: Pete Warden <petewarden@google.com> 2015-07-08 12:50:10 -0700
committer: Pete Warden <petewarden@google.com> 2015-07-08 12:50:10 -0700
commit: 5de59a56e832f01aafa27d651f9c638dfc2bb00b (patch)
tree: 0bf3af0b392748b1983aaa38c8ef918961472fae
parent: 041e4a5d64d58e5b7348e69f89b845880aae8577 (diff)
download: gemmlowp-5de59a56e832f01aafa27d651f9c638dfc2bb00b.tar.gz
2 files changed, 180 insertions, 2 deletions
diff --git a/README.txt b/README.txt
index 29fcdc2..a527e48 100644
--- a/README.txt
+++ b/README.txt
@@ -16,7 +16,7 @@ Portability, target platforms/architectures
 ===========================================
 
 Should be portable to any platform with some C++11 and POSIX support,
-while we have optional optimized code paths for specifica architectures.
+while we have optional optimized code paths for specific architectures.
 
 Required:
   C++11 (a small conservative subset of it)
@@ -79,6 +79,37 @@ Then run:
 
 $ ./scripts/test-android.sh test/test.cc eight_bit_int_gemm/eight_bit_int_gemm.cc
 
+Troubleshooting Compilation
+===========================
+
+If you're having trouble finding the compiler, follow these instructions to
+build a standalone toolchain:
+https://developer.android.com/ndk/guides/standalone_toolchain.html 
+
+Here's an example that installs the Clang 3.5 toolchain and selects its g++:
+
+$ export INSTALL_DIR=~/toolchains/clang-21-stl-gnu
+$ $NDK/build/tools/make-standalone-toolchain.sh \
+--toolchain=arm-linux-androideabi-clang3.5 --platform=android-21 \
+--install-dir=$INSTALL_DIR
+$ export CXX="$INSTALL_DIR/bin/arm-linux-androideabi-g++ \
+--sysroot=$INSTALL_DIR/sysroot"
+
+Some compilers (e.g. the default clang++ in the same bin directory) don't
+support NEON assembly. The benchmark build will issue a warning if NEON
+support isn't detected; in that case, make sure you're using a compiler such
+as arm-linux-androideabi-g++ that does support NEON.
+
+
+Benchmarking
+============
+
+To see what the performance is like on some typical operations, run:
+$ ./scripts/test-android.sh test/benchmark.cc
+
+This will compile and run a small benchmark binary, which runs through GEMMs
+with varying input matrix sizes and outputs the performance. The final test
+simulates the sort of GEMM sizes you'd expect for a GoogLeNet-style CNN.
 
 Profiling
 =========
diff --git a/test/benchmark.cc b/test/benchmark.cc
index 624390e..bb678af 100644
--- a/test/benchmark.cc
+++ b/test/benchmark.cc
@@ -27,6 +27,10 @@
 #include "test/test.h"
 #include "public/gemmlowp.h"
 
+#if defined(__arm__) && !defined(GEMMLOWP_NEON)
+#warning "Building without NEON support on ARM, check your compiler setup!"
+#endif
+
 namespace gemmlowp {
 
 double time() {
@@ -46,7 +50,7 @@ const std::size_t min_working_set_size = 16 * 1024 * 1024;
 
 template <typename Kernel, typename LhsType, typename RhsType,
           typename ResultType>
-double gflops_for_gemm_size(GemmContext* context, int rows, int depth,
+double time_for_gemm_size(GemmContext* context, int rows, int depth,
                             int cols) {
   typedef std::uint8_t Scalar;
 
@@ -108,6 +112,16 @@ double gflops_for_gemm_size(GemmContext* context, int rows, int depth,
     iters_at_a_time *= 2;
   }
 
+  return time_per_iter;
+}
+
+template <typename Kernel, typename LhsType, typename RhsType,
+          typename ResultType>
+double gflops_for_gemm_size(GemmContext* context, int rows, int depth,
+                            int cols) {
+  const double time_per_iter =
+      time_for_gemm_size<Kernel, LhsType, RhsType, ResultType>(
+          context, rows, depth, cols);  // timing is delegated to the helper
   return 2e-9 * rows * depth * cols / time_per_iter;
 }
 
@@ -191,6 +205,133 @@ void benchmark(GemmContext* context) {
   std::cout << std::endl;
 }
 
+void benchmark_googlenet(GemmContext* context) {
+  // Times each GEMM shape listed below; prints the per-repetition total time.
+#ifdef GEMMLOWP_TEST_KERNEL
+  typedef gemmlowp::GEMMLOWP_TEST_KERNEL KernelForGEMM;
+  typedef gemmlowp::GEMMLOWP_TEST_KERNEL KernelForGEMV;
+#else
+  typedef gemmlowp::DefaultKernelForGEMM KernelForGEMM;
+  typedef gemmlowp::DefaultKernelForGEMV KernelForGEMV;
+#endif
+
+  // m, n, k sizes for a typical GoogLeNet; read below as cols, rows, depth.
+  const int googlenet_gemm_sizes[] = {
+    12544, 64, 147,
+    3136, 64, 64,
+    3136, 192, 576,
+    784, 64, 192,
+    784, 96, 192,
+    784, 128, 864,
+    784, 16, 192,
+    784, 32, 400,
+    784, 32, 192,
+    784, 128, 256,
+    784, 128, 256,
+    784, 192, 1152,
+    784, 32, 256,
+    784, 96, 800,
+    784, 64, 256,
+    196, 192, 480,
+    196, 96, 480,
+    196, 204, 864,
+    196, 16, 480,
+    196, 48, 400,
+    196, 64, 480,
+    196, 160, 508,
+    196, 112, 508,
+    196, 224, 1008,
+    196, 24, 508,
+    196, 64, 600,
+    196, 64, 508,
+    196, 128, 512,
+    196, 128, 512,
+    196, 256, 1152,
+    196, 24, 512,
+    196, 64, 600,
+    196, 64, 512,
+    196, 112, 512,
+    196, 144, 512,
+    196, 288, 1296,
+    196, 32, 512,
+    196, 64, 800,
+    196, 64, 512,
+    196, 256, 528,
+    196, 160, 528,
+    196, 320, 1440,
+    196, 32, 528,
+    196, 128, 800,
+    196, 128, 528,
+    49, 256, 832,
+    49, 160, 832,
+    49, 320, 1440,
+    49, 48, 832,
+    49, 128, 1200,
+    49, 128, 832,
+    49, 384, 832,
+    49, 192, 832,
+    49, 384, 1728,
+    49, 48, 832,
+    49, 128, 1200,
+    49, 128, 832,
+    16, 128, 508,
+    1, 1024, 2048,
+    1, 1008, 1024,
+    16, 128, 528,
+    1, 1024, 2048,
+    1, 1008, 1024,
+    1, 1008, 1024,
+  };
+  const int param_count =
+      sizeof(googlenet_gemm_sizes) / sizeof(googlenet_gemm_sizes[0]);
+  const int gemm_count = param_count / 3;  // three ints per GEMM
+
+  const int repeat = 2;  // timed passes; one extra warm-up pass runs first
+
+  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
+  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
+  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
+
+#ifdef GEMMLOWP_TEST_PROFILE
+  gemmlowp::RegisterCurrentThreadForProfiling();
+  gemmlowp::StartProfiling();
+#endif
+
+  float total_time = 0;
+
+  // We don't record the first repetition, it's just warm-up.
+  for (int r = 0; r < repeat + 1; r++) {
+    std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
+              << std::flush;
+    for (int gemm_index = 0; gemm_index < gemm_count; ++gemm_index) {
+      float gemm_time = 0;  // narrows the double returned by time_for_gemm_size
+      const int rows = googlenet_gemm_sizes[(gemm_index * 3) + 1];
+      const int cols = googlenet_gemm_sizes[(gemm_index * 3) + 0];
+      const int depth = googlenet_gemm_sizes[(gemm_index * 3) + 2];
+      if (cols > KernelForGEMM::Format::kCols / 2) {
+        gemm_time =
+            time_for_gemm_size<KernelForGEMM, LhsType, RhsType, ResultType>(
+                context, rows, depth, cols);
+      } else {  // few columns: time with the GEMV kernel instead
+        gemm_time =
+            time_for_gemm_size<KernelForGEMV, LhsType, RhsType, ResultType>(
+                context, rows, depth, cols);
+      }
+      if (r > 0) {
+        total_time += gemm_time;
+      }
+    }
+  }
+
+#ifdef GEMMLOWP_TEST_PROFILE
+  gemmlowp::FinishProfiling();
+#endif
+
+  const float ms_per_network = (total_time / repeat) * 1000.0f;
+  std::cout.precision(4);
+  std::cout << "GoogLeNet GEMMs took " << ms_per_network << "ms" << std::endl;
+}
+
 }  // end namespace gemmlowp
 
 int main() {
@@ -207,4 +348,10 @@ int main() {
     std::cout << "Benchmarking single-threaded mode..." << std::endl;
     gemmlowp::benchmark(&context);
   }
+
+  {
+    gemmlowp::GemmContext context;
+    std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl;
+    gemmlowp::benchmark_googlenet(&context);
+  }
 }
author	Pete Warden <petewarden@google.com>	2015-07-08 12:50:10 -0700
committer	Pete Warden <petewarden@google.com>	2015-07-08 12:50:10 -0700
commit	5de59a56e832f01aafa27d651f9c638dfc2bb00b (patch)
tree	0bf3af0b392748b1983aaa38c8ef918961472fae
parent	041e4a5d64d58e5b7348e69f89b845880aae8577 (diff)
download	gemmlowp-5de59a56e832f01aafa27d651f9c638dfc2bb00b.tar.gz