Rebase gemmlowp to 6a2a908temp_72223856

Bug: 70573221
Test: mm
Test: mm and Pixel2 boot
Test: NeuralNetworksTest pass
Change-Id: I8fac98811e9a276d3ff8054167dc45225c04147e
author: Miao Wang <miaowang@google.com> 2017-12-12 14:22:24 -0800
committer: Miao Wang <miaowang@google.com> 2017-12-12 16:14:38 -0800
commit: 1963df9ac4a0424674e72ef5da522b5d830605fd (patch)
tree: efd8fbbe69f13c4057f2cc5a5b1f7852fd57a2ab /test
parent: cbcfdf963151219ca77f54657defabde8d845bac (diff)
download: gemmlowp-1963df9ac4a0424674e72ef5da522b5d830605fd.tar.gz
7 files changed, 386 insertions, 24 deletions
diff --git a/test/benchmark.cc b/test/benchmark.cc
index 20dd369..9a87a41 100644
--- a/test/benchmark.cc
+++ b/test/benchmark.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <unistd.h>
 #ifdef __APPLE__
 #include <sys/time.h>
 #endif
@@ -44,18 +43,6 @@
 
 namespace gemmlowp {
 
-double time() {
-#ifdef __APPLE__
-  timeval t;
-  gettimeofday(&t, nullptr);
-  return t.tv_sec + 1e-6 * t.tv_usec;
-#else
-  timespec t;
-  clock_gettime(CLOCK_REALTIME, &t);
-  return t.tv_sec + 1e-9 * t.tv_nsec;
-#endif
-}
-
 const double min_accurate_duration = 1e-1;
 const std::size_t min_working_set_size = 16 * 1024 * 1024;
 
@@ -111,10 +98,10 @@ double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
   std::size_t pool_index = 0;
 
   while (true) {
-    double starttime = time();
+    double starttime = real_time_in_seconds();
     for (int i = 0; i < iters_at_a_time; i++) {
       for (size_t j = 0; j < gemms.size(); j++) {
-        int k = pool_index * gemms.size() + j;
+        size_t k = pool_index * gemms.size() + j;
         Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>(
             context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(),
             -75, -91, 74980, 123, 20);
@@ -124,7 +111,7 @@ double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) {
         pool_index = 0;
       }
     }
-    double endtime = time();
+    double endtime = real_time_in_seconds();
 
     const float timing = static_cast<float>(endtime - starttime);
 
@@ -228,8 +215,8 @@ void benchmark_gemm_sizes(GemmContext* context,
   gemmlowp::StartProfiling();
 #endif
 
-  double starttime = time();
-  while (time() < starttime + mintime) {
+  double starttime = real_time_in_seconds();
+  while (real_time_in_seconds() < starttime + mintime) {
     gemm_times.push_back(
         time_for_gemms<LhsType, RhsType, ResultType>(context, gemms));
   }
diff --git a/test/benchmark_all_sizes.cc b/test/benchmark_all_sizes.cc
new file mode 100644
index 0000000..16cc57c
--- /dev/null
+++ b/test/benchmark_all_sizes.cc
@@ -0,0 +1,357 @@
+// Example command line to build on Android ARM64:
+/*
+~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \
+test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \
+-DBENCHMARK_QUICK -DBENCHMARK_8bit
+*/
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <ctime>
+#include <iostream>
+#include <map>
+#include <random>
+#include <set>
+
+#include "../public/gemmlowp.h"
+
+#if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32
+// Compilation workaround
+namespace std {
+  using ::round;
+}
+#endif
+
+// Minimum duration of each benchmark measurement. The sleep time between
+// consecutive measurements (to prevent over-heating) is configured
+// separately, by kCooldownBeforeBenchmarkSecs below.
+const double kBenchmarkSecs = 0.1;
+
+// Sleep time before each benchmark.
+const int kCooldownBeforeBenchmarkSecs = 0;
+
+// Number of benchmark passes.
+const int kPasses = 4;
+
+#ifdef BENCHMARK_NUM_THREADS
+const int kNumThreads = BENCHMARK_NUM_THREADS;
+#else
+const int kNumThreads = 1;
+#endif
+
+namespace gemmlowp {
+
+// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class,
+// since it only maps existing data. In tests though, we need to
+// create our own matrices.
+template <typename tScalar, MapOrder tOrder>
+class Matrix : public MatrixMap<tScalar, tOrder> {
+ public:
+  typedef MatrixMap<tScalar, tOrder> Map;
+  typedef MatrixMap<const tScalar, tOrder> ConstMap;
+  typedef typename Map::Scalar Scalar;
+  static const MapOrder Order = tOrder;
+  using Map::cols_;
+  using Map::data_;
+  using Map::kOrder;
+  using Map::rows_;
+  using Map::stride_;
+
+  Matrix() : Map(nullptr, 0, 0, 0) {}
+
+  Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }
+
+  Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; }
+
+  Matrix& operator=(const Matrix& other) {
+    if (this == &other) return *this;  // memcpy must not copy onto itself.
+    Resize(other.rows_, other.cols_);
+    std::memcpy(data_, other.data_, size() * sizeof(Scalar));
+    return *this;
+  }
+
+  // Equal when the shapes match and all size() elements match bytewise.
+  friend bool operator==(const Matrix& a, const Matrix& b) {
+    return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
+           !std::memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
+  }
+
+  // Sets the shape (stride: rows if ColMajor, else cols) and sizes storage.
+  void Resize(int rows, int cols) {
+    rows_ = rows;
+    cols_ = cols;
+    stride_ = kOrder == MapOrder::ColMajor ? rows : cols;
+    storage.resize(size());
+    data_ = storage.data();
+  }
+
+  int size() const { return rows_ * cols_; }  // Element count, not bytes.
+  Map& map() { return *static_cast<Map*>(this); }
+  ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }
+
+ protected:
+  std::vector<Scalar> storage;
+};
+
+template <typename MatrixType>
+void MakeZero(MatrixType* m) {
+  for (int c = 0; c < m->cols(); c++) {
+    for (int r = 0; r < m->rows(); r++) {
+      (*m)(r, c) = 128;
+    }
+  }
+}
+
+}  // end namespace gemmlowp
+
+template <typename BitDepthParams>
+float benchmark_8bit(int rows, int depth, int cols) {
+  using namespace gemmlowp;
+  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
+  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
+  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
+
+  LhsType lhs;
+  RhsType rhs;
+  ResultType result;
+  lhs.Resize(rows, depth);
+  rhs.Resize(depth, cols);
+  result.Resize(rows, cols);
+  MakeZero(&lhs);
+  MakeZero(&rhs);
+  MakeZero(&result);
+
+  typedef std::tuple<OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
+                     OutputStageSaturatingCastToUint8>
+      Pipeline;
+  gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
+      quantize_down_stage;
+  quantize_down_stage.result_offset_after_shift = 128;
+  quantize_down_stage.result_fixedpoint_multiplier = 1234567890;
+  quantize_down_stage.result_shift = 16;
+  gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
+  const auto output_pipeline =
+      std::make_tuple(quantize_down_stage, saturating_cast_stage);
+  GemmContext gemm_context;
+  gemm_context.set_max_num_threads(kNumThreads);
+  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
+      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      -128, output_pipeline);
+
+  double time_start = real_time_in_seconds();
+  double t = time_start;
+  int iters = 0;
+  int iters_at_a_time = 1;
+  while (t - time_start < kBenchmarkSecs) {
+    for (int i = 0; i < iters_at_a_time; i++) {
+      gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
+                                       BitDepthParams>(
+          &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+          -128, output_pipeline);
+      iters++;
+    }
+    iters_at_a_time *= 2;
+    t = real_time_in_seconds();
+  }
+  return (t - time_start) / iters;
+}
+
+template <typename BitDepthParams>
+float benchmark_8bit_to_32bit(int rows, int depth, int cols) {
+  using namespace gemmlowp;
+  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
+  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
+  typedef Matrix<std::int32_t, MapOrder::ColMajor> ResultType;
+
+  LhsType lhs;
+  RhsType rhs;
+  ResultType result;
+  lhs.Resize(rows, depth);
+  rhs.Resize(depth, cols);
+  result.Resize(rows, cols);
+  MakeZero(&lhs);
+  MakeZero(&rhs);
+  MakeZero(&result);
+
+  typedef std::tuple<> EmptyPipeline;
+  GemmContext gemm_context;
+  gemm_context.set_max_num_threads(kNumThreads);
+  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
+      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      -128, EmptyPipeline());
+
+  double time_start = real_time_in_seconds();
+  double t = time_start;
+  int iters = 0;
+  int iters_at_a_time = 1;
+  while (t - time_start < kBenchmarkSecs) {
+    for (int i = 0; i < iters_at_a_time; i++) {
+      gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
+                                       BitDepthParams>(
+          &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+          -128, EmptyPipeline());
+      iters++;
+    }
+    iters_at_a_time *= 2;
+    t = real_time_in_seconds();
+  }
+  return (t - time_start) / iters;
+}
+
+struct Shape {
+  int rows;
+  int depth;
+  int cols;
+};
+
+bool operator==(const Shape& s1, const Shape& s2) {
+  return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols;
+}
+
+// Strict weak ordering for std::map keys: by depth, then rows, then cols.
+bool operator<(const Shape& shape1, const Shape& shape2) {
+  if (shape1.depth != shape2.depth) return shape1.depth < shape2.depth;
+  if (shape1.rows != shape2.rows) return shape1.rows < shape2.rows;
+  return shape1.cols < shape2.cols;
+}
+
+#ifdef _WIN32
+#define sleep(t) Sleep((t) * 1000)  // Win32 Sleep() takes milliseconds.
+#endif
+
+float benchmark(const Shape& shape) {
+  if (kCooldownBeforeBenchmarkSecs) {
+    sleep(kCooldownBeforeBenchmarkSecs);
+  }
+#if defined BENCHMARK_8bit
+  // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams.
+  // This is the recommended thing to default to: it's what most applications
+  // want to use, as it's the fastest.
+  // The contract is that LHS must take values in [1, 255], while RHS can take
+  // any value in [0, 255].
+  return benchmark_8bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_8bit_wide
+  // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams.
+  // The only contract difference is that both LHS and RHS can take values in
+  // [0, 255].
+  return benchmark_8bit<gemmlowp::DefaultL8R8BitDepthParams>(
+      shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_8bit_to_32bit
+  // Variant of BENCHMARK_8bit where the user asks for getting raw int32
+  // accumulators, instead of a 8bit-downscaled result.
+  return benchmark_8bit_to_32bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_8bit_to_32bit_wide
+  // Variant of BENCHMARK_8bit_wide where the user asks for getting raw int32
+  // accumulators, instead of a 8bit-downscaled result.
+  return benchmark_8bit_to_32bit<gemmlowp::DefaultL8R8BitDepthParams>(
+      shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_float
+  return benchmark_float(shape.rows, shape.depth, shape.cols);
+#else
+#error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit)
+#endif
+}
+
+std::set<int> all_sizes() {
+  std::set<int> sizes;
+  for (int i = 1; i <= 2048; i *= 2) {
+    sizes.insert(i);
+  }
+  for (double x = 8; x <= 2048; x *= std::sqrt(2.)) {
+    sizes.insert(static_cast<int>(std::round(x)));
+  }
+  for (double x = 16; x <= 512; x *= std::pow(2., 1. / 4.)) {
+    sizes.insert(static_cast<int>(std::round(x)));
+  }
+  return sizes;
+}
+
+std::mt19937& RandomEngine() {
+  static std::mt19937 engine;
+  return engine;
+}
+
+std::vector<Shape> all_shapes_in_random_order() {
+  std::vector<Shape> shapes;
+  const std::set<int> sizes = all_sizes();
+#if defined BENCHMARK_ROWS
+  // Benchmark one specific shape
+  Shape shape;
+  shape.rows = BENCHMARK_ROWS;
+  shape.depth = BENCHMARK_DEPTH;
+  shape.cols = BENCHMARK_COLS;
+  shapes.push_back(shape);
+#elif defined BENCHMARK_QUICK
+  // Benchmark an assortment of cubic shapes
+  for (int size : sizes) {
+    Shape shape;
+    shape.rows = size;
+    shape.depth = size;
+    shape.cols = size;
+    shapes.push_back(shape);
+  }
+#elif defined BENCHMARK_EXHAUSTIVE
+  // Benchmark all sorts of shapes
+  for (int rows : sizes) {
+    for (int depth : sizes) {
+      for (int cols : sizes) {
+        Shape shape;
+        shape.rows = rows;
+        shape.depth = depth;
+        shape.cols = cols;
+        shapes.push_back(shape);
+      }
+    }
+  }
+#else
+#error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK)
+#endif
+  std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine());
+  return shapes;
+}
+
+// Runs kPasses shuffled passes; *results keeps each shape's best latency.
+void run_benchmarks(std::map<Shape, float>* results) {
+  std::vector<Shape> shapes;
+  for (int pass = 0; pass < kPasses; pass++) {
+    const std::vector<Shape> pass_shapes = all_shapes_in_random_order();
+    shapes.insert(std::end(shapes), std::begin(pass_shapes),
+                  std::end(pass_shapes));
+  }
+  const double time_start = gemmlowp::real_time_in_seconds();
+  for (std::size_t i = 0; i < shapes.size(); i++) {
+    const double ratio = static_cast<double>(i) / shapes.size();
+    const double elapsed = gemmlowp::real_time_in_seconds() - time_start;
+    const double elapsed_hours = elapsed / 3600.;
+    // Nothing is done at i == 0, so no ETA can be extrapolated yet; avoid x/0.
+    const double eta_hours = i ? elapsed_hours * (1. - ratio) / ratio : 0.;
+    fprintf(stderr,
+            "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f "
+            "hours...   \r",
+            100. * ratio, elapsed_hours, eta_hours);
+    fflush(stderr);
+    const float latency = benchmark(shapes[i]);
+    const auto entry = results->emplace(shapes[i], latency);
+    if (!entry.second) {  // Shape already measured: keep the minimum.
+      entry.first->second = std::min(latency, entry.first->second);
+    }
+  }
+  fprintf(stderr, "\n");
+}
+
+int main() {
+  std::map<Shape, float> results;
+  run_benchmarks(&results);
+  printf("Using %d thread(s)\n", kNumThreads);
+  printf("depth,rows,cols,latency(s),Gop/s\n");
+  for (const auto& result : results) {
+    const Shape& shape = result.first;
+    printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols,
+           result.second,
+           2e-9 * shape.depth * shape.rows * shape.cols / result.second);
+  }
+}
diff --git a/test/test.cc b/test/test.cc
index fdc7bcc..eee16b4 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -14,7 +14,6 @@
 
 #include "test.h"
 
-#include <unistd.h>
 #include <array>
 #include <cstdint>
 #include <cstdlib>
diff --git a/test/test.h b/test/test.h
index b6a540d..aecd0c1 100644
--- a/test/test.h
+++ b/test/test.h
@@ -102,6 +102,19 @@ int Random() {
   return dist(RandomEngine());
 }
 
+#ifdef _MSC_VER
+// msvc does not support 8bit types in uniform_int_distribution<>.
+// Use the int-valued Random() instead and reduce it modulo kMaxValue.
+template <typename OperandRange, typename MatrixType>
+void MakeRandom(MatrixType* m) {
+  ScopedProfilingLabel("MakeRandom(matrix)");
+  for (int c = 0; c < m->cols(); c++) {
+    for (int r = 0; r < m->rows(); r++) {
+      (*m)(r, c) = Random() % OperandRange::kMaxValue;
+    }
+  }
+}
+#else
 template <typename OperandRange, typename MatrixType>
 void MakeRandom(MatrixType* m) {
   ScopedProfilingLabel("MakeRandom(matrix)");
@@ -114,6 +127,7 @@ void MakeRandom(MatrixType* m) {
     }
   }
 }
+#endif
 
 template <typename MatrixType>
 void MakeConstant(MatrixType* m, typename MatrixType::Scalar val) {
diff --git a/test/test_allocator.cc b/test/test_allocator.cc
index 8a76709..3de50f0 100644
--- a/test/test_allocator.cc
+++ b/test/test_allocator.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "../internal/allocator.h"
 #include "test.h"
+#include "../internal/allocator.h"
 
 namespace gemmlowp {
 
diff --git a/test/test_blocking_counter.cc b/test/test_blocking_counter.cc
index 8260576..d1e0932 100644
--- a/test/test_blocking_counter.cc
+++ b/test/test_blocking_counter.cc
@@ -1,4 +1,4 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
+// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "test.h"
+#include "../profiling/pthread_everywhere.h"
 
-#include <pthread.h>
 #include <vector>
 
 #include "../internal/multi_thread_gemm.h"
@@ -26,6 +26,7 @@ class Thread {
   Thread(BlockingCounter* blocking_counter, int number_of_times_to_decrement)
       : blocking_counter_(blocking_counter),
         number_of_times_to_decrement_(number_of_times_to_decrement),
+        finished_(false),
         made_the_last_decrement_(false) {
     pthread_create(&thread_, nullptr, ThreadFunc, this);
   }
@@ -33,7 +34,9 @@ class Thread {
   ~Thread() { Join(); }
 
   bool Join() const {
-    pthread_join(thread_, nullptr);
+    if (!finished_) {
+      pthread_join(thread_, nullptr);
+    }
     return made_the_last_decrement_;
   }
 
@@ -45,6 +48,7 @@ class Thread {
       Check(!made_the_last_decrement_);
       made_the_last_decrement_ = blocking_counter_->DecrementCount();
     }
+    finished_ = true;
   }
 
   static void* ThreadFunc(void* ptr) {
@@ -55,6 +59,7 @@ class Thread {
   BlockingCounter* const blocking_counter_;
   const int number_of_times_to_decrement_;
   pthread_t thread_;
+  bool finished_;
   bool made_the_last_decrement_;
 };
 
diff --git a/test/test_math_helpers.cc b/test/test_math_helpers.cc
index 591bf44..e9d4b84 100644
--- a/test/test_math_helpers.cc
+++ b/test/test_math_helpers.cc
@@ -1,4 +1,4 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
+// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
author	Miao Wang <miaowang@google.com>	2017-12-12 14:22:24 -0800
committer	Miao Wang <miaowang@google.com>	2017-12-12 16:14:38 -0800
commit	1963df9ac4a0424674e72ef5da522b5d830605fd (patch)
tree	efd8fbbe69f13c4057f2cc5a5b1f7852fd57a2ab /test
parent	cbcfdf963151219ca77f54657defabde8d845bac (diff)
download	gemmlowp-1963df9ac4a0424674e72ef5da522b5d830605fd.tar.gz