diff options
author | Miao Wang <miaowang@google.com> | 2017-12-12 14:22:24 -0800 |
---|---|---|
committer | Miao Wang <miaowang@google.com> | 2017-12-12 16:14:38 -0800 |
commit | 1963df9ac4a0424674e72ef5da522b5d830605fd (patch) | |
tree | efd8fbbe69f13c4057f2cc5a5b1f7852fd57a2ab /test | |
parent | cbcfdf963151219ca77f54657defabde8d845bac (diff) | |
download | gemmlowp-1963df9ac4a0424674e72ef5da522b5d830605fd.tar.gz |
Rebase gemmlowp to 6a2a908temp_72223856
Bug: 70573221
Test: mm
Test: mm and Pixel2 boot
Test: NeuralNetworksTest pass
Change-Id: I8fac98811e9a276d3ff8054167dc45225c04147e
Diffstat (limited to 'test')
-rw-r--r-- | test/benchmark.cc | 23 | ||||
-rw-r--r-- | test/benchmark_all_sizes.cc | 357 | ||||
-rw-r--r-- | test/test.cc | 1 | ||||
-rw-r--r-- | test/test.h | 14 | ||||
-rw-r--r-- | test/test_allocator.cc | 2 | ||||
-rw-r--r-- | test/test_blocking_counter.cc | 11 | ||||
-rw-r--r-- | test/test_math_helpers.cc | 2 |
7 files changed, 386 insertions, 24 deletions
diff --git a/test/benchmark.cc b/test/benchmark.cc index 20dd369..9a87a41 100644 --- a/test/benchmark.cc +++ b/test/benchmark.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include <unistd.h> #ifdef __APPLE__ #include <sys/time.h> #endif @@ -44,18 +43,6 @@ namespace gemmlowp { -double time() { -#ifdef __APPLE__ - timeval t; - gettimeofday(&t, nullptr); - return t.tv_sec + 1e-6 * t.tv_usec; -#else - timespec t; - clock_gettime(CLOCK_REALTIME, &t); - return t.tv_sec + 1e-9 * t.tv_nsec; -#endif -} - const double min_accurate_duration = 1e-1; const std::size_t min_working_set_size = 16 * 1024 * 1024; @@ -111,10 +98,10 @@ double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) { std::size_t pool_index = 0; while (true) { - double starttime = time(); + double starttime = real_time_in_seconds(); for (int i = 0; i < iters_at_a_time; i++) { for (size_t j = 0; j < gemms.size(); j++) { - int k = pool_index * gemms.size() + j; + size_t k = pool_index * gemms.size() + j; Gemm<std::uint8_t, GEMMLOWP_TEST_BIT_DEPTH_PARAMS>( context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(), -75, -91, 74980, 123, 20); @@ -124,7 +111,7 @@ double time_for_gemms(GemmContext* context, const std::vector<gemm_t>& gemms) { pool_index = 0; } } - double endtime = time(); + double endtime = real_time_in_seconds(); const float timing = static_cast<float>(endtime - starttime); @@ -228,8 +215,8 @@ void benchmark_gemm_sizes(GemmContext* context, gemmlowp::StartProfiling(); #endif - double starttime = time(); - while (time() < starttime + mintime) { + double starttime = real_time_in_seconds(); + while (real_time_in_seconds() < starttime + mintime) { gemm_times.push_back( time_for_gemms<LhsType, RhsType, ResultType>(context, gemms)); } diff --git a/test/benchmark_all_sizes.cc b/test/benchmark_all_sizes.cc new file mode 100644 index 0000000..16cc57c --- /dev/null +++ b/test/benchmark_all_sizes.cc @@ -0,0 +1,357 @@ +// Example command line to build on Android ARM64: +/* +~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \ +test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \ +-DBENCHMARK_QUICK -DBENCHMARK_8bit +*/ + +#include <algorithm> +#include <cmath> +#include <cstdint> +#include <ctime> +#include <iostream> +#include <map> +#include <random> +#include <set> + +#include "../public/gemmlowp.h" + +#if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32 +// Compilation workaround +namespace std { + using ::round; +} +#endif + +// Minimum duration of each benchmark measurement. Also, duration +// of sleep time between each two consecutive benchmark measurements to +// prevent over-heating. +const double kBenchmarkSecs = 0.1; + +// Sleep time before each benchmark. +const int kCooldownBeforeBenchmarkSecs = 0; + +// Number of benchmark passes. +const int kPasses = 4; + +#ifdef BENCHMARK_NUM_THREADS +const int kNumThreads = BENCHMARK_NUM_THREADS; +#else +const int kNumThreads = 1; +#endif + +namespace gemmlowp { + +// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class, +// since it only maps existing data. In tests though, we need to +// create our own matrices. +template <typename tScalar, MapOrder tOrder> +class Matrix : public MatrixMap<tScalar, tOrder> { + public: + typedef MatrixMap<tScalar, tOrder> Map; + typedef MatrixMap<const tScalar, tOrder> ConstMap; + typedef typename Map::Scalar Scalar; + static const MapOrder Order = tOrder; + using Map::cols_; + using Map::data_; + using Map::kOrder; + using Map::rows_; + using Map::stride_; + + public: + Matrix() : Map(nullptr, 0, 0, 0) {} + + Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); } + + Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; } + + Matrix& operator=(const Matrix& other) { + Resize(other.rows_, other.cols_); + std::memcpy(data_, other.data_, size() * sizeof(Scalar)); + return *this; + } + + friend bool operator==(const Matrix& a, const Matrix& b) { + return a.rows_ == b.rows_ && a.cols_ == b.cols_ && + !std::memcmp(a.data_, b.data_, a.size()); + } + + void Resize(int rows, int cols) { + rows_ = rows; + cols_ = cols; + stride_ = kOrder == MapOrder::ColMajor ? rows : cols; + storage.resize(size()); + data_ = storage.data(); + } + + int size() const { return rows_ * cols_; } + + Map& map() { return *static_cast<Map*>(this); } + + ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); } + + protected: + std::vector<Scalar> storage; +}; + +template <typename MatrixType> +void MakeZero(MatrixType* m) { + for (int c = 0; c < m->cols(); c++) { + for (int r = 0; r < m->rows(); r++) { + (*m)(r, c) = 128; + } + } +} + +} // end namespace gemmlowp + +template <typename BitDepthParams> +float benchmark_8bit(int rows, int depth, int cols) { + using namespace gemmlowp; + typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; + typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; + typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType; + + LhsType lhs; + RhsType rhs; + ResultType result; + lhs.Resize(rows, depth); + rhs.Resize(depth, cols); + result.Resize(rows, cols); + MakeZero(&lhs); + MakeZero(&rhs); + MakeZero(&result); + + typedef std::tuple<OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint, + OutputStageSaturatingCastToUint8> + Pipeline; + gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint + quantize_down_stage; + quantize_down_stage.result_offset_after_shift = 128; + quantize_down_stage.result_fixedpoint_multiplier = 1234567890; + quantize_down_stage.result_shift = 16; + gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; + const auto output_pipeline = + std::make_tuple(quantize_down_stage, saturating_cast_stage); + GemmContext gemm_context; + gemm_context.set_max_num_threads(kNumThreads); + gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, output_pipeline); + + double time_start = real_time_in_seconds(); + double t = time_start; + int iters = 0; + int iters_at_a_time = 1; + while (t - time_start < kBenchmarkSecs) { + for (int i = 0; i < iters_at_a_time; i++) { + gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, + BitDepthParams>( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, output_pipeline); + iters++; + } + iters_at_a_time *= 2; + t = real_time_in_seconds(); + } + return (t - time_start) / iters; +} + +template <typename BitDepthParams> +float benchmark_8bit_to_32bit(int rows, int depth, int cols) { + using namespace gemmlowp; + typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; + typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; + typedef Matrix<std::int32_t, MapOrder::ColMajor> ResultType; + + LhsType lhs; + RhsType rhs; + ResultType result; + lhs.Resize(rows, depth); + rhs.Resize(depth, cols); + result.Resize(rows, cols); + MakeZero(&lhs); + MakeZero(&rhs); + MakeZero(&result); + + typedef std::tuple<> EmptyPipeline; + GemmContext gemm_context; + gemm_context.set_max_num_threads(kNumThreads); + gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, EmptyPipeline()); + + double time_start = real_time_in_seconds(); + double t = time_start; + int iters = 0; + int iters_at_a_time = 1; + while (t - time_start < kBenchmarkSecs) { + for (int i = 0; i < iters_at_a_time; i++) { + gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, + BitDepthParams>( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, EmptyPipeline()); + iters++; + } + iters_at_a_time *= 2; + t = real_time_in_seconds(); + } + return (t - time_start) / iters; +} + +struct Shape { + int rows; + int depth; + int cols; +}; + +bool operator==(const Shape& s1, const Shape& s2) { + return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols; +} + +bool operator<(const Shape& shape1, const Shape& shape2) { + return shape1.depth < shape2.depth || + (shape1.depth == shape2.depth && + (shape1.rows < shape2.rows || + (shape1.rows == shape2.rows && shape1.cols < shape2.cols))); +}; + +#ifdef _WIN32 +#define sleep(t) Sleep(t) +#endif + +float benchmark(const Shape& shape) { + if (kCooldownBeforeBenchmarkSecs) { + sleep(kCooldownBeforeBenchmarkSecs); + } +#if defined BENCHMARK_8bit + // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams. + // This is the recommended thing to default to: it's what most applications + // want to use, as it's the fastest. + // The contract is that LHS must take values in [1, 255], while RHS can take + // any value in [0, 255]. + return benchmark_8bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_8bit_wide + // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams. + // The only contract difference is that both LHS and RHS can take values in + // [0, 255]. + return benchmark_8bit<gemmlowp::DefaultL8R8BitDepthParams>( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_8bit_to_32bit + // Variant of BENCHMARK_8bit where the user asks for getting raw int32 + // accumulators, instead of a 8bit-downscaled result. + return benchmark_8bit_to_32bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_8bit_to_32bit_wide + // Variant of BENCHMARK_8bit_wide where the user asks for getting raw int32 + // accumulators, instead of a 8bit-downscaled result. + return benchmark_8bit_to_32bit<gemmlowp::DefaultL8R8BitDepthParams>( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_float + return benchmark_float(shape.rows, shape.depth, shape.cols); +#else +#error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit) +#endif +} + +std::set<int> all_sizes() { + std::set<int> sizes; + for (int i = 1; i <= 2048; i *= 2) { + sizes.insert(i); + } + for (double x = 8; x <= 2048; x *= std::sqrt(2.)) { + sizes.insert(static_cast<int>(std::round(x))); + } + for (double x = 16; x <= 512; x *= std::pow(2., 1. / 4.)) { + sizes.insert(static_cast<int>(std::round(x))); + } + return sizes; +} + +std::mt19937& RandomEngine() { + static std::mt19937 engine; + return engine; +} + +std::vector<Shape> all_shapes_in_random_order() { + std::vector<Shape> shapes; + const std::set<int> sizes = all_sizes(); +#if defined BENCHMARK_ROWS + // Benchmark one specific shape + Shape shape; + shape.rows = BENCHMARK_ROWS; + shape.depth = BENCHMARK_DEPTH; + shape.cols = BENCHMARK_COLS; + shapes.push_back(shape); +#elif defined BENCHMARK_QUICK + // Benchmark an assortment of cubic shapes + for (int size : sizes) { + Shape shape; + shape.rows = size; + shape.depth = size; + shape.cols = size; + shapes.push_back(shape); + } +#elif defined BENCHMARK_EXHAUSTIVE + // Benchmark all sorts of shapes + for (int rows : sizes) { + for (int depth : sizes) { + for (int cols : sizes) { + Shape shape; + shape.rows = rows; + shape.depth = depth; + shape.cols = cols; + shapes.push_back(shape); + } + } + } +#else +#error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK) +#endif + std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine()); + return shapes; +} + +void run_benchmarks(std::map<Shape, float>* results) { + std::vector<Shape> shapes; + for (int pass = 0; pass < kPasses; pass++) { + const std::vector<Shape> pass_shapes = all_shapes_in_random_order(); + shapes.insert(std::end(shapes), std::begin(pass_shapes), + std::end(pass_shapes)); + } + + const double time_start = gemmlowp::real_time_in_seconds(); + for (std::size_t i = 0; i < shapes.size(); i++) { + const double ratio = static_cast<double>(i) / shapes.size(); + const double elapsed = gemmlowp::real_time_in_seconds() - time_start; + const double elapsed_hours = elapsed / 3600.; + const double eta_hours = elapsed_hours * (1. - ratio) / ratio; + fprintf(stderr, + "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f " + "hours... \r", + 100. * ratio, elapsed_hours, eta_hours); + fflush(stderr); + const Shape& shape = shapes[i]; + float latency = benchmark(shape); + if (results->count(shape)) { + (*results)[shape] = std::min(latency, (*results)[shape]); + } else { + (*results)[shape] = latency; + } + } + fprintf(stderr, "\n"); +} + +int main() { + std::map<Shape, float> results; + run_benchmarks(&results); + printf("Using %d thread(s)\n", kNumThreads); + printf("depth,rows,cols,latency(s),Gop/s\n"); + for (const auto& result : results) { + const Shape& shape = result.first; + printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols, + result.second, + 2e-9 * shape.depth * shape.rows * shape.cols / result.second); + } +} diff --git a/test/test.cc b/test/test.cc index fdc7bcc..eee16b4 100644 --- a/test/test.cc +++ b/test/test.cc @@ -14,7 +14,6 @@ #include "test.h" -#include <unistd.h> #include <array> #include <cstdint> #include <cstdlib> diff --git a/test/test.h b/test/test.h index b6a540d..aecd0c1 100644 --- a/test/test.h +++ b/test/test.h @@ -102,6 +102,19 @@ int Random() { return dist(RandomEngine()); } +#ifdef _MSC_VER +// msvc does not support 8bit types in uniform_int_distribution<>. +// Take 32 bit uniform_int_distribution<> and only use the lower 8 bits. +template <typename OperandRange, typename MatrixType> +void MakeRandom(MatrixType* m) { + ScopedProfilingLabel("MakeRandom(matrix)"); + for (int c = 0; c < m->cols(); c++) { + for (int r = 0; r < m->rows(); r++) { + (*m)(r, c) = Random() % OperandRange::kMaxValue; + } + } +} +#else template <typename OperandRange, typename MatrixType> void MakeRandom(MatrixType* m) { ScopedProfilingLabel("MakeRandom(matrix)"); @@ -114,6 +127,7 @@ void MakeRandom(MatrixType* m) { } } } +#endif template <typename MatrixType> void MakeConstant(MatrixType* m, typename MatrixType::Scalar val) { diff --git a/test/test_allocator.cc b/test/test_allocator.cc index 8a76709..3de50f0 100644 --- a/test/test_allocator.cc +++ b/test/test_allocator.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "../internal/allocator.h" #include "test.h" +#include "../internal/allocator.h" namespace gemmlowp { diff --git a/test/test_blocking_counter.cc b/test/test_blocking_counter.cc index 8260576..d1e0932 100644 --- a/test/test_blocking_counter.cc +++ b/test/test_blocking_counter.cc @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ // limitations under the License. #include "test.h" +#include "../profiling/pthread_everywhere.h" -#include <pthread.h> #include <vector> #include "../internal/multi_thread_gemm.h" @@ -26,6 +26,7 @@ class Thread { Thread(BlockingCounter* blocking_counter, int number_of_times_to_decrement) : blocking_counter_(blocking_counter), number_of_times_to_decrement_(number_of_times_to_decrement), + finished_(false), made_the_last_decrement_(false) { pthread_create(&thread_, nullptr, ThreadFunc, this); } @@ -33,7 +34,9 @@ class Thread { ~Thread() { Join(); } bool Join() const { - pthread_join(thread_, nullptr); + if (!finished_) { + pthread_join(thread_, nullptr); + } return made_the_last_decrement_; } @@ -45,6 +48,7 @@ class Thread { Check(!made_the_last_decrement_); made_the_last_decrement_ = blocking_counter_->DecrementCount(); } + finished_ = true; } static void* ThreadFunc(void* ptr) { @@ -55,6 +59,7 @@ class Thread { BlockingCounter* const blocking_counter_; const int number_of_times_to_decrement_; pthread_t thread_; + bool finished_; bool made_the_last_decrement_; }; diff --git a/test/test_math_helpers.cc b/test/test_math_helpers.cc index 591bf44..e9d4b84 100644 --- a/test/test_math_helpers.cc +++ b/test/test_math_helpers.cc @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. |