diff --git a/test/benchmark_all_sizes.cc b/test/benchmark_all_sizes.cc
new file mode 100644
index 0000000..16cc57c
--- /dev/null
+++ b/test/benchmark_all_sizes.cc
@@ -0,0 +1,357 @@
+// Example command line to build on Android ARM64:
+/*
+~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \
+test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \
+-DBENCHMARK_QUICK -DBENCHMARK_8bit
+*/
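+
+// Configuration macros (see the #if blocks below):
+//   Arithmetic path, pick one of: BENCHMARK_8bit, BENCHMARK_8bit_wide,
+//     BENCHMARK_8bit_to_32bit, BENCHMARK_8bit_to_32bit_wide, BENCHMARK_float.
+//   Shape selection, pick one of: BENCHMARK_QUICK, BENCHMARK_EXHAUSTIVE, or
+//     BENCHMARK_ROWS / BENCHMARK_DEPTH / BENCHMARK_COLS for a single shape.
+//   Optional: BENCHMARK_NUM_THREADS (default: 1).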
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <map>
+#include <random>
+#include <set>
+#include <tuple>
+#include <vector>
+
+#include "../public/gemmlowp.h"
+
+#if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32
+// Compilation workaround for toolchains (e.g. older Android NDK gnustl for
+// 32-bit ARM) whose <cmath> does not declare std::round.
+namespace std {
+ using ::round;
+}
+#endif
+
+// Minimum duration of each benchmark measurement.
+const double kBenchmarkSecs = 0.1;
+
+// Sleep time before each benchmark measurement, to prevent over-heating from
+// skewing subsequent measurements (disabled by default).
+const int kCooldownBeforeBenchmarkSecs = 0;
+
+// Number of benchmark passes over the full set of shapes; the lowest latency
+// across passes is reported for each shape.
+const int kPasses = 4;
+
+#ifdef BENCHMARK_NUM_THREADS
+const int kNumThreads = BENCHMARK_NUM_THREADS;
+#else
+const int kNumThreads = 1;
+#endif
+
+namespace gemmlowp {
+
+// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class,
+// since it only maps existing data. In tests though, we need to
+// create our own matrices.
+template <typename tScalar, MapOrder tOrder>
+class Matrix : public MatrixMap<tScalar, tOrder> {
+ public:
+ typedef MatrixMap<tScalar, tOrder> Map;
+ typedef MatrixMap<const tScalar, tOrder> ConstMap;
+ typedef typename Map::Scalar Scalar;
+ static const MapOrder Order = tOrder;
+ using Map::cols_;
+ using Map::data_;
+ using Map::kOrder;
+ using Map::rows_;
+ using Map::stride_;
+
+ Matrix() : Map(nullptr, 0, 0, 0) {}
+
+ Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }
+
+ Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; }
+
+ Matrix& operator=(const Matrix& other) {
+ Resize(other.rows_, other.cols_);
+ std::memcpy(data_, other.data_, size() * sizeof(Scalar));
+ return *this;
+ }
+
+ friend bool operator==(const Matrix& a, const Matrix& b) {
+ return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
+           !std::memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
+ }
+
+ void Resize(int rows, int cols) {
+ rows_ = rows;
+ cols_ = cols;
+ stride_ = kOrder == MapOrder::ColMajor ? rows : cols;
+ storage.resize(size());
+ data_ = storage.data();
+ }
+
+ int size() const { return rows_ * cols_; }
+
+ Map& map() { return *static_cast<Map*>(this); }
+
+ ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }
+
+ protected:
+ std::vector<Scalar> storage;
+};
+
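+// MakeZero fills with 128 rather than 0: 128 is the zero-point of the
+// quantized representation used in these benchmarks (the Gemm calls below
+// pass lhs/rhs offsets of -128, so uint8 value 128 represents real 0).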
+template <typename MatrixType>
+void MakeZero(MatrixType* m) {
+ for (int c = 0; c < m->cols(); c++) {
+ for (int r = 0; r < m->rows(); r++) {
+ (*m)(r, c) = 128;
+ }
+ }
+}
+
+} // end namespace gemmlowp
+
+template <typename BitDepthParams>
+float benchmark_8bit(int rows, int depth, int cols) {
+ using namespace gemmlowp;
+ typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
+ typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
+ typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
+
+ LhsType lhs;
+ RhsType rhs;
+ ResultType result;
+ lhs.Resize(rows, depth);
+ rhs.Resize(depth, cols);
+ result.Resize(rows, cols);
+ MakeZero(&lhs);
+ MakeZero(&rhs);
+ MakeZero(&result);
+
+  // Output pipeline: requantize the int32 accumulators down to uint8 with a
+  // fixed-point multiplier, shift and offset, then saturating-cast to uint8.
+  // The quantization parameters are arbitrary: only throughput is measured.
+  gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
+      quantize_down_stage;
+  quantize_down_stage.result_offset_after_shift = 128;
+  quantize_down_stage.result_fixedpoint_multiplier = 1234567890;
+  quantize_down_stage.result_shift = 16;
+  gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
+ const auto output_pipeline =
+ std::make_tuple(quantize_down_stage, saturating_cast_stage);
+ GemmContext gemm_context;
+ gemm_context.set_max_num_threads(kNumThreads);
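+  // Untimed warm-up run: lets gemmlowp set up its worker threads and pulls
+  // code and data into caches before the measurement loop.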
+ gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
+ &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+ -128, output_pipeline);
+
+ double time_start = real_time_in_seconds();
+ double t = time_start;
+ int iters = 0;
+ int iters_at_a_time = 1;
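+  // Double the number of iterations between timer reads, so that timer
+  // overhead stays negligible even for very fast (small) shapes.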
+ while (t - time_start < kBenchmarkSecs) {
+ for (int i = 0; i < iters_at_a_time; i++) {
+ gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
+ BitDepthParams>(
+ &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+ -128, output_pipeline);
+ iters++;
+ }
+ iters_at_a_time *= 2;
+ t = real_time_in_seconds();
+ }
+ return (t - time_start) / iters;
+}
+
+template <typename BitDepthParams>
+float benchmark_8bit_to_32bit(int rows, int depth, int cols) {
+ using namespace gemmlowp;
+ typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
+ typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
+ typedef Matrix<std::int32_t, MapOrder::ColMajor> ResultType;
+
+ LhsType lhs;
+ RhsType rhs;
+ ResultType result;
+ lhs.Resize(rows, depth);
+ rhs.Resize(depth, cols);
+ result.Resize(rows, cols);
+ MakeZero(&lhs);
+ MakeZero(&rhs);
+ MakeZero(&result);
+
+ typedef std::tuple<> EmptyPipeline;
+ GemmContext gemm_context;
+ gemm_context.set_max_num_threads(kNumThreads);
+ gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
+ &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+ -128, EmptyPipeline());
+
+ double time_start = real_time_in_seconds();
+ double t = time_start;
+ int iters = 0;
+ int iters_at_a_time = 1;
+ while (t - time_start < kBenchmarkSecs) {
+ for (int i = 0; i < iters_at_a_time; i++) {
+ gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
+ BitDepthParams>(
+ &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+ -128, EmptyPipeline());
+ iters++;
+ }
+ iters_at_a_time *= 2;
+ t = real_time_in_seconds();
+ }
+ return (t - time_start) / iters;
+}
+
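+// gemmlowp only implements integer GEMM, so there is no library float path
+// to call here, yet benchmark_float is referenced by the BENCHMARK_float
+// configuration below. The definition here is a naive single-threaded
+// reference float GEMM (RowMajor LHS times ColMajor RHS) wrapped in the same
+// timing harness as the benchmarks above. It only makes BENCHMARK_float
+// build and provides a baseline; it is not an optimized float GEMM, and it
+// ignores kNumThreads.
+float benchmark_float(int rows, int depth, int cols) {
+  std::vector<float> lhs(rows * depth, 1.0f);
+  std::vector<float> rhs(depth * cols, 1.0f);
+  std::vector<float> result(rows * cols, 0.0f);
+
+  double time_start = gemmlowp::real_time_in_seconds();
+  double t = time_start;
+  int iters = 0;
+  int iters_at_a_time = 1;
+  while (t - time_start < kBenchmarkSecs) {
+    for (int i = 0; i < iters_at_a_time; i++) {
+      for (int c = 0; c < cols; c++) {
+        for (int r = 0; r < rows; r++) {
+          float accum = 0.0f;
+          for (int d = 0; d < depth; d++) {
+            accum += lhs[r * depth + d] * rhs[d + c * depth];
+          }
+          result[r + c * rows] = accum;
+        }
+      }
+      // Consume the result so the compiler cannot optimize the GEMM away.
+      float checksum = 0.0f;
+      for (std::size_t k = 0; k < result.size(); k++) {
+        checksum += result[k];
+      }
+      static volatile float sink;
+      sink = checksum;
+      iters++;
+    }
+    iters_at_a_time *= 2;
+    t = gemmlowp::real_time_in_seconds();
+  }
+  return (t - time_start) / iters;
+}
+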
+struct Shape {
+ int rows;
+ int depth;
+ int cols;
+};
+
+bool operator==(const Shape& s1, const Shape& s2) {
+ return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols;
+}
+
+// Lexicographic order on (depth, rows, cols), so that the final results are
+// printed sorted by depth first.
+bool operator<(const Shape& shape1, const Shape& shape2) {
+  return std::tie(shape1.depth, shape1.rows, shape1.cols) <
+         std::tie(shape2.depth, shape2.rows, shape2.cols);
+}
+
+#ifdef _WIN32
+#include <windows.h>
+// Win32 Sleep takes milliseconds; POSIX sleep takes seconds.
+#define sleep(t) Sleep(1000 * (t))
+#else
+#include <unistd.h>
+#endif
+
+float benchmark(const Shape& shape) {
+ if (kCooldownBeforeBenchmarkSecs) {
+ sleep(kCooldownBeforeBenchmarkSecs);
+ }
+#if defined BENCHMARK_8bit
+  // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams.
+  // This is the recommended default: it is what most applications want, as
+  // it is the fastest. The contract is that LHS values must be in [1, 255],
+  // while RHS values may take any value in [0, 255].
+ return benchmark_8bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+ shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_8bit_wide
+ // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams.
+ // The only contract difference is that both LHS and RHS can take values in
+ // [0, 255].
+ return benchmark_8bit<gemmlowp::DefaultL8R8BitDepthParams>(
+ shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_8bit_to_32bit
+  // Variant of BENCHMARK_8bit where the user asks for raw int32
+  // accumulators instead of an 8bit-downscaled result.
+ return benchmark_8bit_to_32bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+ shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_8bit_to_32bit_wide
+  // Variant of BENCHMARK_8bit_wide where the user asks for raw int32
+  // accumulators instead of an 8bit-downscaled result.
+ return benchmark_8bit_to_32bit<gemmlowp::DefaultL8R8BitDepthParams>(
+ shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_float
+  // Plain float GEMM, using the reference benchmark_float sketched above.
+  return benchmark_float(shape.rows, shape.depth, shape.cols);
+#else
+#error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit)
+#endif
+}
+
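+// Set of matrix sizes to benchmark: powers of two from 1 to 2048, plus
+// intermediate sizes on a sqrt(2) ladder from 8 to 2048 and on a 2^(1/4)
+// ladder from 16 to 512, rounded to the nearest integer.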
+std::set<int> all_sizes() {
+ std::set<int> sizes;
+ for (int i = 1; i <= 2048; i *= 2) {
+ sizes.insert(i);
+ }
+ for (double x = 8; x <= 2048; x *= std::sqrt(2.)) {
+ sizes.insert(static_cast<int>(std::round(x)));
+ }
+ for (double x = 16; x <= 512; x *= std::pow(2., 1. / 4.)) {
+ sizes.insert(static_cast<int>(std::round(x)));
+ }
+ return sizes;
+}
+
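+// Shared std::mt19937 with the default seed, so the shuffled benchmarking
+// order is deterministic from run to run.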
+std::mt19937& RandomEngine() {
+ static std::mt19937 engine;
+ return engine;
+}
+
+std::vector<Shape> all_shapes_in_random_order() {
+ std::vector<Shape> shapes;
+ const std::set<int> sizes = all_sizes();
+#if defined BENCHMARK_ROWS
+ // Benchmark one specific shape
+ Shape shape;
+ shape.rows = BENCHMARK_ROWS;
+ shape.depth = BENCHMARK_DEPTH;
+ shape.cols = BENCHMARK_COLS;
+ shapes.push_back(shape);
+#elif defined BENCHMARK_QUICK
+ // Benchmark an assortment of cubic shapes
+ for (int size : sizes) {
+ Shape shape;
+ shape.rows = size;
+ shape.depth = size;
+ shape.cols = size;
+ shapes.push_back(shape);
+ }
+#elif defined BENCHMARK_EXHAUSTIVE
+  // Benchmark all (rows, depth, cols) combinations of the benchmark sizes
+ for (int rows : sizes) {
+ for (int depth : sizes) {
+ for (int cols : sizes) {
+ Shape shape;
+ shape.rows = rows;
+ shape.depth = depth;
+ shape.cols = cols;
+ shapes.push_back(shape);
+ }
+ }
+ }
+#else
+#error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK)
+#endif
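+  // Randomize the benchmarking order, so that slow drift of the machine's
+  // state (e.g. thermal throttling over a long run) is not correlated with
+  // particular shapes.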
+ std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine());
+ return shapes;
+}
+
+void run_benchmarks(std::map<Shape, float>* results) {
+ std::vector<Shape> shapes;
+ for (int pass = 0; pass < kPasses; pass++) {
+ const std::vector<Shape> pass_shapes = all_shapes_in_random_order();
+ shapes.insert(std::end(shapes), std::begin(pass_shapes),
+ std::end(pass_shapes));
+ }
+
+ const double time_start = gemmlowp::real_time_in_seconds();
+ for (std::size_t i = 0; i < shapes.size(); i++) {
+ const double ratio = static_cast<double>(i) / shapes.size();
+ const double elapsed = gemmlowp::real_time_in_seconds() - time_start;
+ const double elapsed_hours = elapsed / 3600.;
+    const double eta_hours =
+        ratio > 0 ? elapsed_hours * (1. - ratio) / ratio : 0;
+ fprintf(stderr,
+ "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f "
+ "hours... \r",
+ 100. * ratio, elapsed_hours, eta_hours);
+ fflush(stderr);
+ const Shape& shape = shapes[i];
+ float latency = benchmark(shape);
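+    // Keep the lowest latency observed for this shape across all passes.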
+ if (results->count(shape)) {
+ (*results)[shape] = std::min(latency, (*results)[shape]);
+ } else {
+ (*results)[shape] = latency;
+ }
+ }
+ fprintf(stderr, "\n");
+}
+
+int main() {
+ std::map<Shape, float> results;
+ run_benchmarks(&results);
+ printf("Using %d thread(s)\n", kNumThreads);
+ printf("depth,rows,cols,latency(s),Gop/s\n");
+ for (const auto& result : results) {
+ const Shape& shape = result.first;
+ printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols,
+ result.second,
+ 2e-9 * shape.depth * shape.rows * shape.cols / result.second);
+ }
+}