Diffstat (limited to 'test/benchmark_all_sizes.cc')
-rw-r--r--  test/benchmark_all_sizes.cc | 357
1 file changed, 357 insertions, 0 deletions
diff --git a/test/benchmark_all_sizes.cc b/test/benchmark_all_sizes.cc
new file mode 100644
index 0000000..16cc57c
--- /dev/null
+++ b/test/benchmark_all_sizes.cc
@@ -0,0 +1,357 @@
+// Example command line to build on Android ARM64:
+/*
+~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \
+test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \
+-DBENCHMARK_QUICK -DBENCHMARK_8bit
+*/
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <ctime>
+#include <iostream>
+#include <map>
+#include <random>
+#include <set>
+
+#include "../public/gemmlowp.h"
+
+#if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32
+// Compilation workaround
+namespace std {
+using ::round;
+}
+#endif
+
+// Minimum duration of each benchmark measurement. Also, duration
+// of sleep time between each two consecutive benchmark measurements to
+// prevent over-heating.
+const double kBenchmarkSecs = 0.1;
+
+// Sleep time before each benchmark.
+const int kCooldownBeforeBenchmarkSecs = 0;
+
+// Number of benchmark passes.
+const int kPasses = 4;
+
+#ifdef BENCHMARK_NUM_THREADS
+const int kNumThreads = BENCHMARK_NUM_THREADS;
+#else
+const int kNumThreads = 1;
+#endif
+
+namespace gemmlowp {
+
+// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class,
+// since it only maps existing data. In tests though, we need to
+// create our own matrices.
+template <typename tScalar, MapOrder tOrder>
+class Matrix : public MatrixMap<tScalar, tOrder> {
+ public:
+  typedef MatrixMap<tScalar, tOrder> Map;
+  typedef MatrixMap<const tScalar, tOrder> ConstMap;
+  typedef typename Map::Scalar Scalar;
+  static const MapOrder Order = tOrder;
+  using Map::cols_;
+  using Map::data_;
+  using Map::kOrder;
+  using Map::rows_;
+  using Map::stride_;
+
+ public:
+  Matrix() : Map(nullptr, 0, 0, 0) {}
+
+  Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }
+
+  Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; }
+
+  Matrix& operator=(const Matrix& other) {
+    Resize(other.rows_, other.cols_);
+    std::memcpy(data_, other.data_, size() * sizeof(Scalar));
+    return *this;
+  }
+
+  friend bool operator==(const Matrix& a, const Matrix& b) {
+    return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
+           !std::memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
+  }
+
+  void Resize(int rows, int cols) {
+    rows_ = rows;
+    cols_ = cols;
+    stride_ = kOrder == MapOrder::ColMajor ? rows : cols;
+    storage.resize(size());
+    data_ = storage.data();
+  }
+
+  int size() const { return rows_ * cols_; }
+
+  Map& map() { return *static_cast<Map*>(this); }
+
+  ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }
+
+ protected:
+  std::vector<Scalar> storage;
+};
+
+template <typename MatrixType>
+void MakeZero(MatrixType* m) {
+  for (int c = 0; c < m->cols(); c++) {
+    for (int r = 0; r < m->rows(); r++) {
+      (*m)(r, c) = 128;
+    }
+  }
+}
+
+}  // end namespace gemmlowp
+
+template <typename BitDepthParams>
+float benchmark_8bit(int rows, int depth, int cols) {
+  using namespace gemmlowp;
+  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
+  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
+  typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
+
+  LhsType lhs;
+  RhsType rhs;
+  ResultType result;
+  lhs.Resize(rows, depth);
+  rhs.Resize(depth, cols);
+  result.Resize(rows, cols);
+  MakeZero(&lhs);
+  MakeZero(&rhs);
+  MakeZero(&result);
+
+  typedef std::tuple<OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
+                     OutputStageSaturatingCastToUint8>
+      Pipeline;
+  gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
+      quantize_down_stage;
+  quantize_down_stage.result_offset_after_shift = 128;
+  quantize_down_stage.result_fixedpoint_multiplier = 1234567890;
+  quantize_down_stage.result_shift = 16;
+  gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
+  const auto output_pipeline =
+      std::make_tuple(quantize_down_stage, saturating_cast_stage);
+  GemmContext gemm_context;
+  gemm_context.set_max_num_threads(kNumThreads);
+  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, BitDepthParams>(
+      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      -128, output_pipeline);
+
+  double time_start = real_time_in_seconds();
+  double t = time_start;
+  int iters = 0;
+  int iters_at_a_time = 1;
+  while (t - time_start < kBenchmarkSecs) {
+    for (int i = 0; i < iters_at_a_time; i++) {
+      gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t,
+                                       BitDepthParams>(
+          &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+          -128, output_pipeline);
+      iters++;
+    }
+    iters_at_a_time *= 2;
+    t = real_time_in_seconds();
+  }
+  return (t - time_start) / iters;
+}
+
+template <typename BitDepthParams>
+float benchmark_8bit_to_32bit(int rows, int depth, int cols) {
+  using namespace gemmlowp;
+  typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
+  typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
+  typedef Matrix<std::int32_t, MapOrder::ColMajor> ResultType;
+
+  LhsType lhs;
+  RhsType rhs;
+  ResultType result;
+  lhs.Resize(rows, depth);
+  rhs.Resize(depth, cols);
+  result.Resize(rows, cols);
+  MakeZero(&lhs);
+  MakeZero(&rhs);
+  MakeZero(&result);
+
+  typedef std::tuple<> EmptyPipeline;
+  GemmContext gemm_context;
+  gemm_context.set_max_num_threads(kNumThreads);
+  gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t, BitDepthParams>(
+      &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+      -128, EmptyPipeline());
+
+  double time_start = real_time_in_seconds();
+  double t = time_start;
+  int iters = 0;
+  int iters_at_a_time = 1;
+  while (t - time_start < kBenchmarkSecs) {
+    for (int i = 0; i < iters_at_a_time; i++) {
+      gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::int32_t,
+                                       BitDepthParams>(
+          &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128,
+          -128, EmptyPipeline());
+      iters++;
+    }
+    iters_at_a_time *= 2;
+    t = real_time_in_seconds();
+  }
+  return (t - time_start) / iters;
+}
+
+struct Shape {
+  int rows;
+  int depth;
+  int cols;
+};
+
+bool operator==(const Shape& s1, const Shape& s2) {
+  return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols;
+}
+
+bool operator<(const Shape& shape1, const Shape& shape2) {
+  return shape1.depth < shape2.depth ||
+         (shape1.depth == shape2.depth &&
+          (shape1.rows < shape2.rows ||
+           (shape1.rows == shape2.rows && shape1.cols < shape2.cols)));
+}
+
+#ifdef _WIN32
+#define sleep(t) Sleep(t)
+#endif
+
+float benchmark(const Shape& shape) {
+  if (kCooldownBeforeBenchmarkSecs) {
+    sleep(kCooldownBeforeBenchmarkSecs);
+  }
+#if defined BENCHMARK_8bit
+  // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams.
+  // This is the recommended thing to default to: it's what most applications
+  // want to use, as it's the fastest.
+  // The contract is that LHS must take values in [1, 255], while RHS can take
+  // any value in [0, 255].
+  return benchmark_8bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_8bit_wide
+  // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams.
+  // The only contract difference is that both LHS and RHS can take values in
+  // [0, 255].
+  return benchmark_8bit<gemmlowp::DefaultL8R8BitDepthParams>(
+      shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_8bit_to_32bit
+  // Variant of BENCHMARK_8bit where the user asks for getting raw int32
+  // accumulators, instead of a 8bit-downscaled result.
+  return benchmark_8bit_to_32bit<gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
+      shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_8bit_to_32bit_wide
+  // Variant of BENCHMARK_8bit_wide where the user asks for getting raw int32
+  // accumulators, instead of a 8bit-downscaled result.
+  return benchmark_8bit_to_32bit<gemmlowp::DefaultL8R8BitDepthParams>(
+      shape.rows, shape.depth, shape.cols);
+#elif defined BENCHMARK_float
+  return benchmark_float(shape.rows, shape.depth, shape.cols);
+#else
+#error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit)
+#endif
+}
+
+std::set<int> all_sizes() {
+  std::set<int> sizes;
+  for (int i = 1; i <= 2048; i *= 2) {
+    sizes.insert(i);
+  }
+  for (double x = 8; x <= 2048; x *= std::sqrt(2.)) {
+    sizes.insert(static_cast<int>(std::round(x)));
+  }
+  for (double x = 16; x <= 512; x *= std::pow(2., 1. / 4.)) {
+    sizes.insert(static_cast<int>(std::round(x)));
+  }
+  return sizes;
+}
+
+std::mt19937& RandomEngine() {
+  static std::mt19937 engine;
+  return engine;
+}
+
+std::vector<Shape> all_shapes_in_random_order() {
+  std::vector<Shape> shapes;
+  const std::set<int> sizes = all_sizes();
+#if defined BENCHMARK_ROWS
+  // Benchmark one specific shape
+  Shape shape;
+  shape.rows = BENCHMARK_ROWS;
+  shape.depth = BENCHMARK_DEPTH;
+  shape.cols = BENCHMARK_COLS;
+  shapes.push_back(shape);
+#elif defined BENCHMARK_QUICK
+  // Benchmark an assortment of cubic shapes
+  for (int size : sizes) {
+    Shape shape;
+    shape.rows = size;
+    shape.depth = size;
+    shape.cols = size;
+    shapes.push_back(shape);
+  }
+#elif defined BENCHMARK_EXHAUSTIVE
+  // Benchmark all sorts of shapes
+  for (int rows : sizes) {
+    for (int depth : sizes) {
+      for (int cols : sizes) {
+        Shape shape;
+        shape.rows = rows;
+        shape.depth = depth;
+        shape.cols = cols;
+        shapes.push_back(shape);
+      }
+    }
+  }
+#else
+#error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK)
+#endif
+  std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine());
+  return shapes;
+}
+
+void run_benchmarks(std::map<Shape, float>* results) {
+  std::vector<Shape> shapes;
+  for (int pass = 0; pass < kPasses; pass++) {
+    const std::vector<Shape> pass_shapes = all_shapes_in_random_order();
+    shapes.insert(std::end(shapes), std::begin(pass_shapes),
+                  std::end(pass_shapes));
+  }
+
+  const double time_start = gemmlowp::real_time_in_seconds();
+  for (std::size_t i = 0; i < shapes.size(); i++) {
+    const double ratio = static_cast<double>(i) / shapes.size();
+    const double elapsed = gemmlowp::real_time_in_seconds() - time_start;
+    const double elapsed_hours = elapsed / 3600.;
+    const double eta_hours = elapsed_hours * (1. - ratio) / ratio;
+    fprintf(stderr,
+            "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f "
+            "hours... \r",
+            100. * ratio, elapsed_hours, eta_hours);
+    fflush(stderr);
+    const Shape& shape = shapes[i];
+    float latency = benchmark(shape);
+    if (results->count(shape)) {
+      (*results)[shape] = std::min(latency, (*results)[shape]);
+    } else {
+      (*results)[shape] = latency;
+    }
+  }
+  fprintf(stderr, "\n");
+}
+
+int main() {
+  std::map<Shape, float> results;
+  run_benchmarks(&results);
+  printf("Using %d thread(s)\n", kNumThreads);
+  printf("depth,rows,cols,latency(s),Gop/s\n");
+  for (const auto& result : results) {
+    const Shape& shape = result.first;
+    printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols,
+           result.second,
+           2e-9 * shape.depth * shape.rows * shape.cols / result.second);
+  }
+}
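
Some notes on the new benchmark follow. The comment at the top of the file gives an Android ARM64 build line; an analogous command for a desktop Linux host (our suggestion, not part of the commit; -pthread is needed because gemmlowp's worker pool uses pthreads) would be roughly:

g++ test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -pthread \
  -DBENCHMARK_QUICK -DBENCHMARK_8bit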
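
The measurement loops in benchmark_8bit and benchmark_8bit_to_32bit double iters_at_a_time on each pass, so the clock is read only a logarithmic number of times while the total run still converges quickly on kBenchmarkSecs. A minimal stand-alone sketch of the same pattern; the helper name and the now_seconds clock function are ours (gemmlowp's real_time_in_seconds plays that role in the diff):

#include <ctime>

// Wall-clock seconds via the POSIX clock, standing in for gemmlowp's
// real_time_in_seconds().
static double now_seconds() {
  timespec t;
  clock_gettime(CLOCK_REALTIME, &t);
  return t.tv_sec + 1e-9 * t.tv_nsec;
}

// Calls f repeatedly for at least min_secs, doubling the batch size between
// clock reads, and returns the mean seconds per call.
template <typename F>
double measure_seconds_per_iteration(F&& f, double min_secs = 0.1) {
  const double start = now_seconds();
  double t = start;
  int iters = 0;
  int batch = 1;
  while (t - start < min_secs) {
    for (int i = 0; i < batch; i++) {
      f();
      iters++;
    }
    batch *= 2;  // amortize the cost of reading the clock
    t = now_seconds();
  }
  return (t - start) / iters;
}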
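
On the quantized path, MakeZero fills matrices with 128 rather than 0 because the Gemm calls pass -128 as the lhs/rhs offsets, so the uint8 value 128 represents real zero. The two output stages then requantize each int32 accumulator back to uint8: scale by the Q31 fixed-point multiplier, rounding-shift right by result_shift, add result_offset_after_shift, and saturate to [0, 255]. A scalar model of that arithmetic with the parameter values used in benchmark_8bit; the rounding is a best-effort approximation of gemmlowp's, for illustration only, not its kernel code:

#include <algorithm>
#include <cstdint>

std::uint8_t quantize_down_model(std::int32_t acc) {
  const std::int32_t multiplier = 1234567890;  // Q31 fixed-point, about 0.575
  const int shift = 16;
  const std::int32_t offset = 128;
  // High 32 bits of the doubled 64-bit product, with rounding
  // (gemmlowp's SaturatingRoundingDoublingHighMul).
  std::int64_t prod = 2 * static_cast<std::int64_t>(acc) * multiplier;
  std::int32_t high = static_cast<std::int32_t>((prod + (1LL << 31)) >> 32);
  // Rounding right shift by result_shift, then the output offset.
  std::int32_t result = ((high + (1 << (shift - 1))) >> shift) + offset;
  // Saturating cast to uint8.
  return static_cast<std::uint8_t>(std::min(255, std::max(0, result)));
}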
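
Shape's operator< orders keys by (depth, rows, cols), which gives the std::map in main the strict weak ordering it needs. An equivalent, more compact formulation would use std::tie from <tuple>:

#include <tuple>

bool operator<(const Shape& a, const Shape& b) {
  return std::tie(a.depth, a.rows, a.cols) <
         std::tie(b.depth, b.rows, b.cols);
}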
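
Finally, the Gop/s column printed by main counts each multiply-accumulate as two operations, hence 2e-9 * depth * rows * cols / latency. For example, a 512x512x512 GEMM measured at 10 ms performs 2 * 512^3, roughly 2.68e8, operations, giving about 2.68e10 ops/s, i.e. 26.8 Gop/s.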