diff options
author | Pete Warden <petewarden@google.com> | 2015-07-08 12:50:10 -0700 |
---|---|---|
committer | Pete Warden <petewarden@google.com> | 2015-07-08 12:50:10 -0700 |
commit | 5de59a56e832f01aafa27d651f9c638dfc2bb00b (patch) | |
tree | 0bf3af0b392748b1983aaa38c8ef918961472fae | |
parent | 041e4a5d64d58e5b7348e69f89b845880aae8577 (diff) | |
download | gemmlowp-5de59a56e832f01aafa27d651f9c638dfc2bb00b.tar.gz |
Added documentation and additional benchmarking code
-rw-r--r-- | README.txt | 33 | ||||
-rw-r--r-- | test/benchmark.cc | 149 |
2 files changed, 180 insertions, 2 deletions
@@ -16,7 +16,7 @@ Portability, target platforms/architectures =========================================== Should be portable to any platform with some C++11 and POSIX support, -while we have optional optimized code paths for specifica architectures. +while we have optional optimized code paths for specific architectures. Required: C++11 (a small conservative subset of it) @@ -79,6 +79,37 @@ Then run: $ ./scripts/test-android.sh test/test.cc eight_bit_int_gemm/eight_bit_int_gemm.cc +Troubleshooting Compilation +=========================== + +If you're having trouble finding the compiler, follow these instructions to +build a standalone toolchain: +https://developer.android.com/ndk/guides/standalone_toolchain.html + +Here's an example of setting up Clang 3.5: + +$ export INSTALL_DIR=~/toolchains/clang-21-stl-gnu +$ $NDK/build/tools/make-standalone-toolchain.sh \ +--toolchain=arm-linux-androideabi-clang3.5 --platform=android-21 \ +--install-dir=$INSTALL_DIR +$ export CXX="$INSTALL_DIR/bin/arm-linux-androideabi-g++ \ +--sysroot=$INSTALL_DIR/sysroot" + +Some compilers (e.g. the default clang++ in the same bin directory) don't +support NEON assembly. The benchmark build process will issue a warning if +support isn't detected, and you should make sure you're using a compiler like +arm-linux-androideabi-g++ that does include NEON. + + +Benchmarking +============ + +To see what the performance is like on some typical operations, run +$ ../scripts/test-android.sh benchmark.cc + +This will compile and run a small benchmark binary, which runs through GEMMs +with varying input matrix sizes and outputs the performance. The final test +simulates the sort of GEMM sizes you'd expect for a GoogLeNet-style CNN. Profiling ========= diff --git a/test/benchmark.cc b/test/benchmark.cc index 624390e..bb678af 100644 --- a/test/benchmark.cc +++ b/test/benchmark.cc @@ -27,6 +27,10 @@ #include "test/test.h" #include "public/gemmlowp.h" +#if defined(__arm__) && !defined(GEMMLOWP_NEON) +#warning "Building without NEON support on ARM, check your compiler setup!" +#endif + namespace gemmlowp { double time() { @@ -46,7 +50,7 @@ const std::size_t min_working_set_size = 16 * 1024 * 1024; template <typename Kernel, typename LhsType, typename RhsType, typename ResultType> -double gflops_for_gemm_size(GemmContext* context, int rows, int depth, +double time_for_gemm_size(GemmContext* context, int rows, int depth, int cols) { typedef std::uint8_t Scalar; @@ -108,6 +112,16 @@ double gflops_for_gemm_size(GemmContext* context, int rows, int depth, iters_at_a_time *= 2; } + return time_per_iter; +} + +template <typename Kernel, typename LhsType, typename RhsType, + typename ResultType> +double gflops_for_gemm_size(GemmContext* context, int rows, int depth, + int cols) { + const double time_per_iter = + time_for_gemm_size<Kernel, LhsType, RhsType, ResultType>( + context, rows, depth, cols); return 2e-9 * rows * depth * cols / time_per_iter; } @@ -191,6 +205,133 @@ void benchmark(GemmContext* context) { std::cout << std::endl; } +void benchmark_googlenet(GemmContext* context) { + +#ifdef GEMMLOWP_TEST_KERNEL + typedef gemmlowp::GEMMLOWP_TEST_KERNEL KernelForGEMM; + typedef gemmlowp::GEMMLOWP_TEST_KERNEL KernelForGEMV; +#else + typedef gemmlowp::DefaultKernelForGEMM KernelForGEMM; + typedef gemmlowp::DefaultKernelForGEMV KernelForGEMV; +#endif + + // These are the m, n, k sizes for a typical GoogLeNet. + const int googlenet_gemm_sizes[] = { + 12544, 64, 147, + 3136, 64, 64, + 3136, 192, 576, + 784, 64, 192, + 784, 96, 192, + 784, 128, 864, + 784, 16, 192, + 784, 32, 400, + 784, 32, 192, + 784, 128, 256, + 784, 128, 256, + 784, 192, 1152, + 784, 32, 256, + 784, 96, 800, + 784, 64, 256, + 196, 192, 480, + 196, 96, 480, + 196, 204, 864, + 196, 16, 480, + 196, 48, 400, + 196, 64, 480, + 196, 160, 508, + 196, 112, 508, + 196, 224, 1008, + 196, 24, 508, + 196, 64, 600, + 196, 64, 508, + 196, 128, 512, + 196, 128, 512, + 196, 256, 1152, + 196, 24, 512, + 196, 64, 600, + 196, 64, 512, + 196, 112, 512, + 196, 144, 512, + 196, 288, 1296, + 196, 32, 512, + 196, 64, 800, + 196, 64, 512, + 196, 256, 528, + 196, 160, 528, + 196, 320, 1440, + 196, 32, 528, + 196, 128, 800, + 196, 128, 528, + 49, 256, 832, + 49, 160, 832, + 49, 320, 1440, + 49, 48, 832, + 49, 128, 1200, + 49, 128, 832, + 49, 384, 832, + 49, 192, 832, + 49, 384, 1728, + 49, 48, 832, + 49, 128, 1200, + 49, 128, 832, + 16, 128, 508, + 1, 1024, 2048, + 1, 1008, 1024, + 16, 128, 528, + 1, 1024, 2048, + 1, 1008, 1024, + 1, 1008, 1024, + }; + const int param_count = + sizeof(googlenet_gemm_sizes) / sizeof(googlenet_gemm_sizes[0]); + const int gemm_count = param_count / 3; + + const int repeat = 2; + + typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; + typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; + typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType; + +#ifdef GEMMLOWP_TEST_PROFILE + gemmlowp::RegisterCurrentThreadForProfiling(); + gemmlowp::StartProfiling(); +#endif + + float total_time = 0; + + // We don't record the first repetition, it's just warm-up. + for (int r = 0; r < repeat + 1; r++) { + std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r" + << std::flush; + for (int gemm_index = 0; gemm_index < gemm_count; ++gemm_index) { + float gemm_time = 0; + const int rows = googlenet_gemm_sizes[(gemm_index * 3) + 1]; + const int cols = googlenet_gemm_sizes[(gemm_index * 3) + 0]; + const int depth = googlenet_gemm_sizes[(gemm_index * 3) + 2]; + if (cols > KernelForGEMM::Format::kCols / 2) { + gemm_time = + time_for_gemm_size<KernelForGEMM, LhsType, RhsType, ResultType>( + context, rows, depth, cols); + } else { + gemm_time = + time_for_gemm_size<KernelForGEMV, LhsType, RhsType, ResultType>( + context, rows, depth, cols); + } + if (r > 0) { + total_time += gemm_time; + } + } + } + +#ifdef GEMMLOWP_TEST_PROFILE + gemmlowp::FinishProfiling(); +#endif + + const float ms_per_network = (total_time / repeat) * 1000.0f; + std::cout.precision(4); + std::cout << "GoogLeNet GEMMs took " << ms_per_network << "ms" << std::endl; +} + } // end namespace gemmlowp int main() { @@ -207,4 +348,10 @@ int main() { std::cout << "Benchmarking single-threaded mode..." << std::endl; gemmlowp::benchmark(&context); } + + { + gemmlowp::GemmContext context; + std::cout << "Benchmarking typical GoogLeNet GEMMs..." << std::endl; + gemmlowp::benchmark_googlenet(&context); + } } |