From 3dd9fc332efa9e209c142431d9f8225ccf03fe57 Mon Sep 17 00:00:00 2001 From: Miao Wang Date: Wed, 6 Dec 2017 17:00:59 -0800 Subject: Add a header lib to include all gemmlowp headers Test: mm Change-Id: I5f493dc35b269a75c7a8473e5002a90df98e92a3 --- Android.bp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Android.bp b/Android.bp index f6d9324..3f1dab4 100644 --- a/Android.bp +++ b/Android.bp @@ -26,4 +26,11 @@ cc_library_headers { export_include_dirs: ["public"], } +cc_library_headers { + name: "gemmlowp_headers", + vendor_available: true, + host_supported: true, + export_include_dirs: ["."], +} + subdirs = ["eight_bit_int_gemm"] -- cgit v1.2.3 From 1963df9ac4a0424674e72ef5da522b5d830605fd Mon Sep 17 00:00:00 2001 From: Miao Wang Date: Tue, 12 Dec 2017 14:22:24 -0800 Subject: Rebase gemmlowp to 6a2a908 Bug: 70573221 Test: mm Test: mm and Pixel2 boot Test: NeuralNetworksTest pass Change-Id: I8fac98811e9a276d3ff8054167dc45225c04147e --- doc/design.txt | 158 -------------- doc/kernels.txt | 176 --------------- doc/less-than-8-bit.txt | 305 -------------------------- doc/low-precision.md | 2 +- doc/low-precision.txt | 159 -------------- doc/output.md | 4 +- doc/packing.txt | 206 ------------------ eight_bit_int_gemm/eight_bit_int_gemm.cc | 8 +- internal/allocator.h | 27 +-- internal/block_params.h | 25 ++- internal/common.h | 28 ++- internal/compute.h | 14 ++ internal/kernel.h | 2 +- internal/kernel_default.h | 4 + internal/multi_thread_gemm.h | 105 +++++---- internal/platform.h | 112 ++++++++++ meta/multi_thread_common.h | 9 - meta/multi_thread_gemm.h | 6 +- meta/multi_thread_gemv.h | 168 --------------- meta/multi_thread_transform.h | 6 +- meta/operations_common.h | 61 ------ profiling/instrumentation.h | 115 +++++----- profiling/profiler.h | 12 +- profiling/pthread_everywhere.h | 88 ++++++++ test/benchmark.cc | 23 +- test/benchmark_all_sizes.cc | 357 +++++++++++++++++++++++++++++++ test/test.cc | 1 - test/test.h | 14 ++ test/test_allocator.cc | 2 +- test/test_blocking_counter.cc | 11 +- test/test_math_helpers.cc | 2 +- 31 files changed, 772 insertions(+), 1438 deletions(-) delete mode 100644 doc/design.txt delete mode 100644 doc/kernels.txt delete mode 100644 doc/less-than-8-bit.txt delete mode 100644 doc/low-precision.txt delete mode 100644 doc/packing.txt create mode 100755 internal/platform.h delete mode 100644 meta/multi_thread_gemv.h delete mode 100644 meta/operations_common.h create mode 100644 profiling/pthread_everywhere.h create mode 100644 test/benchmark_all_sizes.cc diff --git a/doc/design.txt b/doc/design.txt deleted file mode 100644 index cb78dbb..0000000 --- a/doc/design.txt +++ /dev/null @@ -1,158 +0,0 @@ - Overview of gemmlowp design - *************************** - - -Primer on GEMM, kernels, and cache friendliness -=============================================== - -gemmlowp, like most GEMMs, implements the straightforward matrix multiplication -algorithm, which takes n^3 multiply-accumulate instructions for n*n sized -matrices. Because the arithmetic complexity grows quicker than the memory -complexity (n^3 vs. n^2), memory accesses are redundant (each matrix entry -is accessed n times). A large part of a GEMM's performance and design goes -toward minimizing the inefficiency resulting from these redundant memory -accesses. - -Ultimately, once values are loaded into CPU registers, they cost nothing to -access, so as long as we can work within registers, this problem doesn't exist. 
-Thus, in order to be efficient, a GEMM's inner loops must wisely use the -available registers to do as much arithmetic work as possible before loading -more data from memory into registers. This means that -a GEMM implementation needs to have architecture-specific inner loops tailored -for architecture details such as the number of registers, and typically written -in assembly. This 'inner loops' architecture-specific component is referred to -as the GEMM kernel. (More details about kernels are in doc/kernels.txt). - -However, only small blocks can fit at a given time in registers, so at larger -scales one needs to repeatedly load blocks of matrices from memory, and -these accesses are redundant for the reason outlined above. The way that -one minimizes the resulting inefficiency is by organizing for cache locality, -so that most of these accesses hit the L1 cache, and most of the remaining -ones hit the L2 cache, etc. - -This is achieved by subdividing the matrices into blocks sized to fit in L2 -cache, and subdividing these blocks into sub-blocks sizes to fit in L1 cache, -and performing the matrix multiplication one such block at a time. - -In practice, it tends to pay off to "pack" input blocks for optimally -efficient traversal by the kernel, since they will be traversed multiple times. -"packing" means at least reordering the data layout for 1) simple access -patterns that fit the CPU's cache behavior (in particular, the cache line size), -and 2) simple loading into SIMD vector registers by the kernel. - -So a typical GEMM, in pseudo-code, tends to look like this: - -allocate(some_lhs_L2_block); -allocate(some_rhs_L2_block); -for (some_lhs_L2_block) { - pack(some_lhs_L2_block); - for (some_rhs_L2_block) { - pack(some_rhs_L2_block); - for (some_lhs_sub_block in some_lhs_L2_block) { - for (some_rhs_sub_block in some_rhs_L2_block) { - kernel(some_lhs_sub_block, some_rhs_sub_block); - } - } - } -} - - -Impact of low-precision computation on gemmlowp design -====================================================== - -Refer to doc/low-precision.txt for specifics of the low-precision-computation -paradigm and how it's implemented in gemmlowp. - -Inputs and outputs are matrices of uint8 values, but internally we are -accumulating int32 values, only converting them back to uint8 at the end. This -means that we need so store a block of int32 accumulators at a time. We compute -a block of the result in int32 accumulators and then we "unpack" it into the -destination matrix at once. In this way, we minimize the amount of memory used to -store int32 values at a given time. - -Because of that, besides the "pack" and "kernel" stages outlined above, a third -stage is needed in gemmlowp, which we call "unpack". Thus we arrive at the -3-stage computation scheme that gemmlowp uses: - - 1. Pack lhs/rhs blocks from the input matrices. - 2. Compute the product of the packed blocks, using the kernel. - 3. Unpack the result block into the output matrix. 
- -The pseudo-code overview of gemmlowp now looks like: - -allocate(some_lhs_L2_block); -allocate(some_rhs_L2_block); -// new: temp storage for int32 accums -allocate(some_int32_accumulators_block); -for (some_lhs_L2_block) { - pack(some_lhs_L2_block); - for (some_rhs_L2_block) { - pack(some_rhs_L2_block); - for (some_lhs_sub_block in some_lhs_L2_block) { - for (some_rhs_sub_block in some_rhs_L2_block) { - // new: pass int32 accums to kernel - kernel(&some_int32_accumulators_block, - some_lhs_sub_block, - some_rhs_sub_block); - } - } - // new: unpack int32 accums into destination matrix - unpack(some_int32_accumulators_block); - } -} - - -Exploring gemmlowp code -======================= - -The design outlined above can be readily matched to gemmlowp source code, -in particular in this file, which gives a simple GEMM implementation fitting in -one rather small function: - - internal/single_thread_gemm.h - -The reader can compare the above pseudo-code to the actual code in this file: - - for (int r = 0; r < rows; r += block_params.l2_rows) { - int rs = std::min(block_params.l2_rows, rows - r); - - PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth)); - - for (int c = 0; c < cols; c += block_params.l2_cols) { - int cs = std::min(block_params.l2_cols, cols - c); - - if (!pack_rhs_once) { - PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); - } - - Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs); - - auto result_block = result->block(r, c, rs, cs); - UnpackResult(&result_block, packed_result, packed_lhs, packed_rhs, depth, - result_offset, result_mult_int, result_shift); - } - } - -The files in internal/ fall into a few categories: - -There are two top-level GEMM implementations, - single_thread_gemm.h - multi_thread_gemm.h - -They both call into pack/compute/unpack stages implemented in the following files: - pack.h - compute.h - unpack.h - -The pack.h and unpack.h files contain generic templated code that can be overridden -by optimized code in template specializations; see the NEON optimized code here: - pack_neon.h - unpack_neon.h - -The compute stage contains generic code in compute.h that only calls into -optimized code through the Kernel::Run() entry point. Each kernel is basically just -as struct offering a Run() implementation; see the NEON kernels in: - kernel_neon.h - -More details about the interplay between these components can be found in this file: - doc/kernels.txt diff --git a/doc/kernels.txt b/doc/kernels.txt deleted file mode 100644 index 43dcb40..0000000 --- a/doc/kernels.txt +++ /dev/null @@ -1,176 +0,0 @@ - Kernels in gemmlowp - ******************* - - -Kernels provide an inner-loop implementation, and a format -========================================================== - -Here we assume familiarity with the concepts of kernels and of packing -as explained in doc/design.txt. - -gemmlowp is designed to be easily extensible to different architectures and -other low-level details, while achieving high performance. Thus a line had to -be drawn between the generic GEMM code and the specific parts that need to -be manually designed for each architecture, etc. The design choice made in -gemmlowp is to have easily swappable GEMM kernels. - -In itself, a GEMM kernel is just an implementation of the inner-most loop -in a GEMM (That inner-most loop has to be over the 'depth' dimension so as -to be able to accumulate into a small enough number of accumulators to fit -in registers). - -Thus, by itself, a GEMM kernel should be just a function computing a block -of GEMM. 
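Concretely, gemmlowp expresses this as the Run() entry point that every kernel implements (a condensed sketch of the KernelBase interface in internal/kernel.h; only the part relevant here is shown, and exact parameter types may differ slightly):

    #include <cstddef>
    #include <cstdint>

    struct KernelBase {
      // Accumulates 'run_depth' levels of depth of the packed lhs and rhs
      // blocks into the block of int32 accumulators at dst_ptr.
      virtual void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
                       std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
                       const std::uint8_t* rhs_ptr, std::size_t start_depth,
                       std::size_t run_depth) const = 0;
      virtual ~KernelBase() {}
    };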
- -However, GEMM kernels may need to differ not just in how they implement this -computation, but also in the format of data that they operate on. Indeed, -in order to maximize the ratio of arithmetic instructions to memory access -instructions, GEMM kernels want to handle blocks as wide as possible given -the number of registers of the CPU architecture. - -Thus, in order to allow efficient specialization to diverse architectures, -gemmlowp allows each GEMM kernel to dictate the format of data that it expects, -in addition to providing its inner-loop implementation. - -The former is given by a 'Format' typedef, and the latter by a 'Run' -method. - -A good example is to look at internal/kernel_neon.h, and specifically at -the NEONKernel12x4Depth2 kernel, which specifies its format as - - typedef KernelFormat, 3>, - KernelSideFormat, 1> > Format; - -The meaning of these terms is explained in the lengthy comment at the -top of internal/kernel.h. Here, they mean that this kernel handles at -each iteration (along the depth dimension): - - 3 'cells' of size 4x2 each of the lhs, so a total lhs block - of size 12x2 - - 1 'cell' of size 2x4 of the rhs. -In other words, this kernel handles 12 rows of the lhs and 4 columns of the -rhs, and handles two levels of depth at once. The 'cells' and 'CellFormat' -details the layout of these 12x2 and 2x4 blocks. - -This kernel then loads these 12x2 and 2x4 blocks and computes the corresponding -12x4 GEMM; for ease of reference let us paste the critical comment and code here: - - "loop_NEONKernel12x4Depth2_%=:\n" - - // Overview of register layout: - // - // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0). - // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7 - // (q1--q3). - // A 12x4 block of accumulators is stored in 32bit in q4--q15. - // - // +-----+-----+-----+-----+ - // |d0[0]|d0[1]|d0[2]|d0[3]| - // Rhs +-----+-----+-----+-----+ - // |d1[0]|d1[1]|d1[2]|d1[3]| - // +-----+-----+-----+-----+ - // - // | | | | | - // - // Lhs | | | | | - // - // +--+--+ - - - - +-----+-----+-----+-----+ - // |d2|d3| | q4 | q5 | q6 | q7 | - // |d2|d3| | q4 | q5 | q6 | q7 | - // |d2|d3| | q4 | q5 | q6 | q7 | - // |d2|d3| | q4 | q5 | q6 | q7 | - // +--+--+ - - - - +-----+-----+-----+-----+ - // |d4|d5| | q8 | q9 | q10 | q11 | - // |d4|d5| | q8 | q9 | q10 | q11 | - // |d4|d5| | q8 | q9 | q10 | q11 | - // |d4|d5| | q8 | q9 | q10 | q11 | - // +--+--+ - - - - +-----+-----+-----+-----+ - // |d6|d7| | q12 | q13 | q14 | q15 | - // |d6|d7| | q12 | q13 | q14 | q15 | - // |d6|d7| | q12 | q13 | q14 | q15 | - // |d6|d7| | q12 | q13 | q14 | q15 | - // +--+--+ - - - - +-----+-----+-----+-----+ - // - // Accumulator - - // Load 1 Rhs cell of size 2x4 - "vld1.8 {d0}, [%[rhs_ptr]:64]!\n" - - // Load 3 Lhs cells of size 4x2 each - "vld1.8 {d2}, [%[lhs_ptr]:64]!\n" - "vld1.8 {d4}, [%[lhs_ptr]:64]!\n" - "vld1.8 {d6}, [%[lhs_ptr]:64]!\n" - - // Expand Lhs/Rhs cells to 16 bit. 
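    // (This widening is what allows the vmlal.u16 multiply-accumulates
    // below to produce 32-bit products that go straight into the 32-bit
    // q4--q15 accumulators, with no risk of overflowing a 16-bit type.)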
- "vmovl.u8 q0, d0\n" - "vmovl.u8 q1, d2\n" - "vmovl.u8 q2, d4\n" - "vmovl.u8 q3, d6\n" - - // Multiply-accumulate, level of depth 0 - "vmlal.u16 q4, d2, d0[0]\n" - "vmlal.u16 q5, d2, d0[1]\n" - "vmlal.u16 q6, d2, d0[2]\n" - "vmlal.u16 q7, d2, d0[3]\n" - "vmlal.u16 q8, d4, d0[0]\n" - "vmlal.u16 q9, d4, d0[1]\n" - "vmlal.u16 q10, d4, d0[2]\n" - "vmlal.u16 q11, d4, d0[3]\n" - "vmlal.u16 q12, d6, d0[0]\n" - "vmlal.u16 q13, d6, d0[1]\n" - "vmlal.u16 q14, d6, d0[2]\n" - "vmlal.u16 q15, d6, d0[3]\n" - - // Multiply-accumulate, level of depth 1 - "vmlal.u16 q4, d3, d1[0]\n" - "vmlal.u16 q5, d3, d1[1]\n" - "vmlal.u16 q6, d3, d1[2]\n" - "vmlal.u16 q7, d3, d1[3]\n" - "vmlal.u16 q8, d5, d1[0]\n" - "vmlal.u16 q9, d5, d1[1]\n" - "vmlal.u16 q10, d5, d1[2]\n" - "vmlal.u16 q11, d5, d1[3]\n" - "vmlal.u16 q12, d7, d1[0]\n" - "vmlal.u16 q13, d7, d1[1]\n" - "vmlal.u16 q14, d7, d1[2]\n" - "vmlal.u16 q15, d7, d1[3]\n" - - // Loop. Decrement loop index (depth) by 2, since we just handled 2 - // levels of depth (Kernel::kDepth=2). - "subs %[run_depth], #2\n" - "bne loop_NEONKernel12x4Depth2_%=\n" - - - -Packing code adapts to the format chosen by the kernel -====================================================== - -As explained in doc/design.txt, gemmlowp starts by packing blocks of the -lhs and rhs matrices for optimally efficient traversal by the kernel. This -depends on fine details of the kernel format, in ways that can only be -efficiently handled by knowing these kernel format details at compile-time. - -This is the reason why all the code in internal/pack.h is templated in -the corresponding kernel format. - -The code in internal/pack.h isn't tightly optimized by itself, but it is -structured in such a way that the critical code is in a template, - PackingRegisterBlock, -that can easily be specialized to override the slow generic code with -fast specific packing code for specific formats, on specific platforms. - -See internal/pack_neon.h which provides NEON specializations of the -packing code for the particular kernel formats that are used by the NEON -kernels in internal/kernel_neon.h. - - -Wrapping up: how to optimize gemmlowp for a CPU architecture -============================================================ - -In conclusion, the key feature of gemmlowp when it comes to efficiently -supporting a specific CPU architecture, is that it allows to freely replace -the inner loop of the GEMM by providing one's own GEMM kernel, which is -also free to dictate its required data layout; each data layout then also -needs optimized packing code. The steps are thus: - 1) Freely design a GEMM kernel with a freely chosen data layout - 2) Implement the GEMM kernel, similar to internal/kernel_neon.h - 3) Implement the optimized packing code, similar to internal/pack_neon.h. diff --git a/doc/less-than-8-bit.txt b/doc/less-than-8-bit.txt deleted file mode 100644 index 1a1deaa..0000000 --- a/doc/less-than-8-bit.txt +++ /dev/null @@ -1,305 +0,0 @@ - Computation with less than 8 bits in gemmlowp - ********************************************* - - -Introduction -============ - -We assume familiarity with gemmlowp's low-precision uint8 computation -paradigm, which is described in doc/low-precision.txt. - -This document is about the possibility of further reducing precision -below 8 bits. - -That allows to get higher arithmetic throughput on some architectures, -at the cost of decreased accuracy. 
- - -Public interface -================ - - -The BitDepthSetting parameter in the EightBitIntGemm interface --------------------------------------------------------------- - -Accessing less-than-8-bit computation via the EightBitIntGemm is very -simple: EightBitIntGemm takes a BitDepthSetting enum -which allows to choose among a fixed set of supported bit-depth -combinations. - - -The BitDepthParams parameter in the public/gemmlowp.h interface ---------------------------------------------------------------- - -The public/gemmlowp.h interface exposes more extensive control over -quantization, by means of a BitDepthParams template parameter, -which is a type parameter, carrying information about: - 1. The LHS and RHS bit depth, which can be set arbitrarily and - independently; - 2. The 'RoundingStrategy', which is the heuristic used to choose - a rounding mode, based on the accumulation size (a.k.a. the - "depth" dimension of the Gemm). -Details can be seen in public/bit_depth.h. - - -How does BitDepth{Setting,Params} affect input/output uint8 matrix data? -------------------------------------------------------------------- - -Input/output matrix data is all uint8's, ranging from 0 to 255, regardless of -the BitDepth{Setting,Params}. - -So the BitDepth{Setting,Params} is only an internal detail. It only means to -allow gemmlowp to use lower precision internally, but the input/output data -format is unaffected. - -As far as the API contract goes, the only thing that the -BitDepth{Setting,Params} does is to relax the accuracy requirement. -With standard 8bit/8bit computation, gemmlowp is required to return the exact -result as specified in doc/low-precision.txt. With lower bit depths, gemmlowp -is no longer required to return an exact result. - - -Implementation -============== - -Here we refer to the 3 stages of computation as described in doc/design.txt, -namely: packing, computation kernel, unpacking. - -The general idea is that at the packing stage, we requantize input (Lhs/Rhs) -data to less-than-8-bit depths by scaling them, thus shrinking the range of -the packed matrix entries; for instance, if the Rhs bit depth is to be 5 bits -then packed Rhs matrix entries will be in the range [0 ... 31]. This then -allows the GEMM kernel to use narrower accumulators without risking overflow, -thus achieving higher arithmetic throughput. Finally, at the unpacking stage, -it only remains to scale the result values to compensate for the scalings -applied earlier. - -Let us go into more detail for each of those stages: - - -Packing stage -------------- - -The packing stage is where most of the work specific to the BitDepthParams -takes place. - -Here, we have to scale input matrix values from their original range of -[0 ... 255] to the range specified by the BitDepthParams, which is -[0 ... (2^N)-1] where N is the number of bits for the matrix at hand -(Lhs or Rhs). For example, for a bit depth of 5 bits, we need to scale -down to [0 ... 31]. - -This scaling is what we call "requantization". The pedantic name matches -the fact that this is actually quite nontrivial to do correctly i.e. -in such a way that the result accuracy will be good enough for real-world -applications. See the section below on requantization details. - -Concretely, this work happens in PackingRegisterBlock::Pack(), which calls -Requantize(). This is in internal/pack.h. This code can be overridden for -specific architectures, see internal/pack_neon.h. - -This requantization work is costly and makes packing slower. 
This means -that, at least in our approach, less-than-8-bit computation is only -interesting for large-enough, square-enough GEMMs where packing is only -a small fraction of the overall cost. In cases where packing overhead -is more prevalent (highly rectangular cases), less-than-8-bit is probably -a waste of time as long as we treat it as an internal computation detail. -What might help there, might be if we shrunk the input/output data format -to lower memory bandwidth usage. - - -Computation kernel stage ------------------------- - -In principle, the computation kernel stage simply doesn't have to care -about the bit depth at all. In fact, on architectures where we do not have -specific optimized kernels for less-than-8-bit cases, we simply use our -standard kernel there, and that's correct! - -However, while the kernel doesn't have to know about the fact that the -operands are on less than 8 bits, it can use that information to make -special optimizations that would be incorrect in the general 8-bit case -and become correct here thanks to the more restricted range of inputs. -That's the whole point of this less-than-8-bit computation idea. - -With Lhs entries guaranteed to be smaller than 2^N, and Rhs entries -guaranteed to be smaller than 2^M, each product is thus guaranteed to be -smaller than 2^(M+N). Thus, one may accumulate 2^(16-(M+N)) such products -and still be guaranteed that such an accumulator will be smaller than 2^16, -and thus can be stored as a uint16 without risking overflow. - -For example, in the L7R5 case, the Lhs enties are on 7 bits (N=7) and the -Rhs entries are on 5 bits (M=5), so each product fits in 12 bits and one can -thus accumulate 16 ( = 2^(16-12)) such products into uint16 accumulators -with no risk of overflow. - -This means that a computation kernel may use uint16 accumulators for -several loop iterations (16 in the above example), provided that it is -allowed to assume that inputs are in such restricted range. - -After this fixed number of loop iterations, the kernel must accumulate -the local uint16 accumulators back into global uint32 accumulators. - -On SIMD architectures with suitable uint16 arithmetic, this in principle -allows to multiply arithmetic throughput by up to 2x, since twice more -accumulators now fit in each SIMD vector register. This is partially offset -by the cost of accumulating back into global uint32 accumulators every -several loop iterations, but our experience on ARM NEON has been that -we still get quite close to a 2x speedup. See internal/kernel_neon.h, -specifically NEON32Kernel12x4Depth2Assuming12BitProducts. - - -Unpacking stage ---------------- - -At the unpacking stage, it only remains to scale the result values -to compensate for the scaling of the inputs. This is easier because -now we are expanding the range instead of shrinking it, so we don't -need to worry about ways to minimize a loss of accuracy. We simply -need to multiply result values by a constant fraction, rounding to nearest. - -Since the inputs were scaled by factors of (2^lhs_bits - 1)/255 and -(2^rhs_bits - 1)/255 respectively, the scaling of the outputs needs to be -by the following factor: - - 255 * 255 - ----------------------------------- - (2^lhs_bits - 1) * (2^rhs_bits - 1) - -This is done by a MultiplyByConstantFraction function, see internal/unpack.h - - -Requantization details -====================== - -Here we go into more detail on the Requantize() function used at the packing -stage to requantize input matrix data. 
See this function in internal/pack.h. - -It depends on the bit depth and on a rounding mode, and requantizes an input -value in [0 ... 255] to the range [0 ... (2^N)-1] specified by the bit depth N. - - -Naive, bad rounding, that's plainly biased ------------------------------------------- - -Naive and inaccurate ways to achieve this requantization include: - 1. By shifting bits rights by (8-N) bits; - 2. By multiplying by ((2^N) - 1) and dividing by 255. - -Both of those are biased in some way: 1. has the wrong "derivative", since it -approximates (((2^N) - 1) / 255) by ((2^N) / 256) ; 2. has bias since it -effectively implements rounding towards 0. - -In practice, both of the above requantization functions give results that are -too inaccurate in practice for the neural network that we tried (GoogLeNet). - -Round-to-nearest rounding: unbiased in principle but not in practice --------------------------------------------------------------------- - -The simplest fix is to avoid the bias in 2. by rounding-to-nearest instead -of rounding towards 0. This can be achieved by doing - - dst = (src * maxval + rounding_offset) / 255; - -Where maxval = ((2^N) - 1) is the highest requantized value, and the -rounding_offset can be set to - - rounding_offset = 127 - -to achieve rounding-to-nearest (while the above rounding towards 0 -corresponded to rounding_offset = 0). - -In principle, rounding-to-nearest is unbiased and optimal in various ways. - -In practice though, our input data is not random real numbers, but -already-quantized 8-bit values. That means that even in the best case, there -would be at most 255 different possible input values; in practice, we generally -see the input values distributed non-uniformly in that range, so that a majority -of input values tend to be in a much smaller range. See test/test_data.cc. - -Having a large part of the input values in a very small finite set, means that -the corresponding rounding errors are also in a very small finite set, which -can be small enough that the mean of these rounding errors is significantly -different from 0. That rounding-to-nearest is "unbiased" only means that over -a sufficiently large set of input values, the bias would become arbitrarily -close to 0; here, the set of input values is effectively small enough that the -resulting bias is significant. - -This leads to biasing the matrix product entries, resulting in an error that -grows linearly with the depth dimension of the GEMM. - - -Probabilistic rounding: unbiased even on small finite input distributions -------------------------------------------------------------------------- - -To address that, we can instead use probabilistic rounding. The idea is that -for instance if we have to round the value 3.8 to the nearest integer, we can -round it to 3 with 20% probability and to 4 with probability 80%. If that value -3.8 occurs many times, the mean requantized value will thus tend to 3.8. - -This amounts to keeping the above requantization formula, - - dst = (src * maxval + rounding_offset) / 255; - -but now the rounding_offset is a random value in [0 .. 254]. - -This guarantees zero bias no matter how small the distribution of input values -is. - -On the other hand, the variance of the error term here is higher than with -rounding-to-nearest --- one can check that it is 2x higher. - -So the error term coming from the Central Limit Theorem, which grows with -the square root of the accumulator depth i.e. the GEMM depth, -will be 2x higher here. 
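To make the formula concrete, here is a minimal scalar sketch of this requantization (for illustration only; the actual Requantize() code in internal/pack.h is vectorized and templatized on the bit depth):

    #include <cstdint>

    // Requantizes a uint8 value to dst_bits bits of precision.
    // rounding_offset = 127 gives rounding-to-nearest; a value drawn
    // uniformly from [0 .. 254] gives probabilistic rounding.
    std::uint8_t Requantize(std::uint8_t src, int dst_bits,
                            std::uint8_t rounding_offset) {
      const int maxval = (1 << dst_bits) - 1;
      return static_cast<std::uint8_t>((src * maxval + rounding_offset) / 255);
    }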
- -Still, for large enough GEMM depth, that is better than rounding-to-nearest -which has an error term growing linearly with the GEMM depth. - - -Switching between rounding-to-nearest and probabilistic rounding ----------------------------------------------------------------- - -Thus, for fixed input values and bit depths, we expect that probabilistic -rounding will give more accurate results for large enough GEMM depths, while -rounding-to-nearest will be more accurate for smaller GEMM depths. - -That is why use switch between these rounding modes based on GEMM depth, -see ChooseRoundingMode in internal/bit_depth_util.h. - -It is based on a constant, kProbabilisticRoundingThreshold, defined -in internal/common.h and empirically determined. See the comment there. -It would be nice to better understand the statistics here and come up -with better heuristics for this switching. - - -Choice of pseudorandom number generator ---------------------------------------- -We provide two PRNGs. The first is an 8-bit Xorshift. -It is fast, naturally produces values ranging -over an interval of width 255, which is what we need here (as opposed -to an interval of width 256), and turns out, from empirical tests, -to produce better results than a linear congruential generator (LCG). -That's unfortunate, as a 8-bit LCG performs better (we confirmed that -on various ARM devices) but we need as perfect un-biased-ness as we can -get. - -The second is an "add-mod" sequence generator, which generates -non-random values in the sequence x = (x + 97) % 255. This -generates a low-discrepancy sequence that minimizes the "clumpiness" -of the random offsets (Thus, for example, quantizing a 3x3 matrix will -have a maximum additive error of about 200 from the random offsets). -While not random, this sequence performs well empirically for many -quantizations. (For information about why 97 is a good value, see -https://en.wikipedia.org/wiki/Low-discrepancy_sequence#Additive_recurrence -and http://mollwollfumble.blogspot.com/2011/03/subrandom-numbers.html -97/255 = 0.38; 0.382 is the best choice. For discrete numbers, -the choice must be relatively prime to the modulus. 97 is prime, -so it is safely relatively prime to 255. 107 is another near-optimal -choice. - -The low-discrepancy sequence generator is the default. - -More details and results are given in a comment on the default -PRNG in internal/pack.h. Interested users can change the -PRNG used by setting DefaultRoundingGenerator in bit_depth_util.h. diff --git a/doc/low-precision.md b/doc/low-precision.md index 97b1498..b4ff59e 100644 --- a/doc/low-precision.md +++ b/doc/low-precision.md @@ -26,7 +26,7 @@ computation paradigms, by allowing the user to set up a chain of transformations to be performed on internal 32bit accumulators to obtain the final outputs. The public entry point in [public/gemmlowp.h](../public/gemmlowp.h) allowing to -set un an arbitrary output pipeline is `GemmWithOutputPipeline`. +set up an arbitrary output pipeline is `GemmWithOutputPipeline`. 
Refer to [quantization.md](quantization.md) for details of how one gets from first principles to the actual output pipelines to assemble for successful diff --git a/doc/low-precision.txt b/doc/low-precision.txt deleted file mode 100644 index 893961f..0000000 --- a/doc/low-precision.txt +++ /dev/null @@ -1,159 +0,0 @@ - The low-precision paradigm in gemmlowp, and how it's implemented - **************************************************************** - - -Introduction -============ - -"Low-precision" means that the input and output matrix entries are integers -on at most 8 bits. The scalar type is uint8_t. - -This isn't the same as just doing plain matrix arithmetic over uint8_t, -because that would overflow. To avoid overflow, we internally accumulate -results on more than 8 bits, and at the end we keep only some significant -8 bits. This relies on the caller providing suitable offset/multiplier/shift -parameters, which effectively govern how we extract some significant 8 bit -from our more-than-8bit temporary accumulators. - -Gemmlowp supports further reducing precision below 8 bits. That is not -the subject of this document; for that, refer to doc/less-than-8-bit.txt. - - -The low-precision paradigm -========================== - -gemmlowp is an implementation of the EightBitIntGemm paradigm, where quantized -matrix multiplication takes the following input parameters: - - the lhs matrix of uint8 quantized values - - the rhs matrix of uint8 quantized values - - the following int32 "quantization parameters", which control how the - uint8 quantized values in the matrices are to be interpreted during the - matrix computation: - - lhs_offset - - rhs_offset - - result_offset - - result_mult_int - - result_shift - -The mathematical expression to be computed is the result of the following steps: - 1. Cast lhs entries from uint8 to int32 and add lhs_offset to each of them. - 2. Cast rhs entries from uint8 to int32 and add rhs_offset to each of them. - 3. Compute the int32 matrix product of the resulting lhs times rhs. - 4. Add result_offset to each entry of the result. - 5. Multiply each entry of the result by the following fraction, and round - to the nearest integer: - - result_mult_int - --------------- (1) - 2^result_shift - - 6. Clamp the resulting int32 values to the [0..255] range and cast to uint8. - -Thus the caller of this interface is expected to have precomputed suitable -quantization parameters - -The rationale for these parameters is as follows: - - The three offsets may improve quantization accuracy in cases where the - range of values is limited, and they also conveniently allow to reduce all - eight combinations of signednesses to just the unsigned*unsigned->unsigned - case. One may at first glance worry that these offsets would incur - substantial overhead to the GEMM computation, but that is actually not the - case thanks to a trick described below (see "Efficient handling of - offsets"). - - The result_mult_int and result_shift parameters allow approximating - arbitrarily closely any real multiplier, as a fraction of the form given - in (1) above, without using floating-point arithmetic and without using - a division instruction (only a right shift). - - -Efficient handling of offsets -============================= - -At first glance it may seem like the above-described quantized computation -scheme requires adding the lhs_offset and rhs_offset to each of the lhs and -rhs matrix entries. 
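Spelled out naively, that would look something like the following scalar sketch (for illustration only, not gemmlowp code; it covers steps 1-3 of the paradigm and leaves out the result_offset/mult/shift/clamp steps):

    #include <cstdint>

    // Naive reference: the offsets are applied to every entry inside the
    // inner loop. Matrices are row-major; 'result' receives the raw int32
    // accumulators.
    void NaiveGemmWithOffsets(const std::uint8_t* lhs, const std::uint8_t* rhs,
                              std::int32_t* result, int rows, int depth,
                              int cols, std::int32_t lhs_offset,
                              std::int32_t rhs_offset) {
      for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
          std::int32_t accum = 0;
          for (int d = 0; d < depth; d++) {
            accum += (lhs[r * depth + d] + lhs_offset) *
                     (rhs[d * cols + c] + rhs_offset);
          }
          result[r * cols + c] = accum;
        }
      }
    }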
- -Doing that in the GEMM kernel would incur substantial overhead: - - It would mean extra arithmetic work in the GEMM kernel; - - It would require storing the lhs_offset and rhs_offset in registers, - which would eat into the register space available for the rest of the - GEMM kernel. - -One may then consider adding the lhs_offset and rhs_offset once and for all -to lhs and rhs blocks, in a GEMM implementation operating on one lhs block -and one rhs block at a time. However, doing so would require storing lhs and -rhs blocks in 32 bit (or at least in 16 bit in real-world cases), which would -partially negate the memory bandwidth benefits of low-precision computation. - -Fortunately, there is another way to handle these offsets that has none of -the costs of the approaches described above. The idea is as follows. - -Let P denote the matrix shaped like lhs, but filled with 1's. -Let Q denote the matrix shaped like rhs, but filled with 1's. - -Adding lhs_offset to each entry of lhs, means adding lhs_offset * P to lhs. -Adding rhs_offset to each entry of rhs, means adding rhs_offset * Q to rhs. - -Thus, as far as handling lhs_offset and rhs_offset goes, the matrix product to be -computed is: - - (lhs + lhs_offset * P) * (rhs + rhs_offset * Q) - -Expanding this (using distributivity of matrix multiplication over addition), -we see that the above product is equal to the following sum of 4 terms: - - lhs * rhs (2) - + lhs_offset * P * rhs - + lhs * rhs_offset * Q - + lhs_offset * rhs_offset * P * Q - -The first term, lhs * rhs, is just the matrix multiplication ignoring the -offsets, i.e. as if lhs_offset==rhs_offset==0. Our claim here is that this -is all what we have to compute in the GEMM kernel. - -In the second term, lhs_offset * P * rhs, notice that since P is filled -with 1's, P * rhs has all its rows equal to each other, and equal to the -row-vector of sums of all the entries in each column of rhs. - -Thus, we can compute the second term, lhs_offset * P * rhs, by summing -each column of rhs. This produces a single row-vector, and in order to add the -second term, we simply need to add this row-vector (multiplied by lhs_offset) -to each row of the result. This is just a rank one update of the result -(equivalently, the second term is a rank one matrix), and we can efficiently -store it as a single vector. - -The third term, lhs * rhs_offset * Q, is entirely similar to the second one, -and can be similarly computed by summing each row of lhs, storing this in a -single column-vector, and later multiplying these sums by rhs_offset. - -The fourth term is a single constant, repeated into all the entries of the -matrix. The matrix P * Q is filled with the single constant value 'depth' -(the depth the the matrix product i.e. the number of columns of the lhs). -Thus the fourth term is simply the rank zero update adding this constant -to each matrix entry: - - lhs_offset * rhs_offset * depth - - -Implementation of this technique in gemmlowp -============================================ - -In gemmlowp, at the packing stage (where we traverse blocks of the lhs and rhs -to prepare them for efficient repeated traversal by the kernel), we compute -the sum of each row of the lhs block and the sum of each column of the rhs -block. - -See in internal/pack.h, in the PackedSideBlock class, the following member: - - // Handle on the additional buffer backing the vector of sums of slices - // associated with this block. Owned. 
- Allocator::Handle sums_of_each_slice_handle_; - -sums_of_each_slice_handle_ is the handle to the buffer allocated to store -the vector containing sums of rows of lhs, or of sums of columns of rhs. - -After these rank one updates have been computed at the packing stage, they are -ignored at the compute kernel stage, since that stage is only concerned -with the first of the four terms in (2); they are only used at the unpacking -stage. See the default/reference implementation, UnpackResultImpl, in -internal/unpack.h. diff --git a/doc/output.md b/doc/output.md index f9985a8..8620fbe 100644 --- a/doc/output.md +++ b/doc/output.md @@ -10,8 +10,8 @@ the output pipeline, so as to allow different users to implement different quantization paradigms. See [low-precision.md](low-precision.md) and [quantization.md](quantization.md). -Besides implementing a quantization paradigms, the other thing that output -pipelines are good for, is implementing fused operations where a matrix +Besides implementing a quantization paradigm, the other thing that output +pipelines is good for, is implementing fused operations where a matrix multiplication feeds into other operations applied to its result, without additional array traversals. For instance, when implementing neural network inference, one might have a Convolutional layer with a bias-addition and an diff --git a/doc/packing.txt b/doc/packing.txt deleted file mode 100644 index 2b94512..0000000 --- a/doc/packing.txt +++ /dev/null @@ -1,206 +0,0 @@ - The packing stage in gemmlowp - ***************************** - - -Introduction -============ - -We assume familiarity with doc/design.txt and with the overall -3 stages of computations described there: packing, kernel, unpacking. - -This page goes into more details about the first stage: packing. - -We also assume familiarity with doc/kernel.txt as it describes the -packed format requirements that the kernels expect, and that forms -basically the contract that the packing stage must honor. - -Some parts below also assume familiarity with doc/low-precision.txt -as the packing stage also has to compute the vectors of sums or columns as -described there. - - -The storage order of packed blocks, partly hidden behind sequential access -========================================================================== - -As explained in doc/design.txt, the primary purpose of packing is to -ensure that when the kernel traverses Lhs/Rhs matrix data, it can do -so efficiently thanks to having the data stored in an order that is -as similar as possible to the order in which the compute stage has to -traverse this data. - -This traversal order is nontrivial for the reasons outlined in -doc/design.txt: at the innermost level, one tries to work within registers -as much as possible; at the next level, one tries to stay within L1 cache -as much as possible. The packed blocks that we handle are supposed -to fit entirely in L2 cache. - -Thus it has become standard in GEMM design to describe complicated -"Z-order" or "fractal order" storage for packed blocks. - -However, we should keep in mind that the whole point of the packed storage -order is to be as similar as possible to the order of traversal during -the compute stage. The storage order doesn't matter in itself; the only -thing that matters is simple access patterns during the compute stage. 
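To see what "simple access patterns" means in practice, here is roughly how the compute stage consumes a packed block, condensed from ComputeImpl::ComputeRun in internal/compute.h:

    // Seek both packed blocks to the start of the run, then let the kernel
    // read them strictly sequentially from there.
    packed_lhs_.seek_run(start_row, start_depth);
    packed_rhs_.seek_run(start_col, start_depth);
    kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(),
                packed_result_block.cols_stride(), packed_lhs_.current_data(),
                packed_rhs_.current_data(), start_depth, depth);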
- -This suggests the following approach to implementing packing: take the -exact same hierarchy of nested loops of the compute stage, drop the loops -that are not relevant to the side (Lhs or Rhs) being packed, and try to use -mostly sequential access to the destination packed data. - -This hierarchy of nested loops can be seen in PackSideBlockImpl -(PackL2, PackL1, PackRun), compare to the similar hierarchy of loops -in internal/compute.h. - -In this way, the more intricate small-scale details or the packed data format -never need to be made explicit (which would be complicated). We still use -some "seeking" but only at larger scales, where the storage order is less\ -complicated to describe. - - -Sequential access to PackedSideBlock data ------------------------------------------ - -See PackedSideBlock in internal/pack.h, specifically the following data -members: - - // Handle on the buffer backing this packed block. Owned. - Allocator::Handle data_handle_; - -and: - - // pos_ is the current position in the buffer, which we access - // sequentially, like a file. - // The idea is that we pack data in the same order as it is - // going to be traversed during the computation, which for - // cache-friendliness reasons is complicated to random-access, - // as the offsets calculations would be intricate. So we - // give up random-access addressing, and instead content ourselves - // with sequential access. - // - // pos_ is mutable because during the computation we will want to - // be able to iterate on the data in a const PackedSideBlock. - mutable int pos_; - -The methods exposing sequential access are: - - std::uint8_t* current_data() { - return allocator_->GetPointer(data_handle_) + pos_; - } - -and: - - void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; } - - void seek_forward_n_cells(int n) const { - pos_ += n * KernelSideFormat::Cell::kSize; - } - - -Random access to PackedSideBlock data at larger scales ------------------------------------------------------- - -We still need some random access at larger scales (with high granularity), -which is unavoidable since GEMM is O(n^3) and has to traverse each of the -O(n^2) inputs O(n) times. - -The watershed between sequential access and random access is at the level -of a 'Run'. Throughout gemmlowp we consistently use the term 'Run' to refer -to the innermost GEMM loop in the depth dimension. That's the critical -inner loop that must be as fast as possible, thus for which we absolutely -want sequential access to packed data so that the storage order is optimal -by construction. At larger scales i.e. between runs, we accept that -the storage order is less optimal and since it's also less intricate, it's -not too hard to implement random access there. - -This is done by the seek_run method: - - void seek_run(int start_width, int start_depth) const { - int kernel_run_depth = - std::min(params_.l1_depth, params_.l2_depth - start_depth); - pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth; - } - -We see that the formula involves the l1_depth parameter, which is how the -packed storage order depends on L1 cache size. Again, the whole packed -block is supposed to fit in L2 cache. - - -The innermost loop of the packing stage, PackRun, and PackingRegisterBlock -========================================================================== - -Keeping with our consistent usage of the term 'Run' throughout gemmlowp, -the innermost loop is called PackRun(). 
- -Here we recall a very important principle that was explained in -doc/kernels.txt: the kernel is free to dictate the precise data format -that it expects; the packing code has to honor it. So there's an asymmetry -here: the kernel is the master, the packing is the slave. That's why the -packing code is templatized in the KernelSideFormat. At larger scales, -the packing is independent of kernel format details, but inside PackRun is -where we take care of the small-scale details that do depend on the kernel -format details. That's why it's a good thing that we only need sequential -access here, as it would be very complicated to spell out random access -at this scale. - -Anyway, PackRun. - -Since it is the critical inner loop, it is what we want to allow specializing -for particular CPU architectures. To allow that, we handle at a time blocks -of fixed dimensions, that is intended to be friendly enough to optimization. -These blocks are PackingRegisterBlock's and their dimensions are: - width = KernelWidth - depth = kRegisterSize -See doc/kernels.txt and internal/kernel.h for the former, and internal/common.h -for the latter. - -See the comments around PackingRegisterBlock in internal/pack.h: - -// A PackingRegisterBlock is a small fixed-size block of a matrix being -// packed. This class is the generic non-optimized implementation, -// it is inherited by the generic implementation of PackingRegisterBlock, -// which may be overriden by template specialization. Overriding it is how -// one may provide optimized packing code paths. -// -// The packing of a block proceeds in two steps: -// 1. Ensuring that we have a complete block of source data, i.e. a block of -// the compile-time prescribed size. This is where we handle unaligned -// boundaries: if we don't have a complete block of source data, then -// we copy and zero-extend it into a local temporary (complete_src_), -// see MakeCompleteSrc. In the generic case, we do have a complete block, -// so we just use it in-place, see UseCompleteSrcInPlace. -// 2. Packing a complete block into the destination, see Pack. This is the -// most critical part, so it's convenient that unaligned boundaries have -// already been handled in step 1. - - -Other things that the packing stage has to do -============================================= - -Besides storing matrix entries in a suitable order, the packing stages also has -two other things to do. - -First, packing has to compute the vectors of sums of entries along the depth -dimension. If this is any mysterious, read doc/low-precision.txt. These will -only be used at the unpacking stage. - -Second, if the BitDepthSetting requires less than 8 bit of precision, then at -the packing stage we have to requantize inputs accordingly. -See doc/less-than-8-bit.txt for details. This is the Requantize() function. - - -Specialized packing paths for specific formats on specific CPU architectures -============================================================================ - -Please refer to internal/pack_neon.h for examples of doing that. The piece -of code to be specialized is PackingRegisterBlock. However, inside of it, -only the Pack() method typically needs to be specialized (the rest is unlikely -to be critical). So one typically specializes PackingRegisterBlock but still -inheriting PackingRegisterBlockBase to keep the generic stuff, and then one -typically wants to override the Pack() method. 
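Schematically, such a specialization looks like this (a sketch with made-up type names; see internal/pack_neon.h for the real ones):

    // 'SomeSrcMapType' and 'SomeKernelSideFormat' are placeholders for
    // whatever source map and kernel-side format the optimized path targets.
    template <>
    class PackingRegisterBlock<SomeSrcMapType,
                               PackedSideBlock<SomeKernelSideFormat> >
        : public PackingRegisterBlockBase<
              SomeSrcMapType, PackedSideBlock<SomeKernelSideFormat> > {
     public:
      // Only the critical step is overridden; MakeCompleteSrc() and the
      // rest of the generic machinery are inherited from the base class.
      void Pack(PackedSideBlock<SomeKernelSideFormat>* dst, int start_width) {
        // Architecture-specific (e.g. NEON) packing of one complete
        // register block goes here.
      }
    };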
- -Template specialization for the right template parameters is how one specifies -in which case a given path is to be used in place of the generic packing code. - -It is entirely possible to set the value of kRegisterSize differently based on -the CPU architecture (for example, 32 on x86 with AVX) as long as all the -specialized packing paths used on that CPU architecture are consistent with it. diff --git a/eight_bit_int_gemm/eight_bit_int_gemm.cc b/eight_bit_int_gemm/eight_bit_int_gemm.cc index 8113bf3..512c483 100644 --- a/eight_bit_int_gemm/eight_bit_int_gemm.cc +++ b/eight_bit_int_gemm/eight_bit_int_gemm.cc @@ -307,7 +307,7 @@ void EightBitIntGemm(bool transpose_a, bool transpose_b, bool transpose_c, std::int32_t b_offset, int ldb, std::uint8_t* c, std::int32_t c_offset, std::int32_t c_mult_int, std::int32_t c_shift, int ldc, BitDepthSetting bit_depth) { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::EightBitIntGemm()); GemmContext* context = GetOrCreateGlobalContext(); #if defined(GEMMLOWP_USE_META_FASTPATH) && defined(GEMMLOWP_NEON) @@ -344,7 +344,7 @@ void EightBitIntGemm(bool transpose_a, bool transpose_b, bool transpose_c, const std::uint8_t* b, std::int32_t b_offset, std::int32_t ldb, float* c, float c_offset, std::int32_t ldc, BitDepthSetting bit_depth) { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::EightBitIntGemm()); GemmContext* context = GetOrCreateGlobalContext(); #if defined(GEMMLOWP_USE_META_FASTPATH) && defined(GEMMLOWP_NEON) @@ -405,13 +405,13 @@ void EightBitIntGemm(bool transpose_a, bool transpose_b, bool transpose_c, } void SetMaxNumThreads(int n) { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::EightBitIntGemm()); GemmContext* context = GetOrCreateGlobalContext(); context->set_max_num_threads(n); } void FreePersistentResources() { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::EightBitIntGemm()); DestroyGlobalContext(); DestroyGlobalScratch(); } diff --git a/internal/allocator.h b/internal/allocator.h index da325a4..3a6f077 100644 --- a/internal/allocator.h +++ b/internal/allocator.h @@ -41,23 +41,6 @@ #include "common.h" -#if defined(__ANDROID__) -#include -// The 18 here should be 16, but has to be 18 for now due -// to a Google-internal issue. 
-#if __ANDROID_API__ < 18 -#include -#define GEMMLOWP_USE_MEMALIGN -#endif -// posix_memalign is missing on some 4.1 x86 devices -#if __ANDROID_API__ == 18 -#ifdef GEMMLOWP_X86_32 -#include -#define GEMMLOWP_USE_MEMALIGN -#endif -#endif -#endif - namespace gemmlowp { enum class TypeId : std::uint8_t { Uint8, Int8, Uint16, Int16, Uint32, Int32 }; @@ -115,13 +98,7 @@ class Allocator { if (reserved_bytes_ > storage_size_) { DeallocateStorage(); storage_size_ = RoundUpToPowerOfTwo(reserved_bytes_); -#ifdef GEMMLOWP_USE_MEMALIGN - storage_ = memalign(kAlignment, storage_size_); -#else - if (posix_memalign(&storage_, kAlignment, storage_size_)) { - storage_ = nullptr; - } -#endif + storage_ = aligned_alloc(kAlignment, storage_size_); } ReleaseBuildAssertion(!storage_size_ || storage_, "allocation failure"); @@ -192,7 +169,7 @@ class Allocator { private: void DeallocateStorage() { assert(!committed_); - free(storage_); + aligned_free(storage_); storage_size_ = 0; } diff --git a/internal/block_params.h b/internal/block_params.h index b2fc3ff..aedd261 100644 --- a/internal/block_params.h +++ b/internal/block_params.h @@ -43,13 +43,12 @@ struct BlockParams { int l2_depth; template - void Init(int rows, int cols, int depth, int num_threads, - int l1_bytes_to_use, int l2_bytes_to_use, float l2_rhs_factor) { + void Init(int rows, int cols, int depth, int num_threads, int l1_bytes_to_use, + int l2_bytes_to_use, float l2_rhs_factor) { FindL2BlockSizes(rows, cols, depth, num_threads, - l2_bytes_to_use, l2_rhs_factor, - &l2_rows, &l2_cols, &l2_depth); - FindL1BlockSizes(l2_rows, l2_cols, l2_depth, - l1_bytes_to_use, + l2_bytes_to_use, l2_rhs_factor, &l2_rows, + &l2_cols, &l2_depth); + FindL1BlockSizes(l2_rows, l2_cols, l2_depth, l1_bytes_to_use, &l1_rows, &l1_cols, &l1_depth); } @@ -61,6 +60,10 @@ struct BlockParams { int l2_rows = 0; int l2_cols = 0; int l2_depth = 0; + + int per_thread_rows = + std::max(1, RoundUp(rows) / num_threads); + // No L2 blocking in the depth dimension at the moment. // Too much loss of accuracy due to storing intermediate results in // low precision. @@ -81,15 +84,15 @@ struct BlockParams { // dimension concerns only the LHS. Blocking only RHS matrix for L2 enhances // the performance on x86. if (l2_rhs_factor == 1.0f) { - l2_rows = RoundUp(rows); + l2_rows = RoundUp(per_thread_rows); } else { int max_cache_friendly_l2_rows = std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) / (num_threads * (l2_depth + 4 * l2_cols))); - int min_l2_rows_blocks = - std::max(1, CeilQuotient(rows, max_cache_friendly_l2_rows)); - l2_rows = - RoundUp(CeilQuotient(rows, min_l2_rows_blocks)); + int min_l2_rows_blocks = std::max( + 1, CeilQuotient(per_thread_rows, max_cache_friendly_l2_rows)); + l2_rows = RoundUp( + CeilQuotient(per_thread_rows, min_l2_rows_blocks)); } *out_l2_rows = l2_rows; diff --git a/internal/common.h b/internal/common.h index 511809d..9de151b 100644 --- a/internal/common.h +++ b/internal/common.h @@ -18,7 +18,8 @@ #ifndef GEMMLOWP_INTERNAL_COMMON_H_ #define GEMMLOWP_INTERNAL_COMMON_H_ -#include +#include "../internal/platform.h" +#include "../profiling/pthread_everywhere.h" #include #include @@ -112,12 +113,22 @@ #define GEMMLOWP_SSE3_64 #endif +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#include +#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __msan_unpoison +#elif __has_feature(address_sanitizer) +#include +#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __asan_unpoison_memory_region +#endif +#endif + #endif // GEMMLOWP_ALLOW_INLINE_ASM // Detect Android. 
Don't conflate with ARM - we care about tuning // for non-ARM Android devices too. This can be used in conjunction // with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86 CPUs. -#if defined(__ANDROID__) +#if defined(__ANDROID__) || defined(ANDROID) #define GEMMLOWP_ANDROID #endif @@ -205,7 +216,7 @@ inline void Prefetch(const void* ptr) { // leaving __builtin_prefetch a no-op on this architecture. // For our purposes, "pldl1keep" is usually what we want, meaning: // "prefetch for load, into L1 cache, using each value multiple times". - asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) : ); + asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :); #elif defined \ __GNUC__ // Clang and GCC define __GNUC__ and have __builtin_prefetch. __builtin_prefetch(ptr); @@ -251,6 +262,17 @@ struct IsPowerOfTwo { static const bool value = !(N & (N - 1)); }; +template +void MarkMemoryAsInitialized(T* ptr, int size) { +#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED + GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast(ptr), + size * sizeof(T)); +#else + (void)ptr; + (void)size; +#endif +} + } // namespace gemmlowp #endif // GEMMLOWP_INTERNAL_COMMON_H_ diff --git a/internal/compute.h b/internal/compute.h index bbc9e2a..4417507 100644 --- a/internal/compute.h +++ b/internal/compute.h @@ -63,6 +63,19 @@ class ComputeImpl { } private: + static void MarkPackedResultBlockAsInitialized( + const MatrixMap& packed_result_block) { +#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED + for (int col = 0; col < packed_result_block.cols(); col++) { + MarkMemoryAsInitialized( + packed_result_block.data() + col * packed_result_block.cols_stride(), + packed_result_block.rows()); + } +#else + (void)packed_result_block; +#endif + } + void ComputeRun(int start_row, int start_col, int start_depth, int depth) GEMMLOWP_NOINLINE { packed_lhs_.seek_run(start_row, start_depth); @@ -72,6 +85,7 @@ class ComputeImpl { kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(), packed_result_block.cols_stride(), packed_lhs_.current_data(), packed_rhs_.current_data(), start_depth, depth); + MarkPackedResultBlockAsInitialized(packed_result_block); } void ComputeL1(int start_row, int rows, int start_col, int cols, diff --git a/internal/kernel.h b/internal/kernel.h index 4d006af..825a7f3 100644 --- a/internal/kernel.h +++ b/internal/kernel.h @@ -183,6 +183,7 @@ inline const char* CellOrderName(CellOrder o) { // Returns the offset into a cell, at which a given coefficient is stored. template inline int OffsetIntoCell(int w, int d) { + const int size = CellFormat::kWidth; switch (CellFormat::kOrder) { case CellOrder::DepthMajor: return w + d * CellFormat::kWidth; @@ -190,7 +191,6 @@ inline int OffsetIntoCell(int w, int d) { return d + w * CellFormat::kDepth; case CellOrder::Diagonal: assert(CellFormat::kWidth == CellFormat::kDepth); - static const int size = CellFormat::kWidth; return ((size + w - d) * size + d) % (size * size); default: assert(false); diff --git a/internal/kernel_default.h b/internal/kernel_default.h index 7ed55b8..7037bda 100644 --- a/internal/kernel_default.h +++ b/internal/kernel_default.h @@ -88,6 +88,10 @@ GEMMLOWP_SET_DEFAULT_KERNEL(false, false, SSE4_64_Kernel12x4Depth2) // SIMD is not available on this platform. The slow fallback will be used. // Don't require GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK because there's nothing // the user can do about it. +#elif defined __powerpc__ +// There is currently no fast kernel using SIMD instructions on POWER. 
Don't +// require GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK because there's nothing the user +// can do about it. #else #error \ "SIMD not enabled, you'd be getting a slow software fallback. Consider \ diff --git a/internal/multi_thread_gemm.h b/internal/multi_thread_gemm.h index 0234b26..df7387a 100644 --- a/internal/multi_thread_gemm.h +++ b/internal/multi_thread_gemm.h @@ -19,8 +19,6 @@ #ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ #define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ -#include -#include #include #include "single_thread_gemm.h" @@ -58,8 +56,10 @@ inline int Do256NOPs() { #undef GEMMLOWP_NOP inline void WriteBarrier() { -#ifdef GEMMLOWP_ARM_32 +#if defined(_MSC_VER) MemoryBarrier(); +#elif defined(GEMMLOWP_ARM_32) + asm volatile("" ::: "memory"); #elif defined(GEMMLOWP_ARM_64) asm volatile("dmb ishst" ::: "memory"); #elif defined(GEMMLOWP_X86) @@ -70,8 +70,10 @@ inline void WriteBarrier() { } inline void ReadBarrier() { -#ifdef GEMMLOWP_ARM_32 +#if defined(_MSC_VER) MemoryBarrier(); +#elif defined(GEMMLOWP_ARM_32) + asm volatile("" ::: "memory"); #elif defined(GEMMLOWP_ARM_64) asm volatile("dmb ishld" ::: "memory"); #elif defined(GEMMLOWP_X86) @@ -148,10 +150,16 @@ T WaitForVariableChange(volatile T* var, T initial_value, pthread_cond_t* cond, class BlockingCounter { public: BlockingCounter() - : cond_(PTHREAD_COND_INITIALIZER), - mutex_(PTHREAD_MUTEX_INITIALIZER), - count_(0), - initial_count_(0) {} + : count_(0), + initial_count_(0) { + pthread_cond_init(&cond_, nullptr); + pthread_mutex_init(&mutex_, nullptr); + } + + ~BlockingCounter() { + pthread_cond_destroy(&cond_); + pthread_mutex_destroy(&mutex_); + } // Sets/resets the counter; initial_count is the number of // decrementing events that the Wait() call will be waiting for. @@ -187,7 +195,15 @@ class BlockingCounter { void Wait() { ScopedProfilingLabel label("BlockingCounter::Wait"); while (count_) { - MemoryBarrier(); +#ifdef GEMMLOWP_USE_BUSYWAIT + ReadBarrier(); +#else + // This is likely unnecessary, but is kept to ensure regressions are not + // introduced. +#ifndef _WIN32 + asm volatile("" ::: "memory"); +#endif +#endif const std::size_t count_value = count_; if (count_value) { WaitForVariableChange(&count_, count_value, &cond_, &mutex_); @@ -222,16 +238,18 @@ class Worker { explicit Worker(BlockingCounter* counter_to_decrement_when_ready) : task_(nullptr), - state_cond_(PTHREAD_COND_INITIALIZER), - state_mutex_(PTHREAD_MUTEX_INITIALIZER), state_(State::ThreadStartup), counter_to_decrement_when_ready_(counter_to_decrement_when_ready) { + pthread_cond_init(&state_cond_, nullptr); + pthread_mutex_init(&state_mutex_, nullptr); pthread_create(&thread_, nullptr, ThreadFunc, this); } ~Worker() { ChangeState(State::ExitAsSoonAsPossible); pthread_join(thread_, nullptr); + pthread_cond_destroy(&state_cond_); + pthread_mutex_destroy(&state_mutex_); } // Changes State; may be called from either the worker thread @@ -359,14 +377,13 @@ class WorkersPool { void Execute(const std::vector& tasks) { assert(tasks.size() >= 1); // One of the tasks will be run on the current thread. 
- int workers_count = tasks.size() - 1; + std::size_t workers_count = tasks.size() - 1; CreateWorkers(workers_count); assert(workers_count <= workers_.size()); counter_to_decrement_when_ready_.Reset(workers_count); int n = 0; - std::for_each(tasks.begin(), --tasks.end(), [this, &n](Task *task) { - workers_[n++]->StartWork(task); - }); + std::for_each(tasks.begin(), --tasks.end(), + [this, &n](Task* task) { workers_[n++]->StartWork(task); }); // Execute the remaining workload immediately on the current thread. Task* task = tasks.back(); task->local_allocator = &main_thread_task_allocator_; @@ -375,9 +392,7 @@ class WorkersPool { counter_to_decrement_when_ready_.Wait(); // Cleanup tasks (best to do this from the same thread that allocated // the memory). - std::for_each(tasks.begin(), tasks.end(), [](Task *task) { - delete task; - }); + std::for_each(tasks.begin(), tasks.end(), [](Task* task) { delete task; }); } private: @@ -421,18 +436,18 @@ class WorkersPool { template + typename OutputPipelineType, typename GemmContextType> struct GemmWithPackedRhsTask : Task { typedef PackedSideBlock PackedLhs; typedef PackedSideBlock PackedRhs; - GemmWithPackedRhsTask(GemmContextType* _context, - const KernelBase& _kernel, + GemmWithPackedRhsTask(GemmContextType* _context, const KernelBase& _kernel, const MatrixMap& _lhs, const PackedRhs& _packed_rhs, MatrixMap* _result, const MatrixBlockBounds& _result_block, const LhsOffset& _lhs_offset, const RhsOffset& _rhs_offset, + const BlockParams& _block_params, const OutputPipelineType& _output_pipeline) : context(_context), kernel(_kernel), @@ -442,6 +457,7 @@ struct GemmWithPackedRhsTask : Task { result_block(_result_block), lhs_offset(_lhs_offset), rhs_offset(_rhs_offset), + block_params(_block_params), output_pipeline(_output_pipeline) {} void Run() override { @@ -451,12 +467,6 @@ struct GemmWithPackedRhsTask : Task { const int cols = result_block.cols; const int depth = lhs.cols(); - BlockParams block_params; - block_params.Init(rows, cols, depth, 1, - context->l1_bytes_to_use(), - context->l2_bytes_to_use(), - context->l2_rhs_factor()); - PackedLhs packed_lhs(Side::Lhs, local_allocator, block_params); PackedResult packed_result(local_allocator, block_params); @@ -495,6 +505,7 @@ struct GemmWithPackedRhsTask : Task { const MatrixBlockBounds result_block; const LhsOffset& lhs_offset; const RhsOffset& rhs_offset; + const BlockParams& block_params; const OutputPipelineType& output_pipeline; }; @@ -552,19 +563,7 @@ inline int HowManyThreads(int max_num_threads, int rows, int cols, int depth) { } // Determine the maximum number of threads. - int max_count = max_num_threads; - // The special value 0 means try to determine the total number of cores. - if (max_count == 0) { - // No user-set maximum number of threads, so we need to - // do some hardware detection. - // This is expensive to query so we do it only once. - // Too bad for dynamicness. Also, we dont use the c++11 standard getter - // because Google's coding style currently bans #include . - static const int hardware_threads_count = - static_cast(sysconf(_SC_NPROCESSORS_CONF)); - - max_count = hardware_threads_count; - } + int max_count = GetHardwareConcurrency(max_num_threads); // Basic calculation: take into account max pool size, and // how many rows we have to feed our kernel. 
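HowManyThreads now delegates hardware detection to GetHardwareConcurrency() from the new internal/platform.h instead of calling sysconf() inline, which is what makes the thread-count logic portable to Windows. A minimal illustration of the contract, grounded in the platform.h definitions added below (the printf calls are just for the demo):

#include <cstdio>

#include "internal/platform.h"

int main() {
  // 0 means "detect and use the number of hardware threads"; any non-zero
  // user-provided maximum is passed through unchanged.
  std::printf("auto-detected threads: %d\n", gemmlowp::GetHardwareConcurrency(0));
  std::printf("capped at 2 threads:   %d\n", gemmlowp::GetHardwareConcurrency(2));
  return 0;
}
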
@@ -654,10 +653,9 @@ void MultiThreadGemm(GemmContextType* context, const KernelBase& kernel, auto* workers_pool = context->workers_pool(); BlockParams block_params; - block_params.Init(rows, cols, depth, task_count, - context->l1_bytes_to_use(), - context->l2_bytes_to_use(), - context->l2_rhs_factor()); + block_params.Init( + rows, cols, depth, task_count, context->l1_bytes_to_use(), + context->l2_bytes_to_use(), context->l2_rhs_factor()); PackedSideBlock packed_rhs(Side::Rhs, allocator, block_params); @@ -675,19 +673,20 @@ void MultiThreadGemm(GemmContextType* context, const KernelBase& kernel, int next_start_row = 0; for (int n = 0; n < task_count; ++n) { int start_row = next_start_row; - next_start_row = std::min(rows, RoundUp( - rows * (n + 1) / task_count)); + next_start_row = std::min( + rows, RoundUp(rows * (n + 1) / task_count)); int block_rows = next_start_row - start_row; auto lhs_block = lhs.block(start_row, 0, block_rows, depth); - typedef GemmWithPackedRhsTask< - KernelFormat, InputScalar, OutputScalar, BitDepthParams, LhsOrder, - RhsOrder, ResultOrder, LhsOffset, RhsOffset, OutputPipelineType, - GemmContextType> + typedef GemmWithPackedRhsTask TaskType; - tasks.push_back(new TaskType(context, kernel, lhs_block, packed_rhs, result, - MatrixBlockBounds(start_row, c, block_rows, cs), - lhs_offset, rhs_offset, output_pipeline)); + tasks.push_back( + new TaskType(context, kernel, lhs_block, packed_rhs, result, + MatrixBlockBounds(start_row, c, block_rows, cs), + lhs_offset, rhs_offset, block_params, output_pipeline)); } // Execute the work on the workers (and partially on this thread). workers_pool->Execute(tasks); diff --git a/internal/platform.h b/internal/platform.h new file mode 100755 index 0000000..49e41a9 --- /dev/null +++ b/internal/platform.h @@ -0,0 +1,112 @@ +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// internal/platform.h: a place to put platform specific code + +#ifndef GEMMLOWP_INTERNAL_PLATFORM_H_ +#define GEMMLOWP_INTERNAL_PLATFORM_H_ + + +#ifdef _WIN32 +#include +#else +#include +#include +#include +#endif +#include + +#if defined ANDROID || defined __ANDROID__ +#include +// The 18 here should be 16, but has to be 18 for now due +// to a Google-internal issue. 
+#if __ANDROID_API__ < 18 +#define GEMMLOWP_USE_MEMALIGN +#endif +// posix_memalign is missing on some 4.1 x86 devices +#if __ANDROID_API__ == 18 +#ifdef GEMMLOWP_X86_32 +#define GEMMLOWP_USE_MEMALIGN +#endif +#endif +#endif + + +namespace gemmlowp { + +#ifdef _WIN32 +inline void *aligned_alloc(size_t alignment, size_t size) { + return _aligned_malloc(size, alignment); +} + +inline void aligned_free(void *memptr) { + _aligned_free(memptr); +} + +inline int GetHardwareConcurrency(int max_threads) { + if (max_threads == 0) { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + return sysinfo.dwNumberOfProcessors; + } + return max_threads; +} + +inline double real_time_in_seconds() { + __int64 wintime; GetSystemTimeAsFileTime((FILETIME*)&wintime); + wintime -= 116444736000000000i64; //1jan1601 to 1jan1970 + return wintime / 10000000i64 + wintime % 10000000i64 * 100 * 1e-9; +} + +#else +inline void *aligned_alloc(size_t alignment, size_t size) { +#ifdef GEMMLOWP_USE_MEMALIGN + return memalign(alignment, size); +#else + void *memptr; + if (posix_memalign(&memptr, alignment, size)) { + memptr = nullptr; + } + return memptr; +#endif +} + +inline int GetHardwareConcurrency(int max_threads) { + if (max_threads == 0) { + static const int hardware_threads_count = + static_cast(sysconf(_SC_NPROCESSORS_CONF)); + return hardware_threads_count; + } + return max_threads; +} + +inline void aligned_free(void *memptr) { + free(memptr); +} + +inline double real_time_in_seconds() { +#ifdef __APPLE__ + timeval t; + gettimeofday(&t, nullptr); + return t.tv_sec + 1e-6 * t.tv_usec; +#else + timespec t; + clock_gettime(CLOCK_REALTIME, &t); + return t.tv_sec + 1e-9 * t.tv_nsec; +#endif +} + +#endif +} // namespace gemmlowp +#endif // GEMMLOWP_INTERNAL_PLATFORM_H_ diff --git a/meta/multi_thread_common.h b/meta/multi_thread_common.h index 0b35759..dc1b799 100644 --- a/meta/multi_thread_common.h +++ b/meta/multi_thread_common.h @@ -20,15 +20,6 @@ namespace gemmlowp { namespace meta { -inline int ResolveMaxThreads(int max_threads) { - if (max_threads == 0) { - static const int hardware_threads_count = - static_cast(sysconf(_SC_NPROCESSORS_CONF)); - return hardware_threads_count; - } - return max_threads; -} - template class SimpleContext { public: diff --git a/meta/multi_thread_gemm.h b/meta/multi_thread_gemm.h index bc569e8..f69f810 100644 --- a/meta/multi_thread_gemm.h +++ b/meta/multi_thread_gemm.h @@ -132,9 +132,9 @@ inline void MultiThreadGemm(MultiThreadingContext* context, auto workers_pool = context->workers_pool(); std::vector tasks; - std::for_each(task_params.begin(), task_params.end(), [tasks](Params* param) { - tasks.push_back(new TaskRunnerType(param)); - }); + for (auto& task_param : task_params) { + tasks.push_back(new TaskRunnerType(task_param)); + }; workers_pool->Execute(tasks); } diff --git a/meta/multi_thread_gemv.h b/meta/multi_thread_gemv.h deleted file mode 100644 index 2e25ea8..0000000 --- a/meta/multi_thread_gemv.h +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright 2015 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
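The new platform.h wraps the primitives that previously assumed POSIX: aligned allocation (posix_memalign, memalign on old Android, _aligned_malloc on Windows), hardware-thread detection, and a wall-clock timer. A short, illustrative usage sketch of those wrappers as declared above:

#include <cstdio>

#include "internal/platform.h"

int main() {
  const double t0 = gemmlowp::real_time_in_seconds();
  // 64-byte alignment, 1 KiB buffer; must be released with aligned_free(),
  // since the underlying allocator differs per platform.
  void* buffer = gemmlowp::aligned_alloc(64, 1024);
  const double t1 = gemmlowp::real_time_in_seconds();
  std::printf("allocated %p in %.9f s\n", buffer, t1 - t0);
  gemmlowp::aligned_free(buffer);
  return 0;
}
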
-// See the License for the specific language governing permissions and -// limitations under the License. - -// multi_thread_gemv.h: Entry point to the multithreaded version of the -// generated (meta) gemv library. - -#ifndef GEMMLOWP_META_MULTI_THREAD_GEMV_H_ -#define GEMMLOWP_META_MULTI_THREAD_GEMV_H_ - -#ifdef GEMMLOWP_NEON_32 - -#include "multi_thread_common.h" -#include "operations_common.h" -#include "single_thread_gemm.h" - -namespace gemmlowp { -namespace meta { -namespace internal { - -class GemvQuantized8BitOperation : public Quantized8BitOperation { - public: - GemvQuantized8BitOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, - std::int32_t sum_offset, std::int32_t multiplier, - std::int32_t shift) - : Quantized8BitOperation(lhs_offset, rhs_offset, sum_offset, multiplier, - shift) {} - - void ExecuteMatrixMatrix(std::uint8_t* scratch, const std::uint8_t* lhs, - const std::uint8_t* rhs, std::int32_t m, - std::int32_t n, std::int32_t k, std::uint8_t* result, - std::int32_t result_stride) const { - gemv_q8(scratch, lhs, rhs, n, k, lhs_offset, rhs_offset, sum_offset, - multiplier, shift, result); - } - - static std::int32_t ScratchPerThread(std::int32_t m, std::int32_t n, - std::int32_t k) { - return 128 * 1024; - } -}; - -class GemvFloatOperation : public FloatOperation { - public: - GemvFloatOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, - float result_offset) - : FloatOperation(lhs_offset, rhs_offset, result_offset) {} - - void ExecuteMatrixMatrix(std::uint8_t* scratch, const std::uint8_t* lhs, - const std::uint8_t* rhs, std::int32_t m, - std::int32_t n, std::int32_t k, float* result, - std::int32_t result_stride) const { - gemv_f(scratch, lhs, rhs, n, k, lhs_offset, rhs_offset, result_offset, - result); - } - - static std::int32_t ScratchPerThread(std::int32_t m, std::int32_t n, - std::int32_t k) { - return 128 * 1024; - } -}; - -class GemvInt32Operation : public Int32Operation { - public: - GemvInt32Operation(std::int32_t lhs_offset, std::int32_t rhs_offset) - : Int32Operation(lhs_offset, rhs_offset) {} - - void ExecuteMatrixMatrix(std::uint8_t* scratch, const std::uint8_t* lhs, - const std::uint8_t* rhs, std::int32_t m, - std::int32_t n, std::int32_t k, std::int32_t* result, - std::int32_t result_stride) const { - gemv_i32(scratch, lhs, rhs, n, k, lhs_offset, rhs_offset, result); - } - - static std::int32_t ScratchPerThread(std::int32_t m, std::int32_t n, - std::int32_t k) { - return 128 * 1024; - } -}; - -} // namespace internal - -std::int32_t gemv_q8_scratch(std::int32_t m, std::int32_t n, std::int32_t k, - std::int32_t max_threads) { - return internal::ResolveMaxThreads(max_threads) * - internal::GemvQuantized8BitOperation::ScratchPerThread(m, n, k); -} - -void multi_thread_gemv_q8(gemmlowp::WorkersPool* pool, std::int32_t max_threads, - std::uint8_t* scratch, const std::uint8_t* lhs, - const std::uint8_t* rhs, std::int32_t n, - std::int32_t k, std::int32_t lhs_offset, - std::int32_t rhs_offset, std::int32_t sum_offset, - std::int32_t multiplier, std::int32_t shift, - std::uint8_t* result) { - max_threads = internal::ResolveMaxThreads(max_threads); - internal::GemvQuantized8BitOperation operation(lhs_offset, rhs_offset, - sum_offset, multiplier, shift); - if (max_threads == 1) { - operation.ExecuteMatrixMatrix(scratch, lhs, rhs, 1, n, k, result, n); - } else { - internal::MultiThreadedMatrixMatrix(pool, max_threads, scratch, lhs, rhs, 1, - n, k, result, n, operation); - } -} - -std::int32_t gemv_f_scratch(std::int32_t m, std::int32_t n, 
std::int32_t k, - std::int32_t max_threads) { - return internal::ResolveMaxThreads(max_threads) * - internal::GemvFloatOperation::ScratchPerThread(m, n, k); -} - -void multi_thread_gemv_f(gemmlowp::WorkersPool* pool, std::int32_t max_threads, - std::uint8_t* scratch, const std::uint8_t* lhs, - const std::uint8_t* rhs, std::int32_t n, - std::int32_t k, std::int32_t lhs_offset, - std::int32_t rhs_offset, float result_offset, - float* result) { - max_threads = internal::ResolveMaxThreads(max_threads); - internal::GemvFloatOperation operation(lhs_offset, rhs_offset, result_offset); - if (max_threads == 1) { - operation.ExecuteMatrixMatrix(scratch, lhs, rhs, 1, n, k, result, n); - } else { - internal::MultiThreadedMatrixMatrix(pool, max_threads, scratch, lhs, rhs, 1, - n, k, result, n, operation); - } -} - -std::int32_t gemv_i32_scratch(std::int32_t m, std::int32_t n, std::int32_t k, - std::int32_t max_threads) { - return internal::ResolveMaxThreads(max_threads) * - internal::GemvInt32Operation::ScratchPerThread(m, n, k); -} - -void multi_thread_gemv_i32(gemmlowp::WorkersPool* pool, - std::int32_t max_threads, std::uint8_t* scratch, - const std::uint8_t* lhs, const std::uint8_t* rhs, - std::int32_t n, std::int32_t k, - std::int32_t lhs_offset, std::int32_t rhs_offset, - std::int32_t* result) { - max_threads = internal::ResolveMaxThreads(max_threads); - internal::GemvInt32Operation operation(lhs_offset, rhs_offset); - if (max_threads == 1) { - operation.ExecuteMatrixMatrix(scratch, lhs, rhs, 1, n, k, result, n); - } else { - internal::MultiThreadedMatrixMatrix(pool, max_threads, scratch, lhs, rhs, 1, - n, k, result, n, operation); - } -} - -} // namespace meta -} // namespace gemmlowp - -#else -#warning "Meta gemm fast-path requires GEMMLOWP_NEON_32!" -#endif - -#endif // GEMMLOWP_META_MULTI_THREAD_GEMV_H_ diff --git a/meta/multi_thread_transform.h b/meta/multi_thread_transform.h index d21aec1..3bd4b3e 100644 --- a/meta/multi_thread_transform.h +++ b/meta/multi_thread_transform.h @@ -86,9 +86,9 @@ inline void MultiThreadTransform1D(MultiThreadingContext* context, auto workers_pool = context->workers_pool(); std::vector tasks; - std::for_each(task_params.begin(), task_params.end(), [tasks](Params* param) { - tasks.push_back(new TaskRunnerType(param)); - }); + for (auto& task_param : task_params) { + tasks.push_back(new TaskRunnerType(task_param)); + } workers_pool->Execute(tasks); } diff --git a/meta/operations_common.h b/meta/operations_common.h deleted file mode 100644 index d80375a..0000000 --- a/meta/operations_common.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2015 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef GEMMLOWP_META_OPERATIONS_COMMON_H_ -#define GEMMLOWP_META_OPERATIONS_COMMON_H_ - -class Quantized8BitOperation { - public: - Quantized8BitOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, - std::int32_t sum_offset, std::int32_t multiplier, - std::int32_t shift) - : lhs_offset(lhs_offset), - rhs_offset(rhs_offset), - sum_offset(sum_offset), - multiplier(multiplier), - shift(shift) {} - - protected: - std::int32_t lhs_offset; - std::int32_t rhs_offset; - std::int32_t sum_offset; - std::int32_t multiplier; - std::int32_t shift; -}; - -class FloatOperation { - public: - FloatOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, - float result_offset) - : lhs_offset(lhs_offset), - rhs_offset(rhs_offset), - result_offset(result_offset) {} - - protected: - std::int32_t lhs_offset; - std::int32_t rhs_offset; - float result_offset; -}; - -class Int32Operation { - public: - Int32Operation(std::int32_t lhs_offset, std::int32_t rhs_offset) - : lhs_offset(lhs_offset), rhs_offset(rhs_offset) {} - - protected: - std::int32_t lhs_offset; - std::int32_t rhs_offset; -}; - -#endif // GEMMLOWP_META_OPERATIONS_COMMON_H_ diff --git a/profiling/instrumentation.h b/profiling/instrumentation.h index 51b6525..539076a 100644 --- a/profiling/instrumentation.h +++ b/profiling/instrumentation.h @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -52,15 +52,6 @@ using ::uintptr_t; #include #endif -// We should always use C++11 thread_local; unfortunately that -// isn't fully supported on Apple yet. -#ifdef __APPLE__ -#define GEMMLOWP_THREAD_LOCAL static __thread -#define GEMMLOWP_USING_OLD_THREAD_LOCAL -#else -#define GEMMLOWP_THREAD_LOCAL thread_local -#endif - namespace gemmlowp { inline void ReleaseBuildAssertion(bool condition, const char* msg) { @@ -70,41 +61,42 @@ inline void ReleaseBuildAssertion(bool condition, const char* msg) { } } -// To be used as template parameter for GlobalLock. -// GlobalLock is the profiler global lock: -// registering threads, starting profiling, finishing profiling, and -// the profiler itself as it samples threads, all need to lock it. -struct ProfilerLockId; - -// A very plain global lock. Templated in LockId so we can have multiple -// locks, one for each LockId type. 
-template -class GlobalLock { - static pthread_mutex_t* Mutex() { - static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; +class Mutex { + public: + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; + + Mutex() { pthread_mutex_init(&m, NULL); } + ~Mutex() { pthread_mutex_destroy(&m); } + + void Lock() { pthread_mutex_lock(&m); } + void Unlock() { pthread_mutex_unlock(&m); } + + private: + pthread_mutex_t m; +}; + +class GlobalMutexes { + public: + static Mutex* Profiler() { + static Mutex m; return &m; } - public: - static void Lock() { pthread_mutex_lock(Mutex()); } - static void Unlock() { pthread_mutex_unlock(Mutex()); } + static Mutex* EightBitIntGemm() { + static Mutex m; + return &m; + } }; -// A very simple RAII helper to lock and unlock a GlobalLock -template -struct AutoGlobalLock { - AutoGlobalLock() { GlobalLock::Lock(); } - ~AutoGlobalLock() { GlobalLock::Unlock(); } -}; +// A very simple RAII helper to lock and unlock a Mutex +struct ScopedLock { + ScopedLock(Mutex* m) : _m(m) { _m->Lock(); } + ~ScopedLock() { _m->Unlock(); } -// MemoryBarrier is purely a compile-time thing; it tells two things -// to the compiler: -// 1) It prevents reordering code across it -// (thanks to the 'volatile' after 'asm') -// 2) It requires the compiler to assume that any value previously -// read from memory, may have changed. Thus it offers an alternative -// to using 'volatile' variables. -inline void MemoryBarrier() { asm volatile("" ::: "memory"); } + private: + Mutex* _m; +}; // Profiling definitions. Two paths: when profiling is enabled, // and when profiling is disabled. @@ -115,34 +107,31 @@ inline void MemoryBarrier() { asm volatile("" ::: "memory"); } // contains pointers to literal strings that were manually entered // in the instrumented code (see ScopedProfilingLabel). struct ProfilingStack { - static const std::size_t kMaxSize = 15; + static const std::size_t kMaxSize = 14; typedef const char* LabelsArrayType[kMaxSize]; LabelsArrayType labels; std::size_t size; + Mutex* lock; ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); } void Push(const char* label) { - MemoryBarrier(); + ScopedLock sl(lock); ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow"); labels[size] = label; - MemoryBarrier(); size++; - MemoryBarrier(); } void Pop() { - MemoryBarrier(); + ScopedLock sl(lock); ReleaseBuildAssertion(size > 0, "ProfilingStack underflow"); size--; - MemoryBarrier(); } void UpdateTop(const char* new_label) { - MemoryBarrier(); + ScopedLock sl(lock); assert(size); labels[size - 1] = new_label; - MemoryBarrier(); } ProfilingStack& operator=(const ProfilingStack& other) { @@ -174,29 +163,35 @@ struct ThreadInfo { ThreadInfo() { pthread_key_create(&key, ThreadExitCallback); pthread_setspecific(key, this); + stack.lock = new Mutex(); } static void ThreadExitCallback(void* ptr) { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::Profiler()); ThreadInfo* self = static_cast(ptr); ThreadsUnderProfiling().erase(self); pthread_key_delete(self->key); + delete self->stack.lock; } }; inline ThreadInfo& ThreadLocalThreadInfo() { -#ifdef GEMMLOWP_USING_OLD_THREAD_LOCAL - // We're leaking this ThreadInfo structure, because Apple doesn't support - // non-trivial constructors or destructors for their __thread type modifier. 
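The templated GlobalLock/AutoGlobalLock pair is replaced by a plain Mutex wrapper, per-purpose global mutexes, and an RAII ScopedLock, which also removes the dependence on PTHREAD_MUTEX_INITIALIZER (unavailable once the pthread types become pointers under the polyfill introduced later in this patch). Typical use, as it appears throughout the rewritten profiling code:

#include "profiling/instrumentation.h"

// Code touching profiler-global state now takes the profiler mutex via RAII;
// the mutex is released when 'lock' goes out of scope.
void TouchProfilerState() {
  gemmlowp::ScopedLock lock(gemmlowp::GlobalMutexes::Profiler());
  // ... read or modify the set of threads under profiling, etc. ...
}
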
- GEMMLOWP_THREAD_LOCAL ThreadInfo* i = nullptr; - if (i == nullptr) { - i = new ThreadInfo(); + static pthread_key_t key; + static auto DeleteThreadInfo = [](void* threadInfoPtr) { + ThreadInfo* threadInfo = static_cast(threadInfoPtr); + if (threadInfo) { + delete threadInfo; + } + }; + + static int key_result = pthread_key_create(&key, DeleteThreadInfo); + + ThreadInfo* threadInfo = static_cast(pthread_getspecific(key)); + if (!threadInfo) { + threadInfo = new ThreadInfo(); + pthread_setspecific(key, threadInfo); } - return *i; -#else - GEMMLOWP_THREAD_LOCAL ThreadInfo i; - return i; -#endif + return *threadInfo; } // ScopedProfilingLabel is how one instruments code for profiling @@ -221,7 +216,7 @@ class ScopedProfilingLabel { // To be called once on each thread to be profiled. inline void RegisterCurrentThreadForProfiling() { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::Profiler()); ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo()); } diff --git a/profiling/profiler.h b/profiling/profiler.h index a18c036..018da57 100644 --- a/profiling/profiler.h +++ b/profiling/profiler.h @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -306,12 +306,12 @@ inline pthread_t& ProfilerThread() { // In the end, the key atomicity property that we are relying on // here is that pointers are changed atomically, and the labels // are pointers (to literal strings). -inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) { +inline void RecordStack(ThreadInfo* thread, ProfilingStack* dst) { + ScopedLock sl(thread->stack.lock); assert(!dst->size); while (dst->size < thread->stack.size) { dst->labels[dst->size] = thread->stack.labels[dst->size]; dst->size++; - MemoryBarrier(); // thread->stack can change at any time } } @@ -330,7 +330,7 @@ inline void* ProfilerThreadFunc(void*) { while (!ProfilerThreadShouldFinish()) { WaitOneProfilerTick(); { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::Profiler()); for (auto t : ThreadsUnderProfiling()) { ProfilingStack s; RecordStack(t, &s); @@ -347,7 +347,7 @@ inline void* ProfilerThreadFunc(void*) { // Starts recording samples. inline void StartProfiling() { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::Profiler()); ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!"); IsProfiling() = true; ProfilerThreadShouldFinish() = false; @@ -357,7 +357,7 @@ inline void StartProfiling() { // Stops recording samples, and prints a profile tree-view on stdout. inline void FinishProfiling() { { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::Profiler()); ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!"); // The ProfilerThreadShouldFinish() mechanism here is really naive and bad, // as the scary comments below should make clear. diff --git a/profiling/pthread_everywhere.h b/profiling/pthread_everywhere.h new file mode 100644 index 0000000..7e12d66 --- /dev/null +++ b/profiling/pthread_everywhere.h @@ -0,0 +1,88 @@ +// Copyright 2017 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
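With the synchronization now handled by per-thread mutexes rather than compiler barriers, the profiler's public entry points are unchanged. For reference, the intended usage pattern (these are gemmlowp's existing profiler APIs; building with GEMMLOWP_PROFILING defined is assumed):

#define GEMMLOWP_PROFILING  // enable the sampling profiler before any include
#include "profiling/profiler.h"

static void ProfiledWork() {
  gemmlowp::ScopedProfilingLabel label("ProfiledWork");
  // ... the code whose time should be attributed to this label ...
}

int main() {
  gemmlowp::RegisterCurrentThreadForProfiling();
  gemmlowp::StartProfiling();
  ProfiledWork();
  gemmlowp::FinishProfiling();  // prints the tree-view profile on stdout
  return 0;
}
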
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// pthread_everywhere.h: Either includes or implements a +// subset of pthread functionality on top of C++11 for portability. + +#ifndef GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ +#define GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ + +#include "pthread_everywhere.h" + +#ifndef _WIN32 +#define GEMMLOWP_USE_PTHREAD +#endif + +#if defined GEMMLOWP_USE_PTHREAD +#include +#else +// Implement a small subset of pthread on top of C++11 threads. +// The function signatures differ from true pthread functions in two ways: +// - True pthread functions return int error codes, ours return void. +// Rationale: the c++11 equivalent functions return void +// and use exceptions to report errors; we don't want to deal with +// exceptions in this code, so we couldn't meaningfully return errors +// in the polyfill. Also, the gemmlowp code using these pthread functions +// never checks their return values anyway. +// - True pthread *_create/*_init functions take pointers to 'attribute' +// structs; ours take nullptr_t. That is because gemmlowp always passes +// nullptr at the moment, so any support we would code for non-null +// attribs would be unused. +#include +#include +#include +#include +namespace gemmlowp { +using pthread_t = std::thread*; +using pthread_mutex_t = std::mutex*; +using pthread_cond_t = std::condition_variable*; +inline void pthread_create(pthread_t* thread, std::nullptr_t, + void *(*start_routine) (void *), void *arg) { + *thread = new std::thread(start_routine, arg); +} +inline void pthread_join(pthread_t thread, std::nullptr_t) { + thread->join(); +} +inline void pthread_mutex_init(pthread_mutex_t *mutex, std::nullptr_t) { + *mutex = new std::mutex; +} +inline void pthread_mutex_lock(pthread_mutex_t* mutex) { + (*mutex)->lock(); +} +inline void pthread_mutex_unlock(pthread_mutex_t* mutex) { + (*mutex)->unlock(); +} +inline void pthread_mutex_destroy(pthread_mutex_t *mutex) { + delete *mutex; +} +inline void pthread_cond_init(pthread_cond_t *cond, std::nullptr_t) { + *cond = new std::condition_variable; +} +inline void pthread_cond_signal(pthread_cond_t* cond) { + (*cond)->notify_one(); +} +inline void pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { + std::unique_lock lock(**mutex, std::adopt_lock); + (*cond)->wait(lock); + // detach lock from mutex so when we leave this conext + // the lock is not released + lock.release(); +} +inline void pthread_cond_destroy(pthread_cond_t *cond) { + delete *cond; +} +} // end namespace gemmlowp +#endif + +#endif // GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ \ No newline at end of file diff --git a/test/benchmark.cc b/test/benchmark.cc index 20dd369..9a87a41 100644 --- a/test/benchmark.cc +++ b/test/benchmark.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
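pthread_everywhere.h lets the rest of gemmlowp keep calling a small pthread-style API on every platform; on Windows the pthread types become pointers to C++11 objects, which is why attribute arguments must be nullptr and return codes are no longer checked. A sketch of code written once against that subset (it resolves to real pthreads on POSIX and to the C++11-backed polyfill elsewhere):

#include "profiling/pthread_everywhere.h"

namespace gemmlowp {

inline void* SayHello(void*) { return nullptr; }

// Unqualified pthread_* names resolve to the system pthreads when
// GEMMLOWP_USE_PTHREAD is set, and to the polyfill in this header otherwise.
inline void SpawnAndJoin() {
  pthread_t thread;
  pthread_create(&thread, nullptr, SayHello, nullptr);
  pthread_join(thread, nullptr);
}

}  // namespace gemmlowp
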
-#include #ifdef __APPLE__ #include #endif @@ -44,18 +43,6 @@ namespace gemmlowp { -double time() { -#ifdef __APPLE__ - timeval t; - gettimeofday(&t, nullptr); - return t.tv_sec + 1e-6 * t.tv_usec; -#else - timespec t; - clock_gettime(CLOCK_REALTIME, &t); - return t.tv_sec + 1e-9 * t.tv_nsec; -#endif -} - const double min_accurate_duration = 1e-1; const std::size_t min_working_set_size = 16 * 1024 * 1024; @@ -111,10 +98,10 @@ double time_for_gemms(GemmContext* context, const std::vector& gemms) { std::size_t pool_index = 0; while (true) { - double starttime = time(); + double starttime = real_time_in_seconds(); for (int i = 0; i < iters_at_a_time; i++) { for (size_t j = 0; j < gemms.size(); j++) { - int k = pool_index * gemms.size() + j; + size_t k = pool_index * gemms.size() + j; Gemm( context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(), -75, -91, 74980, 123, 20); @@ -124,7 +111,7 @@ double time_for_gemms(GemmContext* context, const std::vector& gemms) { pool_index = 0; } } - double endtime = time(); + double endtime = real_time_in_seconds(); const float timing = static_cast(endtime - starttime); @@ -228,8 +215,8 @@ void benchmark_gemm_sizes(GemmContext* context, gemmlowp::StartProfiling(); #endif - double starttime = time(); - while (time() < starttime + mintime) { + double starttime = real_time_in_seconds(); + while (real_time_in_seconds() < starttime + mintime) { gemm_times.push_back( time_for_gemms(context, gemms)); } diff --git a/test/benchmark_all_sizes.cc b/test/benchmark_all_sizes.cc new file mode 100644 index 0000000..16cc57c --- /dev/null +++ b/test/benchmark_all_sizes.cc @@ -0,0 +1,357 @@ +// Example command line to build on Android ARM64: +/* +~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \ +test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \ +-DBENCHMARK_QUICK -DBENCHMARK_8bit +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../public/gemmlowp.h" + +#if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32 +// Compilation workaround +namespace std { + using ::round; +} +#endif + +// Minimum duration of each benchmark measurement. Also, duration +// of sleep time between each two consecutive benchmark measurements to +// prevent over-heating. +const double kBenchmarkSecs = 0.1; + +// Sleep time before each benchmark. +const int kCooldownBeforeBenchmarkSecs = 0; + +// Number of benchmark passes. +const int kPasses = 4; + +#ifdef BENCHMARK_NUM_THREADS +const int kNumThreads = BENCHMARK_NUM_THREADS; +#else +const int kNumThreads = 1; +#endif + +namespace gemmlowp { + +// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class, +// since it only maps existing data. In tests though, we need to +// create our own matrices. 
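Both benchmarks now share the same measurement idiom: repeat the operation, doubling the batch size each round, until the wall-clock time from real_time_in_seconds() exceeds a minimum duration, then report seconds per iteration. The helper below distills that pattern from benchmark_all_sizes.cc (TimePerOp and min_secs are names introduced here only for illustration; the benchmark itself hard-codes kBenchmarkSecs = 0.1):

#include <functional>

#include "internal/platform.h"

inline double TimePerOp(const std::function<void()>& op, double min_secs) {
  const double time_start = gemmlowp::real_time_in_seconds();
  double t = time_start;
  int iters = 0;
  int iters_at_a_time = 1;
  while (t - time_start < min_secs) {
    for (int i = 0; i < iters_at_a_time; i++) {
      op();
      iters++;
    }
    iters_at_a_time *= 2;  // amortizes timer overhead for very fast ops
    t = gemmlowp::real_time_in_seconds();
  }
  return (t - time_start) / iters;
}
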
+template +class Matrix : public MatrixMap { + public: + typedef MatrixMap Map; + typedef MatrixMap ConstMap; + typedef typename Map::Scalar Scalar; + static const MapOrder Order = tOrder; + using Map::cols_; + using Map::data_; + using Map::kOrder; + using Map::rows_; + using Map::stride_; + + public: + Matrix() : Map(nullptr, 0, 0, 0) {} + + Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); } + + Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; } + + Matrix& operator=(const Matrix& other) { + Resize(other.rows_, other.cols_); + std::memcpy(data_, other.data_, size() * sizeof(Scalar)); + return *this; + } + + friend bool operator==(const Matrix& a, const Matrix& b) { + return a.rows_ == b.rows_ && a.cols_ == b.cols_ && + !std::memcmp(a.data_, b.data_, a.size()); + } + + void Resize(int rows, int cols) { + rows_ = rows; + cols_ = cols; + stride_ = kOrder == MapOrder::ColMajor ? rows : cols; + storage.resize(size()); + data_ = storage.data(); + } + + int size() const { return rows_ * cols_; } + + Map& map() { return *static_cast(this); } + + ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); } + + protected: + std::vector storage; +}; + +template +void MakeZero(MatrixType* m) { + for (int c = 0; c < m->cols(); c++) { + for (int r = 0; r < m->rows(); r++) { + (*m)(r, c) = 128; + } + } +} + +} // end namespace gemmlowp + +template +float benchmark_8bit(int rows, int depth, int cols) { + using namespace gemmlowp; + typedef Matrix LhsType; + typedef Matrix RhsType; + typedef Matrix ResultType; + + LhsType lhs; + RhsType rhs; + ResultType result; + lhs.Resize(rows, depth); + rhs.Resize(depth, cols); + result.Resize(rows, cols); + MakeZero(&lhs); + MakeZero(&rhs); + MakeZero(&result); + + typedef std::tuple + Pipeline; + gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint + quantize_down_stage; + quantize_down_stage.result_offset_after_shift = 128; + quantize_down_stage.result_fixedpoint_multiplier = 1234567890; + quantize_down_stage.result_shift = 16; + gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; + const auto output_pipeline = + std::make_tuple(quantize_down_stage, saturating_cast_stage); + GemmContext gemm_context; + gemm_context.set_max_num_threads(kNumThreads); + gemmlowp::GemmWithOutputPipeline( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, output_pipeline); + + double time_start = real_time_in_seconds(); + double t = time_start; + int iters = 0; + int iters_at_a_time = 1; + while (t - time_start < kBenchmarkSecs) { + for (int i = 0; i < iters_at_a_time; i++) { + gemmlowp::GemmWithOutputPipeline( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, output_pipeline); + iters++; + } + iters_at_a_time *= 2; + t = real_time_in_seconds(); + } + return (t - time_start) / iters; +} + +template +float benchmark_8bit_to_32bit(int rows, int depth, int cols) { + using namespace gemmlowp; + typedef Matrix LhsType; + typedef Matrix RhsType; + typedef Matrix ResultType; + + LhsType lhs; + RhsType rhs; + ResultType result; + lhs.Resize(rows, depth); + rhs.Resize(depth, cols); + result.Resize(rows, cols); + MakeZero(&lhs); + MakeZero(&rhs); + MakeZero(&result); + + typedef std::tuple<> EmptyPipeline; + GemmContext gemm_context; + gemm_context.set_max_num_threads(kNumThreads); + gemmlowp::GemmWithOutputPipeline( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, EmptyPipeline()); + + double time_start = 
real_time_in_seconds(); + double t = time_start; + int iters = 0; + int iters_at_a_time = 1; + while (t - time_start < kBenchmarkSecs) { + for (int i = 0; i < iters_at_a_time; i++) { + gemmlowp::GemmWithOutputPipeline( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, EmptyPipeline()); + iters++; + } + iters_at_a_time *= 2; + t = real_time_in_seconds(); + } + return (t - time_start) / iters; +} + +struct Shape { + int rows; + int depth; + int cols; +}; + +bool operator==(const Shape& s1, const Shape& s2) { + return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols; +} + +bool operator<(const Shape& shape1, const Shape& shape2) { + return shape1.depth < shape2.depth || + (shape1.depth == shape2.depth && + (shape1.rows < shape2.rows || + (shape1.rows == shape2.rows && shape1.cols < shape2.cols))); +}; + +#ifdef _WIN32 +#define sleep(t) Sleep(t) +#endif + +float benchmark(const Shape& shape) { + if (kCooldownBeforeBenchmarkSecs) { + sleep(kCooldownBeforeBenchmarkSecs); + } +#if defined BENCHMARK_8bit + // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams. + // This is the recommended thing to default to: it's what most applications + // want to use, as it's the fastest. + // The contract is that LHS must take values in [1, 255], while RHS can take + // any value in [0, 255]. + return benchmark_8bit( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_8bit_wide + // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams. + // The only contract difference is that both LHS and RHS can take values in + // [0, 255]. + return benchmark_8bit( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_8bit_to_32bit + // Variant of BENCHMARK_8bit where the user asks for getting raw int32 + // accumulators, instead of a 8bit-downscaled result. + return benchmark_8bit_to_32bit( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_8bit_to_32bit_wide + // Variant of BENCHMARK_8bit_wide where the user asks for getting raw int32 + // accumulators, instead of a 8bit-downscaled result. + return benchmark_8bit_to_32bit( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_float + return benchmark_float(shape.rows, shape.depth, shape.cols); +#else +#error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit) +#endif +} + +std::set all_sizes() { + std::set sizes; + for (int i = 1; i <= 2048; i *= 2) { + sizes.insert(i); + } + for (double x = 8; x <= 2048; x *= std::sqrt(2.)) { + sizes.insert(static_cast(std::round(x))); + } + for (double x = 16; x <= 512; x *= std::pow(2., 1. 
/ 4.)) { + sizes.insert(static_cast(std::round(x))); + } + return sizes; +} + +std::mt19937& RandomEngine() { + static std::mt19937 engine; + return engine; +} + +std::vector all_shapes_in_random_order() { + std::vector shapes; + const std::set sizes = all_sizes(); +#if defined BENCHMARK_ROWS + // Benchmark one specific shape + Shape shape; + shape.rows = BENCHMARK_ROWS; + shape.depth = BENCHMARK_DEPTH; + shape.cols = BENCHMARK_COLS; + shapes.push_back(shape); +#elif defined BENCHMARK_QUICK + // Benchmark an assortment of cubic shapes + for (int size : sizes) { + Shape shape; + shape.rows = size; + shape.depth = size; + shape.cols = size; + shapes.push_back(shape); + } +#elif defined BENCHMARK_EXHAUSTIVE + // Benchmark all sorts of shapes + for (int rows : sizes) { + for (int depth : sizes) { + for (int cols : sizes) { + Shape shape; + shape.rows = rows; + shape.depth = depth; + shape.cols = cols; + shapes.push_back(shape); + } + } + } +#else +#error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK) +#endif + std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine()); + return shapes; +} + +void run_benchmarks(std::map* results) { + std::vector shapes; + for (int pass = 0; pass < kPasses; pass++) { + const std::vector pass_shapes = all_shapes_in_random_order(); + shapes.insert(std::end(shapes), std::begin(pass_shapes), + std::end(pass_shapes)); + } + + const double time_start = gemmlowp::real_time_in_seconds(); + for (std::size_t i = 0; i < shapes.size(); i++) { + const double ratio = static_cast(i) / shapes.size(); + const double elapsed = gemmlowp::real_time_in_seconds() - time_start; + const double elapsed_hours = elapsed / 3600.; + const double eta_hours = elapsed_hours * (1. - ratio) / ratio; + fprintf(stderr, + "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f " + "hours... \r", + 100. * ratio, elapsed_hours, eta_hours); + fflush(stderr); + const Shape& shape = shapes[i]; + float latency = benchmark(shape); + if (results->count(shape)) { + (*results)[shape] = std::min(latency, (*results)[shape]); + } else { + (*results)[shape] = latency; + } + } + fprintf(stderr, "\n"); +} + +int main() { + std::map results; + run_benchmarks(&results); + printf("Using %d thread(s)\n", kNumThreads); + printf("depth,rows,cols,latency(s),Gop/s\n"); + for (const auto& result : results) { + const Shape& shape = result.first; + printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols, + result.second, + 2e-9 * shape.depth * shape.rows * shape.cols / result.second); + } +} diff --git a/test/test.cc b/test/test.cc index fdc7bcc..eee16b4 100644 --- a/test/test.cc +++ b/test/test.cc @@ -14,7 +14,6 @@ #include "test.h" -#include #include #include #include diff --git a/test/test.h b/test/test.h index b6a540d..aecd0c1 100644 --- a/test/test.h +++ b/test/test.h @@ -102,6 +102,19 @@ int Random() { return dist(RandomEngine()); } +#ifdef _MSC_VER +// msvc does not support 8bit types in uniform_int_distribution<>. +// Take 32 bit uniform_int_distribution<> and only use the lower 8 bits. 
+template +void MakeRandom(MatrixType* m) { + ScopedProfilingLabel("MakeRandom(matrix)"); + for (int c = 0; c < m->cols(); c++) { + for (int r = 0; r < m->rows(); r++) { + (*m)(r, c) = Random() % OperandRange::kMaxValue; + } + } +} +#else template void MakeRandom(MatrixType* m) { ScopedProfilingLabel("MakeRandom(matrix)"); @@ -114,6 +127,7 @@ void MakeRandom(MatrixType* m) { } } } +#endif template void MakeConstant(MatrixType* m, typename MatrixType::Scalar val) { diff --git a/test/test_allocator.cc b/test/test_allocator.cc index 8a76709..3de50f0 100644 --- a/test/test_allocator.cc +++ b/test/test_allocator.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "../internal/allocator.h" #include "test.h" +#include "../internal/allocator.h" namespace gemmlowp { diff --git a/test/test_blocking_counter.cc b/test/test_blocking_counter.cc index 8260576..d1e0932 100644 --- a/test/test_blocking_counter.cc +++ b/test/test_blocking_counter.cc @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ // limitations under the License. #include "test.h" +#include "../profiling/pthread_everywhere.h" -#include #include #include "../internal/multi_thread_gemm.h" @@ -26,6 +26,7 @@ class Thread { Thread(BlockingCounter* blocking_counter, int number_of_times_to_decrement) : blocking_counter_(blocking_counter), number_of_times_to_decrement_(number_of_times_to_decrement), + finished_(false), made_the_last_decrement_(false) { pthread_create(&thread_, nullptr, ThreadFunc, this); } @@ -33,7 +34,9 @@ class Thread { ~Thread() { Join(); } bool Join() const { - pthread_join(thread_, nullptr); + if (!finished_) { + pthread_join(thread_, nullptr); + } return made_the_last_decrement_; } @@ -45,6 +48,7 @@ class Thread { Check(!made_the_last_decrement_); made_the_last_decrement_ = blocking_counter_->DecrementCount(); } + finished_ = true; } static void* ThreadFunc(void* ptr) { @@ -55,6 +59,7 @@ class Thread { BlockingCounter* const blocking_counter_; const int number_of_times_to_decrement_; pthread_t thread_; + bool finished_; bool made_the_last_decrement_; }; diff --git a/test/test_math_helpers.cc b/test/test_math_helpers.cc index 591bf44..e9d4b84 100644 --- a/test/test_math_helpers.cc +++ b/test/test_math_helpers.cc @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. -- cgit v1.2.3
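For reading the CSV that benchmark_all_sizes.cc prints: the last column follows the usual GEMM operation count, where a rows x depth x cols product performs 2 * rows * depth * cols multiply-adds, so throughput is 2e-9 * depth * rows * cols / latency in Gop/s. As an illustrative (not measured) example, a 512x512x512 GEMM timed at 2.7 ms would be reported as 2e-9 * 512^3 / 0.0027, roughly 99 Gop/s.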