From 3dd9fc332efa9e209c142431d9f8225ccf03fe57 Mon Sep 17 00:00:00 2001 From: Miao Wang Date: Wed, 6 Dec 2017 17:00:59 -0800 Subject: Add a header lib to include all gemmlowp headers Test: mm Change-Id: I5f493dc35b269a75c7a8473e5002a90df98e92a3 --- Android.bp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Android.bp b/Android.bp index f6d9324..3f1dab4 100644 --- a/Android.bp +++ b/Android.bp @@ -26,4 +26,11 @@ cc_library_headers { export_include_dirs: ["public"], } +cc_library_headers { + name: "gemmlowp_headers", + vendor_available: true, + host_supported: true, + export_include_dirs: ["."], +} + subdirs = ["eight_bit_int_gemm"] -- cgit v1.2.3 From 1963df9ac4a0424674e72ef5da522b5d830605fd Mon Sep 17 00:00:00 2001 From: Miao Wang Date: Tue, 12 Dec 2017 14:22:24 -0800 Subject: Rebase gemmlowp to 6a2a908 Bug: 70573221 Test: mm Test: mm and Pixel2 boot Test: NeuralNetworksTest pass Change-Id: I8fac98811e9a276d3ff8054167dc45225c04147e --- doc/design.txt | 158 -------------- doc/kernels.txt | 176 --------------- doc/less-than-8-bit.txt | 305 -------------------------- doc/low-precision.md | 2 +- doc/low-precision.txt | 159 -------------- doc/output.md | 4 +- doc/packing.txt | 206 ------------------ eight_bit_int_gemm/eight_bit_int_gemm.cc | 8 +- internal/allocator.h | 27 +-- internal/block_params.h | 25 ++- internal/common.h | 28 ++- internal/compute.h | 14 ++ internal/kernel.h | 2 +- internal/kernel_default.h | 4 + internal/multi_thread_gemm.h | 105 +++++---- internal/platform.h | 112 ++++++++++ meta/multi_thread_common.h | 9 - meta/multi_thread_gemm.h | 6 +- meta/multi_thread_gemv.h | 168 --------------- meta/multi_thread_transform.h | 6 +- meta/operations_common.h | 61 ------ profiling/instrumentation.h | 115 +++++----- profiling/profiler.h | 12 +- profiling/pthread_everywhere.h | 88 ++++++++ test/benchmark.cc | 23 +- test/benchmark_all_sizes.cc | 357 +++++++++++++++++++++++++++++++ test/test.cc | 1 - test/test.h | 14 ++ test/test_allocator.cc | 2 +- test/test_blocking_counter.cc | 11 +- test/test_math_helpers.cc | 2 +- 31 files changed, 772 insertions(+), 1438 deletions(-) delete mode 100644 doc/design.txt delete mode 100644 doc/kernels.txt delete mode 100644 doc/less-than-8-bit.txt delete mode 100644 doc/low-precision.txt delete mode 100644 doc/packing.txt create mode 100755 internal/platform.h delete mode 100644 meta/multi_thread_gemv.h delete mode 100644 meta/operations_common.h create mode 100644 profiling/pthread_everywhere.h create mode 100644 test/benchmark_all_sizes.cc diff --git a/doc/design.txt b/doc/design.txt deleted file mode 100644 index cb78dbb..0000000 --- a/doc/design.txt +++ /dev/null @@ -1,158 +0,0 @@ - Overview of gemmlowp design - *************************** - - -Primer on GEMM, kernels, and cache friendliness -=============================================== - -gemmlowp, like most GEMMs, implements the straightforward matrix multiplication -algorithm, which takes n^3 multiply-accumulate instructions for n*n sized -matrices. Because the arithmetic complexity grows quicker than the memory -complexity (n^3 vs. n^2), memory accesses are redundant (each matrix entry -is accessed n times). A large part of a GEMM's performance and design goes -toward minimizing the inefficiency resulting from these redundant memory -accesses. - -Ultimately, once values are loaded into CPU registers, they cost nothing to -access, so as long as we can work within registers, this problem doesn't exist. 
-Thus, in order to be efficient, a GEMM's inner loops must wisely use the -available registers to do as much arithmetic work as possible before loading -more data from memory into registers. This means that -a GEMM implementation needs to have architecture-specific inner loops tailored -for architecture details such as the number of registers, and typically written -in assembly. This 'inner loops' architecture-specific component is referred to -as the GEMM kernel. (More details about kernels are in doc/kernels.txt). - -However, only small blocks can fit at a given time in registers, so at larger -scales one needs to repeatedly load blocks of matrices from memory, and -these accesses are redundant for the reason outlined above. The way that -one minimizes the resulting inefficiency is by organizing for cache locality, -so that most of these accesses hit the L1 cache, and most of the remaining -ones hit the L2 cache, etc. - -This is achieved by subdividing the matrices into blocks sized to fit in L2 -cache, and subdividing these blocks into sub-blocks sizes to fit in L1 cache, -and performing the matrix multiplication one such block at a time. - -In practice, it tends to pay off to "pack" input blocks for optimally -efficient traversal by the kernel, since they will be traversed multiple times. -"packing" means at least reordering the data layout for 1) simple access -patterns that fit the CPU's cache behavior (in particular, the cache line size), -and 2) simple loading into SIMD vector registers by the kernel. - -So a typical GEMM, in pseudo-code, tends to look like this: - -allocate(some_lhs_L2_block); -allocate(some_rhs_L2_block); -for (some_lhs_L2_block) { - pack(some_lhs_L2_block); - for (some_rhs_L2_block) { - pack(some_rhs_L2_block); - for (some_lhs_sub_block in some_lhs_L2_block) { - for (some_rhs_sub_block in some_rhs_L2_block) { - kernel(some_lhs_sub_block, some_rhs_sub_block); - } - } - } -} - - -Impact of low-precision computation on gemmlowp design -====================================================== - -Refer to doc/low-precision.txt for specifics of the low-precision-computation -paradigm and how it's implemented in gemmlowp. - -Inputs and outputs are matrices of uint8 values, but internally we are -accumulating int32 values, only converting them back to uint8 at the end. This -means that we need so store a block of int32 accumulators at a time. We compute -a block of the result in int32 accumulators and then we "unpack" it into the -destination matrix at once. In this way, we minimize the amount of memory used to -store int32 values at a given time. - -Because of that, besides the "pack" and "kernel" stages outlined above, a third -stage is needed in gemmlowp, which we call "unpack". Thus we arrive at the -3-stage computation scheme that gemmlowp uses: - - 1. Pack lhs/rhs blocks from the input matrices. - 2. Compute the product of the packed blocks, using the kernel. - 3. Unpack the result block into the output matrix. 
- -The pseudo-code overview of gemmlowp now looks like: - -allocate(some_lhs_L2_block); -allocate(some_rhs_L2_block); -// new: temp storage for int32 accums -allocate(some_int32_accumulators_block); -for (some_lhs_L2_block) { - pack(some_lhs_L2_block); - for (some_rhs_L2_block) { - pack(some_rhs_L2_block); - for (some_lhs_sub_block in some_lhs_L2_block) { - for (some_rhs_sub_block in some_rhs_L2_block) { - // new: pass int32 accums to kernel - kernel(&some_int32_accumulators_block, - some_lhs_sub_block, - some_rhs_sub_block); - } - } - // new: unpack int32 accums into destination matrix - unpack(some_int32_accumulators_block); - } -} - - -Exploring gemmlowp code -======================= - -The design outlined above can be readily matched to gemmlowp source code, -in particular in this file, which gives a simple GEMM implementation fitting in -one rather small function: - - internal/single_thread_gemm.h - -The reader can compare the above pseudo-code to the actual code in this file: - - for (int r = 0; r < rows; r += block_params.l2_rows) { - int rs = std::min(block_params.l2_rows, rows - r); - - PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth)); - - for (int c = 0; c < cols; c += block_params.l2_cols) { - int cs = std::min(block_params.l2_cols, cols - c); - - if (!pack_rhs_once) { - PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); - } - - Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs); - - auto result_block = result->block(r, c, rs, cs); - UnpackResult(&result_block, packed_result, packed_lhs, packed_rhs, depth, - result_offset, result_mult_int, result_shift); - } - } - -The files in internal/ fall into a few categories: - -There are two top-level GEMM implementations, - single_thread_gemm.h - multi_thread_gemm.h - -They both call into pack/compute/unpack stages implemented in the following files: - pack.h - compute.h - unpack.h - -The pack.h and unpack.h files contain generic templated code that can be overridden -by optimized code in template specializations; see the NEON optimized code here: - pack_neon.h - unpack_neon.h - -The compute stage contains generic code in compute.h that only calls into -optimized code through the Kernel::Run() entry point. Each kernel is basically just -as struct offering a Run() implementation; see the NEON kernels in: - kernel_neon.h - -More details about the interplay between these components can be found in this file: - doc/kernels.txt diff --git a/doc/kernels.txt b/doc/kernels.txt deleted file mode 100644 index 43dcb40..0000000 --- a/doc/kernels.txt +++ /dev/null @@ -1,176 +0,0 @@ - Kernels in gemmlowp - ******************* - - -Kernels provide an inner-loop implementation, and a format -========================================================== - -Here we assume familiarity with the concepts of kernels and of packing -as explained in doc/design.txt. - -gemmlowp is designed to be easily extensible to different architectures and -other low-level details, while achieving high performance. Thus a line had to -be drawn between the generic GEMM code and the specific parts that need to -be manually designed for each architecture, etc. The design choice made in -gemmlowp is to have easily swappable GEMM kernels. - -In itself, a GEMM kernel is just an implementation of the inner-most loop -in a GEMM (That inner-most loop has to be over the 'depth' dimension so as -to be able to accumulate into a small enough number of accumulators to fit -in registers). - -Thus, by itself, a GEMM kernel should be just a function computing a block -of GEMM. 
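Concretely, gemmlowp expresses this as the Run() entry point that every kernel implements (a condensed sketch of the KernelBase interface in internal/kernel.h; only the part relevant here is shown, and exact parameter types may differ slightly):

    #include <cstddef>
    #include <cstdint>

    struct KernelBase {
      // Accumulates 'run_depth' levels of depth of the packed lhs and rhs
      // blocks into the block of int32 accumulators at dst_ptr.
      virtual void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
                       std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
                       const std::uint8_t* rhs_ptr, std::size_t start_depth,
                       std::size_t run_depth) const = 0;
      virtual ~KernelBase() {}
    };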
- -However, GEMM kernels may need to differ not just in how they implement this -computation, but also in the format of data that they operate on. Indeed, -in order to maximize the ratio of arithmetic instructions to memory access -instructions, GEMM kernels want to handle blocks as wide as possible given -the number of registers of the CPU architecture. - -Thus, in order to allow efficient specialization to diverse architectures, -gemmlowp allows each GEMM kernel to dictate the format of data that it expects, -in addition to providing its inner-loop implementation. - -The former is given by a 'Format' typedef, and the latter by a 'Run' -method. - -A good example is to look at internal/kernel_neon.h, and specifically at -the NEONKernel12x4Depth2 kernel, which specifies its format as - - typedef KernelFormat, 3>, - KernelSideFormat, 1> > Format; - -The meaning of these terms is explained in the lengthy comment at the -top of internal/kernel.h. Here, they mean that this kernel handles at -each iteration (along the depth dimension): - - 3 'cells' of size 4x2 each of the lhs, so a total lhs block - of size 12x2 - - 1 'cell' of size 2x4 of the rhs. -In other words, this kernel handles 12 rows of the lhs and 4 columns of the -rhs, and handles two levels of depth at once. The 'cells' and 'CellFormat' -details the layout of these 12x2 and 2x4 blocks. - -This kernel then loads these 12x2 and 2x4 blocks and computes the corresponding -12x4 GEMM; for ease of reference let us paste the critical comment and code here: - - "loop_NEONKernel12x4Depth2_%=:\n" - - // Overview of register layout: - // - // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0). - // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7 - // (q1--q3). - // A 12x4 block of accumulators is stored in 32bit in q4--q15. - // - // +-----+-----+-----+-----+ - // |d0[0]|d0[1]|d0[2]|d0[3]| - // Rhs +-----+-----+-----+-----+ - // |d1[0]|d1[1]|d1[2]|d1[3]| - // +-----+-----+-----+-----+ - // - // | | | | | - // - // Lhs | | | | | - // - // +--+--+ - - - - +-----+-----+-----+-----+ - // |d2|d3| | q4 | q5 | q6 | q7 | - // |d2|d3| | q4 | q5 | q6 | q7 | - // |d2|d3| | q4 | q5 | q6 | q7 | - // |d2|d3| | q4 | q5 | q6 | q7 | - // +--+--+ - - - - +-----+-----+-----+-----+ - // |d4|d5| | q8 | q9 | q10 | q11 | - // |d4|d5| | q8 | q9 | q10 | q11 | - // |d4|d5| | q8 | q9 | q10 | q11 | - // |d4|d5| | q8 | q9 | q10 | q11 | - // +--+--+ - - - - +-----+-----+-----+-----+ - // |d6|d7| | q12 | q13 | q14 | q15 | - // |d6|d7| | q12 | q13 | q14 | q15 | - // |d6|d7| | q12 | q13 | q14 | q15 | - // |d6|d7| | q12 | q13 | q14 | q15 | - // +--+--+ - - - - +-----+-----+-----+-----+ - // - // Accumulator - - // Load 1 Rhs cell of size 2x4 - "vld1.8 {d0}, [%[rhs_ptr]:64]!\n" - - // Load 3 Lhs cells of size 4x2 each - "vld1.8 {d2}, [%[lhs_ptr]:64]!\n" - "vld1.8 {d4}, [%[lhs_ptr]:64]!\n" - "vld1.8 {d6}, [%[lhs_ptr]:64]!\n" - - // Expand Lhs/Rhs cells to 16 bit. 
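    // (This widening is what allows the vmlal.u16 multiply-accumulates
    // below to produce 32-bit products that go straight into the 32-bit
    // q4--q15 accumulators, with no risk of overflowing a 16-bit type.)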
- "vmovl.u8 q0, d0\n" - "vmovl.u8 q1, d2\n" - "vmovl.u8 q2, d4\n" - "vmovl.u8 q3, d6\n" - - // Multiply-accumulate, level of depth 0 - "vmlal.u16 q4, d2, d0[0]\n" - "vmlal.u16 q5, d2, d0[1]\n" - "vmlal.u16 q6, d2, d0[2]\n" - "vmlal.u16 q7, d2, d0[3]\n" - "vmlal.u16 q8, d4, d0[0]\n" - "vmlal.u16 q9, d4, d0[1]\n" - "vmlal.u16 q10, d4, d0[2]\n" - "vmlal.u16 q11, d4, d0[3]\n" - "vmlal.u16 q12, d6, d0[0]\n" - "vmlal.u16 q13, d6, d0[1]\n" - "vmlal.u16 q14, d6, d0[2]\n" - "vmlal.u16 q15, d6, d0[3]\n" - - // Multiply-accumulate, level of depth 1 - "vmlal.u16 q4, d3, d1[0]\n" - "vmlal.u16 q5, d3, d1[1]\n" - "vmlal.u16 q6, d3, d1[2]\n" - "vmlal.u16 q7, d3, d1[3]\n" - "vmlal.u16 q8, d5, d1[0]\n" - "vmlal.u16 q9, d5, d1[1]\n" - "vmlal.u16 q10, d5, d1[2]\n" - "vmlal.u16 q11, d5, d1[3]\n" - "vmlal.u16 q12, d7, d1[0]\n" - "vmlal.u16 q13, d7, d1[1]\n" - "vmlal.u16 q14, d7, d1[2]\n" - "vmlal.u16 q15, d7, d1[3]\n" - - // Loop. Decrement loop index (depth) by 2, since we just handled 2 - // levels of depth (Kernel::kDepth=2). - "subs %[run_depth], #2\n" - "bne loop_NEONKernel12x4Depth2_%=\n" - - - -Packing code adapts to the format chosen by the kernel -====================================================== - -As explained in doc/design.txt, gemmlowp starts by packing blocks of the -lhs and rhs matrices for optimally efficient traversal by the kernel. This -depends on fine details of the kernel format, in ways that can only be -efficiently handled by knowing these kernel format details at compile-time. - -This is the reason why all the code in internal/pack.h is templated in -the corresponding kernel format. - -The code in internal/pack.h isn't tightly optimized by itself, but it is -structured in such a way that the critical code is in a template, - PackingRegisterBlock, -that can easily be specialized to override the slow generic code with -fast specific packing code for specific formats, on specific platforms. - -See internal/pack_neon.h which provides NEON specializations of the -packing code for the particular kernel formats that are used by the NEON -kernels in internal/kernel_neon.h. - - -Wrapping up: how to optimize gemmlowp for a CPU architecture -============================================================ - -In conclusion, the key feature of gemmlowp when it comes to efficiently -supporting a specific CPU architecture, is that it allows to freely replace -the inner loop of the GEMM by providing one's own GEMM kernel, which is -also free to dictate its required data layout; each data layout then also -needs optimized packing code. The steps are thus: - 1) Freely design a GEMM kernel with a freely chosen data layout - 2) Implement the GEMM kernel, similar to internal/kernel_neon.h - 3) Implement the optimized packing code, similar to internal/pack_neon.h. diff --git a/doc/less-than-8-bit.txt b/doc/less-than-8-bit.txt deleted file mode 100644 index 1a1deaa..0000000 --- a/doc/less-than-8-bit.txt +++ /dev/null @@ -1,305 +0,0 @@ - Computation with less than 8 bits in gemmlowp - ********************************************* - - -Introduction -============ - -We assume familiarity with gemmlowp's low-precision uint8 computation -paradigm, which is described in doc/low-precision.txt. - -This document is about the possibility of further reducing precision -below 8 bits. - -That allows to get higher arithmetic throughput on some architectures, -at the cost of decreased accuracy. 
- - -Public interface -================ - - -The BitDepthSetting parameter in the EightBitIntGemm interface --------------------------------------------------------------- - -Accessing less-than-8-bit computation via the EightBitIntGemm is very -simple: EightBitIntGemm takes a BitDepthSetting enum -which allows to choose among a fixed set of supported bit-depth -combinations. - - -The BitDepthParams parameter in the public/gemmlowp.h interface ---------------------------------------------------------------- - -The public/gemmlowp.h interface exposes more extensive control over -quantization, by means of a BitDepthParams template parameter, -which is a type parameter, carrying information about: - 1. The LHS and RHS bit depth, which can be set arbitrarily and - independently; - 2. The 'RoundingStrategy', which is the heuristic used to choose - a rounding mode, based on the accumulation size (a.k.a. the - "depth" dimension of the Gemm). -Details can be seen in public/bit_depth.h. - - -How does BitDepth{Setting,Params} affect input/output uint8 matrix data? -------------------------------------------------------------------- - -Input/output matrix data is all uint8's, ranging from 0 to 255, regardless of -the BitDepth{Setting,Params}. - -So the BitDepth{Setting,Params} is only an internal detail. It only means to -allow gemmlowp to use lower precision internally, but the input/output data -format is unaffected. - -As far as the API contract goes, the only thing that the -BitDepth{Setting,Params} does is to relax the accuracy requirement. -With standard 8bit/8bit computation, gemmlowp is required to return the exact -result as specified in doc/low-precision.txt. With lower bit depths, gemmlowp -is no longer required to return an exact result. - - -Implementation -============== - -Here we refer to the 3 stages of computation as described in doc/design.txt, -namely: packing, computation kernel, unpacking. - -The general idea is that at the packing stage, we requantize input (Lhs/Rhs) -data to less-than-8-bit depths by scaling them, thus shrinking the range of -the packed matrix entries; for instance, if the Rhs bit depth is to be 5 bits -then packed Rhs matrix entries will be in the range [0 ... 31]. This then -allows the GEMM kernel to use narrower accumulators without risking overflow, -thus achieving higher arithmetic throughput. Finally, at the unpacking stage, -it only remains to scale the result values to compensate for the scalings -applied earlier. - -Let us go into more detail for each of those stages: - - -Packing stage -------------- - -The packing stage is where most of the work specific to the BitDepthParams -takes place. - -Here, we have to scale input matrix values from their original range of -[0 ... 255] to the range specified by the BitDepthParams, which is -[0 ... (2^N)-1] where N is the number of bits for the matrix at hand -(Lhs or Rhs). For example, for a bit depth of 5 bits, we need to scale -down to [0 ... 31]. - -This scaling is what we call "requantization". The pedantic name matches -the fact that this is actually quite nontrivial to do correctly i.e. -in such a way that the result accuracy will be good enough for real-world -applications. See the section below on requantization details. - -Concretely, this work happens in PackingRegisterBlock::Pack(), which calls -Requantize(). This is in internal/pack.h. This code can be overridden for -specific architectures, see internal/pack_neon.h. - -This requantization work is costly and makes packing slower. 
This means -that, at least in our approach, less-than-8-bit computation is only -interesting for large-enough, square-enough GEMMs where packing is only -a small fraction of the overall cost. In cases where packing overhead -is more prevalent (highly rectangular cases), less-than-8-bit is probably -a waste of time as long as we treat it as an internal computation detail. -What might help there, might be if we shrunk the input/output data format -to lower memory bandwidth usage. - - -Computation kernel stage ------------------------- - -In principle, the computation kernel stage simply doesn't have to care -about the bit depth at all. In fact, on architectures where we do not have -specific optimized kernels for less-than-8-bit cases, we simply use our -standard kernel there, and that's correct! - -However, while the kernel doesn't have to know about the fact that the -operands are on less than 8 bits, it can use that information to make -special optimizations that would be incorrect in the general 8-bit case -and become correct here thanks to the more restricted range of inputs. -That's the whole point of this less-than-8-bit computation idea. - -With Lhs entries guaranteed to be smaller than 2^N, and Rhs entries -guaranteed to be smaller than 2^M, each product is thus guaranteed to be -smaller than 2^(M+N). Thus, one may accumulate 2^(16-(M+N)) such products -and still be guaranteed that such an accumulator will be smaller than 2^16, -and thus can be stored as a uint16 without risking overflow. - -For example, in the L7R5 case, the Lhs enties are on 7 bits (N=7) and the -Rhs entries are on 5 bits (M=5), so each product fits in 12 bits and one can -thus accumulate 16 ( = 2^(16-12)) such products into uint16 accumulators -with no risk of overflow. - -This means that a computation kernel may use uint16 accumulators for -several loop iterations (16 in the above example), provided that it is -allowed to assume that inputs are in such restricted range. - -After this fixed number of loop iterations, the kernel must accumulate -the local uint16 accumulators back into global uint32 accumulators. - -On SIMD architectures with suitable uint16 arithmetic, this in principle -allows to multiply arithmetic throughput by up to 2x, since twice more -accumulators now fit in each SIMD vector register. This is partially offset -by the cost of accumulating back into global uint32 accumulators every -several loop iterations, but our experience on ARM NEON has been that -we still get quite close to a 2x speedup. See internal/kernel_neon.h, -specifically NEON32Kernel12x4Depth2Assuming12BitProducts. - - -Unpacking stage ---------------- - -At the unpacking stage, it only remains to scale the result values -to compensate for the scaling of the inputs. This is easier because -now we are expanding the range instead of shrinking it, so we don't -need to worry about ways to minimize a loss of accuracy. We simply -need to multiply result values by a constant fraction, rounding to nearest. - -Since the inputs were scaled by factors of (2^lhs_bits - 1)/255 and -(2^rhs_bits - 1)/255 respectively, the scaling of the outputs needs to be -by the following factor: - - 255 * 255 - ----------------------------------- - (2^lhs_bits - 1) * (2^rhs_bits - 1) - -This is done by a MultiplyByConstantFraction function, see internal/unpack.h - - -Requantization details -====================== - -Here we go into more detail on the Requantize() function used at the packing -stage to requantize input matrix data. 
See this function in internal/pack.h. - -It depends on the bit depth and on a rounding mode, and requantizes an input -value in [0 ... 255] to the range [0 ... (2^N)-1] specified by the bit depth N. - - -Naive, bad rounding, that's plainly biased ------------------------------------------- - -Naive and inaccurate ways to achieve this requantization include: - 1. By shifting bits rights by (8-N) bits; - 2. By multiplying by ((2^N) - 1) and dividing by 255. - -Both of those are biased in some way: 1. has the wrong "derivative", since it -approximates (((2^N) - 1) / 255) by ((2^N) / 256) ; 2. has bias since it -effectively implements rounding towards 0. - -In practice, both of the above requantization functions give results that are -too inaccurate in practice for the neural network that we tried (GoogLeNet). - -Round-to-nearest rounding: unbiased in principle but not in practice --------------------------------------------------------------------- - -The simplest fix is to avoid the bias in 2. by rounding-to-nearest instead -of rounding towards 0. This can be achieved by doing - - dst = (src * maxval + rounding_offset) / 255; - -Where maxval = ((2^N) - 1) is the highest requantized value, and the -rounding_offset can be set to - - rounding_offset = 127 - -to achieve rounding-to-nearest (while the above rounding towards 0 -corresponded to rounding_offset = 0). - -In principle, rounding-to-nearest is unbiased and optimal in various ways. - -In practice though, our input data is not random real numbers, but -already-quantized 8-bit values. That means that even in the best case, there -would be at most 255 different possible input values; in practice, we generally -see the input values distributed non-uniformly in that range, so that a majority -of input values tend to be in a much smaller range. See test/test_data.cc. - -Having a large part of the input values in a very small finite set, means that -the corresponding rounding errors are also in a very small finite set, which -can be small enough that the mean of these rounding errors is significantly -different from 0. That rounding-to-nearest is "unbiased" only means that over -a sufficiently large set of input values, the bias would become arbitrarily -close to 0; here, the set of input values is effectively small enough that the -resulting bias is significant. - -This leads to biasing the matrix product entries, resulting in an error that -grows linearly with the depth dimension of the GEMM. - - -Probabilistic rounding: unbiased even on small finite input distributions -------------------------------------------------------------------------- - -To address that, we can instead use probabilistic rounding. The idea is that -for instance if we have to round the value 3.8 to the nearest integer, we can -round it to 3 with 20% probability and to 4 with probability 80%. If that value -3.8 occurs many times, the mean requantized value will thus tend to 3.8. - -This amounts to keeping the above requantization formula, - - dst = (src * maxval + rounding_offset) / 255; - -but now the rounding_offset is a random value in [0 .. 254]. - -This guarantees zero bias no matter how small the distribution of input values -is. - -On the other hand, the variance of the error term here is higher than with -rounding-to-nearest --- one can check that it is 2x higher. - -So the error term coming from the Central Limit Theorem, which grows with -the square root of the accumulator depth i.e. the GEMM depth, -will be 2x higher here. 
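To make the formula concrete, here is a minimal scalar sketch of this requantization (for illustration only; the actual Requantize() code in internal/pack.h is vectorized and templatized on the bit depth):

    #include <cstdint>

    // Requantizes a uint8 value to dst_bits bits of precision.
    // rounding_offset = 127 gives rounding-to-nearest; a value drawn
    // uniformly from [0 .. 254] gives probabilistic rounding.
    std::uint8_t Requantize(std::uint8_t src, int dst_bits,
                            std::uint8_t rounding_offset) {
      const int maxval = (1 << dst_bits) - 1;
      return static_cast<std::uint8_t>((src * maxval + rounding_offset) / 255);
    }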
- -Still, for large enough GEMM depth, that is better than rounding-to-nearest -which has an error term growing linearly with the GEMM depth. - - -Switching between rounding-to-nearest and probabilistic rounding ----------------------------------------------------------------- - -Thus, for fixed input values and bit depths, we expect that probabilistic -rounding will give more accurate results for large enough GEMM depths, while -rounding-to-nearest will be more accurate for smaller GEMM depths. - -That is why use switch between these rounding modes based on GEMM depth, -see ChooseRoundingMode in internal/bit_depth_util.h. - -It is based on a constant, kProbabilisticRoundingThreshold, defined -in internal/common.h and empirically determined. See the comment there. -It would be nice to better understand the statistics here and come up -with better heuristics for this switching. - - -Choice of pseudorandom number generator ---------------------------------------- -We provide two PRNGs. The first is an 8-bit Xorshift. -It is fast, naturally produces values ranging -over an interval of width 255, which is what we need here (as opposed -to an interval of width 256), and turns out, from empirical tests, -to produce better results than a linear congruential generator (LCG). -That's unfortunate, as a 8-bit LCG performs better (we confirmed that -on various ARM devices) but we need as perfect un-biased-ness as we can -get. - -The second is an "add-mod" sequence generator, which generates -non-random values in the sequence x = (x + 97) % 255. This -generates a low-discrepancy sequence that minimizes the "clumpiness" -of the random offsets (Thus, for example, quantizing a 3x3 matrix will -have a maximum additive error of about 200 from the random offsets). -While not random, this sequence performs well empirically for many -quantizations. (For information about why 97 is a good value, see -https://en.wikipedia.org/wiki/Low-discrepancy_sequence#Additive_recurrence -and http://mollwollfumble.blogspot.com/2011/03/subrandom-numbers.html -97/255 = 0.38; 0.382 is the best choice. For discrete numbers, -the choice must be relatively prime to the modulus. 97 is prime, -so it is safely relatively prime to 255. 107 is another near-optimal -choice. - -The low-discrepancy sequence generator is the default. - -More details and results are given in a comment on the default -PRNG in internal/pack.h. Interested users can change the -PRNG used by setting DefaultRoundingGenerator in bit_depth_util.h. diff --git a/doc/low-precision.md b/doc/low-precision.md index 97b1498..b4ff59e 100644 --- a/doc/low-precision.md +++ b/doc/low-precision.md @@ -26,7 +26,7 @@ computation paradigms, by allowing the user to set up a chain of transformations to be performed on internal 32bit accumulators to obtain the final outputs. The public entry point in [public/gemmlowp.h](../public/gemmlowp.h) allowing to -set un an arbitrary output pipeline is `GemmWithOutputPipeline`. +set up an arbitrary output pipeline is `GemmWithOutputPipeline`. 
Refer to [quantization.md](quantization.md) for details of how one gets from first principles to the actual output pipelines to assemble for successful diff --git a/doc/low-precision.txt b/doc/low-precision.txt deleted file mode 100644 index 893961f..0000000 --- a/doc/low-precision.txt +++ /dev/null @@ -1,159 +0,0 @@ - The low-precision paradigm in gemmlowp, and how it's implemented - **************************************************************** - - -Introduction -============ - -"Low-precision" means that the input and output matrix entries are integers -on at most 8 bits. The scalar type is uint8_t. - -This isn't the same as just doing plain matrix arithmetic over uint8_t, -because that would overflow. To avoid overflow, we internally accumulate -results on more than 8 bits, and at the end we keep only some significant -8 bits. This relies on the caller providing suitable offset/multiplier/shift -parameters, which effectively govern how we extract some significant 8 bit -from our more-than-8bit temporary accumulators. - -Gemmlowp supports further reducing precision below 8 bits. That is not -the subject of this document; for that, refer to doc/less-than-8-bit.txt. - - -The low-precision paradigm -========================== - -gemmlowp is an implementation of the EightBitIntGemm paradigm, where quantized -matrix multiplication takes the following input parameters: - - the lhs matrix of uint8 quantized values - - the rhs matrix of uint8 quantized values - - the following int32 "quantization parameters", which control how the - uint8 quantized values in the matrices are to be interpreted during the - matrix computation: - - lhs_offset - - rhs_offset - - result_offset - - result_mult_int - - result_shift - -The mathematical expression to be computed is the result of the following steps: - 1. Cast lhs entries from uint8 to int32 and add lhs_offset to each of them. - 2. Cast rhs entries from uint8 to int32 and add rhs_offset to each of them. - 3. Compute the int32 matrix product of the resulting lhs times rhs. - 4. Add result_offset to each entry of the result. - 5. Multiply each entry of the result by the following fraction, and round - to the nearest integer: - - result_mult_int - --------------- (1) - 2^result_shift - - 6. Clamp the resulting int32 values to the [0..255] range and cast to uint8. - -Thus the caller of this interface is expected to have precomputed suitable -quantization parameters - -The rationale for these parameters is as follows: - - The three offsets may improve quantization accuracy in cases where the - range of values is limited, and they also conveniently allow to reduce all - eight combinations of signednesses to just the unsigned*unsigned->unsigned - case. One may at first glance worry that these offsets would incur - substantial overhead to the GEMM computation, but that is actually not the - case thanks to a trick described below (see "Efficient handling of - offsets"). - - The result_mult_int and result_shift parameters allow approximating - arbitrarily closely any real multiplier, as a fraction of the form given - in (1) above, without using floating-point arithmetic and without using - a division instruction (only a right shift). - - -Efficient handling of offsets -============================= - -At first glance it may seem like the above-described quantized computation -scheme requires adding the lhs_offset and rhs_offset to each of the lhs and -rhs matrix entries. 
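Spelled out naively, that would look something like the following scalar sketch (for illustration only, not gemmlowp code; it covers steps 1-3 of the paradigm and leaves out the result_offset/mult/shift/clamp steps):

    #include <cstdint>

    // Naive reference: the offsets are applied to every entry inside the
    // inner loop. Matrices are row-major; 'result' receives the raw int32
    // accumulators.
    void NaiveGemmWithOffsets(const std::uint8_t* lhs, const std::uint8_t* rhs,
                              std::int32_t* result, int rows, int depth,
                              int cols, std::int32_t lhs_offset,
                              std::int32_t rhs_offset) {
      for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
          std::int32_t accum = 0;
          for (int d = 0; d < depth; d++) {
            accum += (lhs[r * depth + d] + lhs_offset) *
                     (rhs[d * cols + c] + rhs_offset);
          }
          result[r * cols + c] = accum;
        }
      }
    }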
- -Doing that in the GEMM kernel would incur substantial overhead: - - It would mean extra arithmetic work in the GEMM kernel; - - It would require storing the lhs_offset and rhs_offset in registers, - which would eat into the register space available for the rest of the - GEMM kernel. - -One may then consider adding the lhs_offset and rhs_offset once and for all -to lhs and rhs blocks, in a GEMM implementation operating on one lhs block -and one rhs block at a time. However, doing so would require storing lhs and -rhs blocks in 32 bit (or at least in 16 bit in real-world cases), which would -partially negate the memory bandwidth benefits of low-precision computation. - -Fortunately, there is another way to handle these offsets that has none of -the costs of the approaches described above. The idea is as follows. - -Let P denote the matrix shaped like lhs, but filled with 1's. -Let Q denote the matrix shaped like rhs, but filled with 1's. - -Adding lhs_offset to each entry of lhs, means adding lhs_offset * P to lhs. -Adding rhs_offset to each entry of rhs, means adding rhs_offset * Q to rhs. - -Thus, as far as handling lhs_offset and rhs_offset goes, the matrix product to be -computed is: - - (lhs + lhs_offset * P) * (rhs + rhs_offset * Q) - -Expanding this (using distributivity of matrix multiplication over addition), -we see that the above product is equal to the following sum of 4 terms: - - lhs * rhs (2) - + lhs_offset * P * rhs - + lhs * rhs_offset * Q - + lhs_offset * rhs_offset * P * Q - -The first term, lhs * rhs, is just the matrix multiplication ignoring the -offsets, i.e. as if lhs_offset==rhs_offset==0. Our claim here is that this -is all what we have to compute in the GEMM kernel. - -In the second term, lhs_offset * P * rhs, notice that since P is filled -with 1's, P * rhs has all its rows equal to each other, and equal to the -row-vector of sums of all the entries in each column of rhs. - -Thus, we can compute the second term, lhs_offset * P * rhs, by summing -each column of rhs. This produces a single row-vector, and in order to add the -second term, we simply need to add this row-vector (multiplied by lhs_offset) -to each row of the result. This is just a rank one update of the result -(equivalently, the second term is a rank one matrix), and we can efficiently -store it as a single vector. - -The third term, lhs * rhs_offset * Q, is entirely similar to the second one, -and can be similarly computed by summing each row of lhs, storing this in a -single column-vector, and later multiplying these sums by rhs_offset. - -The fourth term is a single constant, repeated into all the entries of the -matrix. The matrix P * Q is filled with the single constant value 'depth' -(the depth the the matrix product i.e. the number of columns of the lhs). -Thus the fourth term is simply the rank zero update adding this constant -to each matrix entry: - - lhs_offset * rhs_offset * depth - - -Implementation of this technique in gemmlowp -============================================ - -In gemmlowp, at the packing stage (where we traverse blocks of the lhs and rhs -to prepare them for efficient repeated traversal by the kernel), we compute -the sum of each row of the lhs block and the sum of each column of the rhs -block. - -See in internal/pack.h, in the PackedSideBlock class, the following member: - - // Handle on the additional buffer backing the vector of sums of slices - // associated with this block. Owned. 
- Allocator::Handle sums_of_each_slice_handle_; - -sums_of_each_slice_handle_ is the handle to the buffer allocated to store -the vector containing sums of rows of lhs, or of sums of columns of rhs. - -After these rank one updates have been computed at the packing stage, they are -ignored at the compute kernel stage, since that stage is only concerned -with the first of the four terms in (2); they are only used at the unpacking -stage. See the default/reference implementation, UnpackResultImpl, in -internal/unpack.h. diff --git a/doc/output.md b/doc/output.md index f9985a8..8620fbe 100644 --- a/doc/output.md +++ b/doc/output.md @@ -10,8 +10,8 @@ the output pipeline, so as to allow different users to implement different quantization paradigms. See [low-precision.md](low-precision.md) and [quantization.md](quantization.md). -Besides implementing a quantization paradigms, the other thing that output -pipelines are good for, is implementing fused operations where a matrix +Besides implementing a quantization paradigm, the other thing that output +pipelines is good for, is implementing fused operations where a matrix multiplication feeds into other operations applied to its result, without additional array traversals. For instance, when implementing neural network inference, one might have a Convolutional layer with a bias-addition and an diff --git a/doc/packing.txt b/doc/packing.txt deleted file mode 100644 index 2b94512..0000000 --- a/doc/packing.txt +++ /dev/null @@ -1,206 +0,0 @@ - The packing stage in gemmlowp - ***************************** - - -Introduction -============ - -We assume familiarity with doc/design.txt and with the overall -3 stages of computations described there: packing, kernel, unpacking. - -This page goes into more details about the first stage: packing. - -We also assume familiarity with doc/kernel.txt as it describes the -packed format requirements that the kernels expect, and that forms -basically the contract that the packing stage must honor. - -Some parts below also assume familiarity with doc/low-precision.txt -as the packing stage also has to compute the vectors of sums or columns as -described there. - - -The storage order of packed blocks, partly hidden behind sequential access -========================================================================== - -As explained in doc/design.txt, the primary purpose of packing is to -ensure that when the kernel traverses Lhs/Rhs matrix data, it can do -so efficiently thanks to having the data stored in an order that is -as similar as possible to the order in which the compute stage has to -traverse this data. - -This traversal order is nontrivial for the reasons outlined in -doc/design.txt: at the innermost level, one tries to work within registers -as much as possible; at the next level, one tries to stay within L1 cache -as much as possible. The packed blocks that we handle are supposed -to fit entirely in L2 cache. - -Thus it has become standard in GEMM design to describe complicated -"Z-order" or "fractal order" storage for packed blocks. - -However, we should keep in mind that the whole point of the packed storage -order is to be as similar as possible to the order of traversal during -the compute stage. The storage order doesn't matter in itself; the only -thing that matters is simple access patterns during the compute stage. 
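To see what "simple access patterns" means in practice, here is roughly how the compute stage consumes a packed block, condensed from ComputeImpl::ComputeRun in internal/compute.h:

    // Seek both packed blocks to the start of the run, then let the kernel
    // read them strictly sequentially from there.
    packed_lhs_.seek_run(start_row, start_depth);
    packed_rhs_.seek_run(start_col, start_depth);
    kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(),
                packed_result_block.cols_stride(), packed_lhs_.current_data(),
                packed_rhs_.current_data(), start_depth, depth);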
- -This suggests the following approach to implementing packing: take the -exact same hierarchy of nested loops of the compute stage, drop the loops -that are not relevant to the side (Lhs or Rhs) being packed, and try to use -mostly sequential access to the destination packed data. - -This hierarchy of nested loops can be seen in PackSideBlockImpl -(PackL2, PackL1, PackRun), compare to the similar hierarchy of loops -in internal/compute.h. - -In this way, the more intricate small-scale details or the packed data format -never need to be made explicit (which would be complicated). We still use -some "seeking" but only at larger scales, where the storage order is less\ -complicated to describe. - - -Sequential access to PackedSideBlock data ------------------------------------------ - -See PackedSideBlock in internal/pack.h, specifically the following data -members: - - // Handle on the buffer backing this packed block. Owned. - Allocator::Handle data_handle_; - -and: - - // pos_ is the current position in the buffer, which we access - // sequentially, like a file. - // The idea is that we pack data in the same order as it is - // going to be traversed during the computation, which for - // cache-friendliness reasons is complicated to random-access, - // as the offsets calculations would be intricate. So we - // give up random-access addressing, and instead content ourselves - // with sequential access. - // - // pos_ is mutable because during the computation we will want to - // be able to iterate on the data in a const PackedSideBlock. - mutable int pos_; - -The methods exposing sequential access are: - - std::uint8_t* current_data() { - return allocator_->GetPointer(data_handle_) + pos_; - } - -and: - - void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; } - - void seek_forward_n_cells(int n) const { - pos_ += n * KernelSideFormat::Cell::kSize; - } - - -Random access to PackedSideBlock data at larger scales ------------------------------------------------------- - -We still need some random access at larger scales (with high granularity), -which is unavoidable since GEMM is O(n^3) and has to traverse each of the -O(n^2) inputs O(n) times. - -The watershed between sequential access and random access is at the level -of a 'Run'. Throughout gemmlowp we consistently use the term 'Run' to refer -to the innermost GEMM loop in the depth dimension. That's the critical -inner loop that must be as fast as possible, thus for which we absolutely -want sequential access to packed data so that the storage order is optimal -by construction. At larger scales i.e. between runs, we accept that -the storage order is less optimal and since it's also less intricate, it's -not too hard to implement random access there. - -This is done by the seek_run method: - - void seek_run(int start_width, int start_depth) const { - int kernel_run_depth = - std::min(params_.l1_depth, params_.l2_depth - start_depth); - pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth; - } - -We see that the formula involves the l1_depth parameter, which is how the -packed storage order depends on L1 cache size. Again, the whole packed -block is supposed to fit in L2 cache. - - -The innermost loop of the packing stage, PackRun, and PackingRegisterBlock -========================================================================== - -Keeping with our consistent usage of the term 'Run' throughout gemmlowp, -the innermost loop is called PackRun(). 
- -Here we recall a very important principle that was explained in -doc/kernels.txt: the kernel is free to dictate the precise data format -that it expects; the packing code has to honor it. So there's an asymmetry -here: the kernel is the master, the packing is the slave. That's why the -packing code is templatized in the KernelSideFormat. At larger scales, -the packing is independent of kernel format details, but inside PackRun is -where we take care of the small-scale details that do depend on the kernel -format details. That's why it's a good thing that we only need sequential -access here, as it would be very complicated to spell out random access -at this scale. - -Anyway, PackRun. - -Since it is the critical inner loop, it is what we want to allow specializing -for particular CPU architectures. To allow that, we handle at a time blocks -of fixed dimensions, that is intended to be friendly enough to optimization. -These blocks are PackingRegisterBlock's and their dimensions are: - width = KernelWidth - depth = kRegisterSize -See doc/kernels.txt and internal/kernel.h for the former, and internal/common.h -for the latter. - -See the comments around PackingRegisterBlock in internal/pack.h: - -// A PackingRegisterBlock is a small fixed-size block of a matrix being -// packed. This class is the generic non-optimized implementation, -// it is inherited by the generic implementation of PackingRegisterBlock, -// which may be overriden by template specialization. Overriding it is how -// one may provide optimized packing code paths. -// -// The packing of a block proceeds in two steps: -// 1. Ensuring that we have a complete block of source data, i.e. a block of -// the compile-time prescribed size. This is where we handle unaligned -// boundaries: if we don't have a complete block of source data, then -// we copy and zero-extend it into a local temporary (complete_src_), -// see MakeCompleteSrc. In the generic case, we do have a complete block, -// so we just use it in-place, see UseCompleteSrcInPlace. -// 2. Packing a complete block into the destination, see Pack. This is the -// most critical part, so it's convenient that unaligned boundaries have -// already been handled in step 1. - - -Other things that the packing stage has to do -============================================= - -Besides storing matrix entries in a suitable order, the packing stages also has -two other things to do. - -First, packing has to compute the vectors of sums of entries along the depth -dimension. If this is any mysterious, read doc/low-precision.txt. These will -only be used at the unpacking stage. - -Second, if the BitDepthSetting requires less than 8 bit of precision, then at -the packing stage we have to requantize inputs accordingly. -See doc/less-than-8-bit.txt for details. This is the Requantize() function. - - -Specialized packing paths for specific formats on specific CPU architectures -============================================================================ - -Please refer to internal/pack_neon.h for examples of doing that. The piece -of code to be specialized is PackingRegisterBlock. However, inside of it, -only the Pack() method typically needs to be specialized (the rest is unlikely -to be critical). So one typically specializes PackingRegisterBlock but still -inheriting PackingRegisterBlockBase to keep the generic stuff, and then one -typically wants to override the Pack() method. 
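Schematically, such a specialization looks like this (a sketch with made-up type names; see internal/pack_neon.h for the real ones):

    // 'SomeSrcMapType' and 'SomeKernelSideFormat' are placeholders for
    // whatever source map and kernel-side format the optimized path targets.
    template <>
    class PackingRegisterBlock<SomeSrcMapType,
                               PackedSideBlock<SomeKernelSideFormat> >
        : public PackingRegisterBlockBase<
              SomeSrcMapType, PackedSideBlock<SomeKernelSideFormat> > {
     public:
      // Only the critical step is overridden; MakeCompleteSrc() and the
      // rest of the generic machinery are inherited from the base class.
      void Pack(PackedSideBlock<SomeKernelSideFormat>* dst, int start_width) {
        // Architecture-specific (e.g. NEON) packing of one complete
        // register block goes here.
      }
    };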
- -Template specialization for the right template parameters is how one specifies -in which case a given path is to be used in place of the generic packing code. - -It is entirely possible to set the value of kRegisterSize differently based on -the CPU architecture (for example, 32 on x86 with AVX) as long as all the -specialized packing paths used on that CPU architecture are consistent with it. diff --git a/eight_bit_int_gemm/eight_bit_int_gemm.cc b/eight_bit_int_gemm/eight_bit_int_gemm.cc index 8113bf3..512c483 100644 --- a/eight_bit_int_gemm/eight_bit_int_gemm.cc +++ b/eight_bit_int_gemm/eight_bit_int_gemm.cc @@ -307,7 +307,7 @@ void EightBitIntGemm(bool transpose_a, bool transpose_b, bool transpose_c, std::int32_t b_offset, int ldb, std::uint8_t* c, std::int32_t c_offset, std::int32_t c_mult_int, std::int32_t c_shift, int ldc, BitDepthSetting bit_depth) { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::EightBitIntGemm()); GemmContext* context = GetOrCreateGlobalContext(); #if defined(GEMMLOWP_USE_META_FASTPATH) && defined(GEMMLOWP_NEON) @@ -344,7 +344,7 @@ void EightBitIntGemm(bool transpose_a, bool transpose_b, bool transpose_c, const std::uint8_t* b, std::int32_t b_offset, std::int32_t ldb, float* c, float c_offset, std::int32_t ldc, BitDepthSetting bit_depth) { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::EightBitIntGemm()); GemmContext* context = GetOrCreateGlobalContext(); #if defined(GEMMLOWP_USE_META_FASTPATH) && defined(GEMMLOWP_NEON) @@ -405,13 +405,13 @@ void EightBitIntGemm(bool transpose_a, bool transpose_b, bool transpose_c, } void SetMaxNumThreads(int n) { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::EightBitIntGemm()); GemmContext* context = GetOrCreateGlobalContext(); context->set_max_num_threads(n); } void FreePersistentResources() { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::EightBitIntGemm()); DestroyGlobalContext(); DestroyGlobalScratch(); } diff --git a/internal/allocator.h b/internal/allocator.h index da325a4..3a6f077 100644 --- a/internal/allocator.h +++ b/internal/allocator.h @@ -41,23 +41,6 @@ #include "common.h" -#if defined(__ANDROID__) -#include -// The 18 here should be 16, but has to be 18 for now due -// to a Google-internal issue. 
-#if __ANDROID_API__ < 18 -#include -#define GEMMLOWP_USE_MEMALIGN -#endif -// posix_memalign is missing on some 4.1 x86 devices -#if __ANDROID_API__ == 18 -#ifdef GEMMLOWP_X86_32 -#include -#define GEMMLOWP_USE_MEMALIGN -#endif -#endif -#endif - namespace gemmlowp { enum class TypeId : std::uint8_t { Uint8, Int8, Uint16, Int16, Uint32, Int32 }; @@ -115,13 +98,7 @@ class Allocator { if (reserved_bytes_ > storage_size_) { DeallocateStorage(); storage_size_ = RoundUpToPowerOfTwo(reserved_bytes_); -#ifdef GEMMLOWP_USE_MEMALIGN - storage_ = memalign(kAlignment, storage_size_); -#else - if (posix_memalign(&storage_, kAlignment, storage_size_)) { - storage_ = nullptr; - } -#endif + storage_ = aligned_alloc(kAlignment, storage_size_); } ReleaseBuildAssertion(!storage_size_ || storage_, "allocation failure"); @@ -192,7 +169,7 @@ class Allocator { private: void DeallocateStorage() { assert(!committed_); - free(storage_); + aligned_free(storage_); storage_size_ = 0; } diff --git a/internal/block_params.h b/internal/block_params.h index b2fc3ff..aedd261 100644 --- a/internal/block_params.h +++ b/internal/block_params.h @@ -43,13 +43,12 @@ struct BlockParams { int l2_depth; template - void Init(int rows, int cols, int depth, int num_threads, - int l1_bytes_to_use, int l2_bytes_to_use, float l2_rhs_factor) { + void Init(int rows, int cols, int depth, int num_threads, int l1_bytes_to_use, + int l2_bytes_to_use, float l2_rhs_factor) { FindL2BlockSizes(rows, cols, depth, num_threads, - l2_bytes_to_use, l2_rhs_factor, - &l2_rows, &l2_cols, &l2_depth); - FindL1BlockSizes(l2_rows, l2_cols, l2_depth, - l1_bytes_to_use, + l2_bytes_to_use, l2_rhs_factor, &l2_rows, + &l2_cols, &l2_depth); + FindL1BlockSizes(l2_rows, l2_cols, l2_depth, l1_bytes_to_use, &l1_rows, &l1_cols, &l1_depth); } @@ -61,6 +60,10 @@ struct BlockParams { int l2_rows = 0; int l2_cols = 0; int l2_depth = 0; + + int per_thread_rows = + std::max(1, RoundUp(rows) / num_threads); + // No L2 blocking in the depth dimension at the moment. // Too much loss of accuracy due to storing intermediate results in // low precision. @@ -81,15 +84,15 @@ struct BlockParams { // dimension concerns only the LHS. Blocking only RHS matrix for L2 enhances // the performance on x86. if (l2_rhs_factor == 1.0f) { - l2_rows = RoundUp(rows); + l2_rows = RoundUp(per_thread_rows); } else { int max_cache_friendly_l2_rows = std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) / (num_threads * (l2_depth + 4 * l2_cols))); - int min_l2_rows_blocks = - std::max(1, CeilQuotient(rows, max_cache_friendly_l2_rows)); - l2_rows = - RoundUp(CeilQuotient(rows, min_l2_rows_blocks)); + int min_l2_rows_blocks = std::max( + 1, CeilQuotient(per_thread_rows, max_cache_friendly_l2_rows)); + l2_rows = RoundUp( + CeilQuotient(per_thread_rows, min_l2_rows_blocks)); } *out_l2_rows = l2_rows; diff --git a/internal/common.h b/internal/common.h index 511809d..9de151b 100644 --- a/internal/common.h +++ b/internal/common.h @@ -18,7 +18,8 @@ #ifndef GEMMLOWP_INTERNAL_COMMON_H_ #define GEMMLOWP_INTERNAL_COMMON_H_ -#include +#include "../internal/platform.h" +#include "../profiling/pthread_everywhere.h" #include #include @@ -112,12 +113,22 @@ #define GEMMLOWP_SSE3_64 #endif +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +#include +#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __msan_unpoison +#elif __has_feature(address_sanitizer) +#include +#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __asan_unpoison_memory_region +#endif +#endif + #endif // GEMMLOWP_ALLOW_INLINE_ASM // Detect Android. 
Don't conflate with ARM - we care about tuning // for non-ARM Android devices too. This can be used in conjunction // with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86 CPUs. -#if defined(__ANDROID__) +#if defined(__ANDROID__) || defined(ANDROID) #define GEMMLOWP_ANDROID #endif @@ -205,7 +216,7 @@ inline void Prefetch(const void* ptr) { // leaving __builtin_prefetch a no-op on this architecture. // For our purposes, "pldl1keep" is usually what we want, meaning: // "prefetch for load, into L1 cache, using each value multiple times". - asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) : ); + asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :); #elif defined \ __GNUC__ // Clang and GCC define __GNUC__ and have __builtin_prefetch. __builtin_prefetch(ptr); @@ -251,6 +262,17 @@ struct IsPowerOfTwo { static const bool value = !(N & (N - 1)); }; +template +void MarkMemoryAsInitialized(T* ptr, int size) { +#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED + GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast(ptr), + size * sizeof(T)); +#else + (void)ptr; + (void)size; +#endif +} + } // namespace gemmlowp #endif // GEMMLOWP_INTERNAL_COMMON_H_ diff --git a/internal/compute.h b/internal/compute.h index bbc9e2a..4417507 100644 --- a/internal/compute.h +++ b/internal/compute.h @@ -63,6 +63,19 @@ class ComputeImpl { } private: + static void MarkPackedResultBlockAsInitialized( + const MatrixMap& packed_result_block) { +#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED + for (int col = 0; col < packed_result_block.cols(); col++) { + MarkMemoryAsInitialized( + packed_result_block.data() + col * packed_result_block.cols_stride(), + packed_result_block.rows()); + } +#else + (void)packed_result_block; +#endif + } + void ComputeRun(int start_row, int start_col, int start_depth, int depth) GEMMLOWP_NOINLINE { packed_lhs_.seek_run(start_row, start_depth); @@ -72,6 +85,7 @@ class ComputeImpl { kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(), packed_result_block.cols_stride(), packed_lhs_.current_data(), packed_rhs_.current_data(), start_depth, depth); + MarkPackedResultBlockAsInitialized(packed_result_block); } void ComputeL1(int start_row, int rows, int start_col, int cols, diff --git a/internal/kernel.h b/internal/kernel.h index 4d006af..825a7f3 100644 --- a/internal/kernel.h +++ b/internal/kernel.h @@ -183,6 +183,7 @@ inline const char* CellOrderName(CellOrder o) { // Returns the offset into a cell, at which a given coefficient is stored. template inline int OffsetIntoCell(int w, int d) { + const int size = CellFormat::kWidth; switch (CellFormat::kOrder) { case CellOrder::DepthMajor: return w + d * CellFormat::kWidth; @@ -190,7 +191,6 @@ inline int OffsetIntoCell(int w, int d) { return d + w * CellFormat::kDepth; case CellOrder::Diagonal: assert(CellFormat::kWidth == CellFormat::kDepth); - static const int size = CellFormat::kWidth; return ((size + w - d) * size + d) % (size * size); default: assert(false); diff --git a/internal/kernel_default.h b/internal/kernel_default.h index 7ed55b8..7037bda 100644 --- a/internal/kernel_default.h +++ b/internal/kernel_default.h @@ -88,6 +88,10 @@ GEMMLOWP_SET_DEFAULT_KERNEL(false, false, SSE4_64_Kernel12x4Depth2) // SIMD is not available on this platform. The slow fallback will be used. // Don't require GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK because there's nothing // the user can do about it. +#elif defined __powerpc__ +// There is currently no fast kernel using SIMD instructions on POWER. 
Don't +// require GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK because there's nothing the user +// can do about it. #else #error \ "SIMD not enabled, you'd be getting a slow software fallback. Consider \ diff --git a/internal/multi_thread_gemm.h b/internal/multi_thread_gemm.h index 0234b26..df7387a 100644 --- a/internal/multi_thread_gemm.h +++ b/internal/multi_thread_gemm.h @@ -19,8 +19,6 @@ #ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ #define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ -#include -#include #include #include "single_thread_gemm.h" @@ -58,8 +56,10 @@ inline int Do256NOPs() { #undef GEMMLOWP_NOP inline void WriteBarrier() { -#ifdef GEMMLOWP_ARM_32 +#if defined(_MSC_VER) MemoryBarrier(); +#elif defined(GEMMLOWP_ARM_32) + asm volatile("" ::: "memory"); #elif defined(GEMMLOWP_ARM_64) asm volatile("dmb ishst" ::: "memory"); #elif defined(GEMMLOWP_X86) @@ -70,8 +70,10 @@ inline void WriteBarrier() { } inline void ReadBarrier() { -#ifdef GEMMLOWP_ARM_32 +#if defined(_MSC_VER) MemoryBarrier(); +#elif defined(GEMMLOWP_ARM_32) + asm volatile("" ::: "memory"); #elif defined(GEMMLOWP_ARM_64) asm volatile("dmb ishld" ::: "memory"); #elif defined(GEMMLOWP_X86) @@ -148,10 +150,16 @@ T WaitForVariableChange(volatile T* var, T initial_value, pthread_cond_t* cond, class BlockingCounter { public: BlockingCounter() - : cond_(PTHREAD_COND_INITIALIZER), - mutex_(PTHREAD_MUTEX_INITIALIZER), - count_(0), - initial_count_(0) {} + : count_(0), + initial_count_(0) { + pthread_cond_init(&cond_, nullptr); + pthread_mutex_init(&mutex_, nullptr); + } + + ~BlockingCounter() { + pthread_cond_destroy(&cond_); + pthread_mutex_destroy(&mutex_); + } // Sets/resets the counter; initial_count is the number of // decrementing events that the Wait() call will be waiting for. @@ -187,7 +195,15 @@ class BlockingCounter { void Wait() { ScopedProfilingLabel label("BlockingCounter::Wait"); while (count_) { - MemoryBarrier(); +#ifdef GEMMLOWP_USE_BUSYWAIT + ReadBarrier(); +#else + // This is likely unnecessary, but is kept to ensure regressions are not + // introduced. +#ifndef _WIN32 + asm volatile("" ::: "memory"); +#endif +#endif const std::size_t count_value = count_; if (count_value) { WaitForVariableChange(&count_, count_value, &cond_, &mutex_); @@ -222,16 +238,18 @@ class Worker { explicit Worker(BlockingCounter* counter_to_decrement_when_ready) : task_(nullptr), - state_cond_(PTHREAD_COND_INITIALIZER), - state_mutex_(PTHREAD_MUTEX_INITIALIZER), state_(State::ThreadStartup), counter_to_decrement_when_ready_(counter_to_decrement_when_ready) { + pthread_cond_init(&state_cond_, nullptr); + pthread_mutex_init(&state_mutex_, nullptr); pthread_create(&thread_, nullptr, ThreadFunc, this); } ~Worker() { ChangeState(State::ExitAsSoonAsPossible); pthread_join(thread_, nullptr); + pthread_cond_destroy(&state_cond_); + pthread_mutex_destroy(&state_mutex_); } // Changes State; may be called from either the worker thread @@ -359,14 +377,13 @@ class WorkersPool { void Execute(const std::vector& tasks) { assert(tasks.size() >= 1); // One of the tasks will be run on the current thread. 
- int workers_count = tasks.size() - 1; + std::size_t workers_count = tasks.size() - 1; CreateWorkers(workers_count); assert(workers_count <= workers_.size()); counter_to_decrement_when_ready_.Reset(workers_count); int n = 0; - std::for_each(tasks.begin(), --tasks.end(), [this, &n](Task *task) { - workers_[n++]->StartWork(task); - }); + std::for_each(tasks.begin(), --tasks.end(), + [this, &n](Task* task) { workers_[n++]->StartWork(task); }); // Execute the remaining workload immediately on the current thread. Task* task = tasks.back(); task->local_allocator = &main_thread_task_allocator_; @@ -375,9 +392,7 @@ class WorkersPool { counter_to_decrement_when_ready_.Wait(); // Cleanup tasks (best to do this from the same thread that allocated // the memory). - std::for_each(tasks.begin(), tasks.end(), [](Task *task) { - delete task; - }); + std::for_each(tasks.begin(), tasks.end(), [](Task* task) { delete task; }); } private: @@ -421,18 +436,18 @@ class WorkersPool { template + typename OutputPipelineType, typename GemmContextType> struct GemmWithPackedRhsTask : Task { typedef PackedSideBlock PackedLhs; typedef PackedSideBlock PackedRhs; - GemmWithPackedRhsTask(GemmContextType* _context, - const KernelBase& _kernel, + GemmWithPackedRhsTask(GemmContextType* _context, const KernelBase& _kernel, const MatrixMap& _lhs, const PackedRhs& _packed_rhs, MatrixMap* _result, const MatrixBlockBounds& _result_block, const LhsOffset& _lhs_offset, const RhsOffset& _rhs_offset, + const BlockParams& _block_params, const OutputPipelineType& _output_pipeline) : context(_context), kernel(_kernel), @@ -442,6 +457,7 @@ struct GemmWithPackedRhsTask : Task { result_block(_result_block), lhs_offset(_lhs_offset), rhs_offset(_rhs_offset), + block_params(_block_params), output_pipeline(_output_pipeline) {} void Run() override { @@ -451,12 +467,6 @@ struct GemmWithPackedRhsTask : Task { const int cols = result_block.cols; const int depth = lhs.cols(); - BlockParams block_params; - block_params.Init(rows, cols, depth, 1, - context->l1_bytes_to_use(), - context->l2_bytes_to_use(), - context->l2_rhs_factor()); - PackedLhs packed_lhs(Side::Lhs, local_allocator, block_params); PackedResult packed_result(local_allocator, block_params); @@ -495,6 +505,7 @@ struct GemmWithPackedRhsTask : Task { const MatrixBlockBounds result_block; const LhsOffset& lhs_offset; const RhsOffset& rhs_offset; + const BlockParams& block_params; const OutputPipelineType& output_pipeline; }; @@ -552,19 +563,7 @@ inline int HowManyThreads(int max_num_threads, int rows, int cols, int depth) { } // Determine the maximum number of threads. - int max_count = max_num_threads; - // The special value 0 means try to determine the total number of cores. - if (max_count == 0) { - // No user-set maximum number of threads, so we need to - // do some hardware detection. - // This is expensive to query so we do it only once. - // Too bad for dynamicness. Also, we dont use the c++11 standard getter - // because Google's coding style currently bans #include . - static const int hardware_threads_count = - static_cast(sysconf(_SC_NPROCESSORS_CONF)); - - max_count = hardware_threads_count; - } + int max_count = GetHardwareConcurrency(max_num_threads); // Basic calculation: take into account max pool size, and // how many rows we have to feed our kernel. 
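HowManyThreads now delegates hardware detection to GetHardwareConcurrency() from the new internal/platform.h instead of calling sysconf() inline, which is what makes the thread-count logic portable to Windows. A minimal illustration of the contract, grounded in the platform.h definitions added below (the printf calls are just for the demo):

#include <cstdio>

#include "internal/platform.h"

int main() {
  // 0 means "detect and use the number of hardware threads"; any non-zero
  // user-provided maximum is passed through unchanged.
  std::printf("auto-detected threads: %d\n", gemmlowp::GetHardwareConcurrency(0));
  std::printf("capped at 2 threads:   %d\n", gemmlowp::GetHardwareConcurrency(2));
  return 0;
}
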
@@ -654,10 +653,9 @@ void MultiThreadGemm(GemmContextType* context, const KernelBase& kernel, auto* workers_pool = context->workers_pool(); BlockParams block_params; - block_params.Init(rows, cols, depth, task_count, - context->l1_bytes_to_use(), - context->l2_bytes_to_use(), - context->l2_rhs_factor()); + block_params.Init( + rows, cols, depth, task_count, context->l1_bytes_to_use(), + context->l2_bytes_to_use(), context->l2_rhs_factor()); PackedSideBlock packed_rhs(Side::Rhs, allocator, block_params); @@ -675,19 +673,20 @@ void MultiThreadGemm(GemmContextType* context, const KernelBase& kernel, int next_start_row = 0; for (int n = 0; n < task_count; ++n) { int start_row = next_start_row; - next_start_row = std::min(rows, RoundUp( - rows * (n + 1) / task_count)); + next_start_row = std::min( + rows, RoundUp(rows * (n + 1) / task_count)); int block_rows = next_start_row - start_row; auto lhs_block = lhs.block(start_row, 0, block_rows, depth); - typedef GemmWithPackedRhsTask< - KernelFormat, InputScalar, OutputScalar, BitDepthParams, LhsOrder, - RhsOrder, ResultOrder, LhsOffset, RhsOffset, OutputPipelineType, - GemmContextType> + typedef GemmWithPackedRhsTask TaskType; - tasks.push_back(new TaskType(context, kernel, lhs_block, packed_rhs, result, - MatrixBlockBounds(start_row, c, block_rows, cs), - lhs_offset, rhs_offset, output_pipeline)); + tasks.push_back( + new TaskType(context, kernel, lhs_block, packed_rhs, result, + MatrixBlockBounds(start_row, c, block_rows, cs), + lhs_offset, rhs_offset, block_params, output_pipeline)); } // Execute the work on the workers (and partially on this thread). workers_pool->Execute(tasks); diff --git a/internal/platform.h b/internal/platform.h new file mode 100755 index 0000000..49e41a9 --- /dev/null +++ b/internal/platform.h @@ -0,0 +1,112 @@ +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// internal/platform.h: a place to put platform specific code + +#ifndef GEMMLOWP_INTERNAL_PLATFORM_H_ +#define GEMMLOWP_INTERNAL_PLATFORM_H_ + + +#ifdef _WIN32 +#include +#else +#include +#include +#include +#endif +#include + +#if defined ANDROID || defined __ANDROID__ +#include +// The 18 here should be 16, but has to be 18 for now due +// to a Google-internal issue. 
+#if __ANDROID_API__ < 18 +#define GEMMLOWP_USE_MEMALIGN +#endif +// posix_memalign is missing on some 4.1 x86 devices +#if __ANDROID_API__ == 18 +#ifdef GEMMLOWP_X86_32 +#define GEMMLOWP_USE_MEMALIGN +#endif +#endif +#endif + + +namespace gemmlowp { + +#ifdef _WIN32 +inline void *aligned_alloc(size_t alignment, size_t size) { + return _aligned_malloc(size, alignment); +} + +inline void aligned_free(void *memptr) { + _aligned_free(memptr); +} + +inline int GetHardwareConcurrency(int max_threads) { + if (max_threads == 0) { + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + return sysinfo.dwNumberOfProcessors; + } + return max_threads; +} + +inline double real_time_in_seconds() { + __int64 wintime; GetSystemTimeAsFileTime((FILETIME*)&wintime); + wintime -= 116444736000000000i64; //1jan1601 to 1jan1970 + return wintime / 10000000i64 + wintime % 10000000i64 * 100 * 1e-9; +} + +#else +inline void *aligned_alloc(size_t alignment, size_t size) { +#ifdef GEMMLOWP_USE_MEMALIGN + return memalign(alignment, size); +#else + void *memptr; + if (posix_memalign(&memptr, alignment, size)) { + memptr = nullptr; + } + return memptr; +#endif +} + +inline int GetHardwareConcurrency(int max_threads) { + if (max_threads == 0) { + static const int hardware_threads_count = + static_cast(sysconf(_SC_NPROCESSORS_CONF)); + return hardware_threads_count; + } + return max_threads; +} + +inline void aligned_free(void *memptr) { + free(memptr); +} + +inline double real_time_in_seconds() { +#ifdef __APPLE__ + timeval t; + gettimeofday(&t, nullptr); + return t.tv_sec + 1e-6 * t.tv_usec; +#else + timespec t; + clock_gettime(CLOCK_REALTIME, &t); + return t.tv_sec + 1e-9 * t.tv_nsec; +#endif +} + +#endif +} // namespace gemmlowp +#endif // GEMMLOWP_INTERNAL_PLATFORM_H_ diff --git a/meta/multi_thread_common.h b/meta/multi_thread_common.h index 0b35759..dc1b799 100644 --- a/meta/multi_thread_common.h +++ b/meta/multi_thread_common.h @@ -20,15 +20,6 @@ namespace gemmlowp { namespace meta { -inline int ResolveMaxThreads(int max_threads) { - if (max_threads == 0) { - static const int hardware_threads_count = - static_cast(sysconf(_SC_NPROCESSORS_CONF)); - return hardware_threads_count; - } - return max_threads; -} - template class SimpleContext { public: diff --git a/meta/multi_thread_gemm.h b/meta/multi_thread_gemm.h index bc569e8..f69f810 100644 --- a/meta/multi_thread_gemm.h +++ b/meta/multi_thread_gemm.h @@ -132,9 +132,9 @@ inline void MultiThreadGemm(MultiThreadingContext* context, auto workers_pool = context->workers_pool(); std::vector tasks; - std::for_each(task_params.begin(), task_params.end(), [tasks](Params* param) { - tasks.push_back(new TaskRunnerType(param)); - }); + for (auto& task_param : task_params) { + tasks.push_back(new TaskRunnerType(task_param)); + }; workers_pool->Execute(tasks); } diff --git a/meta/multi_thread_gemv.h b/meta/multi_thread_gemv.h deleted file mode 100644 index 2e25ea8..0000000 --- a/meta/multi_thread_gemv.h +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright 2015 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
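The new platform.h wraps the primitives that previously assumed POSIX: aligned allocation (posix_memalign, memalign on old Android, _aligned_malloc on Windows), hardware-thread detection, and a wall-clock timer. A short, illustrative usage sketch of those wrappers as declared above:

#include <cstdio>

#include "internal/platform.h"

int main() {
  const double t0 = gemmlowp::real_time_in_seconds();
  // 64-byte alignment, 1 KiB buffer; must be released with aligned_free(),
  // since the underlying allocator differs per platform.
  void* buffer = gemmlowp::aligned_alloc(64, 1024);
  const double t1 = gemmlowp::real_time_in_seconds();
  std::printf("allocated %p in %.9f s\n", buffer, t1 - t0);
  gemmlowp::aligned_free(buffer);
  return 0;
}
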
-// See the License for the specific language governing permissions and -// limitations under the License. - -// multi_thread_gemv.h: Entry point to the multithreaded version of the -// generated (meta) gemv library. - -#ifndef GEMMLOWP_META_MULTI_THREAD_GEMV_H_ -#define GEMMLOWP_META_MULTI_THREAD_GEMV_H_ - -#ifdef GEMMLOWP_NEON_32 - -#include "multi_thread_common.h" -#include "operations_common.h" -#include "single_thread_gemm.h" - -namespace gemmlowp { -namespace meta { -namespace internal { - -class GemvQuantized8BitOperation : public Quantized8BitOperation { - public: - GemvQuantized8BitOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, - std::int32_t sum_offset, std::int32_t multiplier, - std::int32_t shift) - : Quantized8BitOperation(lhs_offset, rhs_offset, sum_offset, multiplier, - shift) {} - - void ExecuteMatrixMatrix(std::uint8_t* scratch, const std::uint8_t* lhs, - const std::uint8_t* rhs, std::int32_t m, - std::int32_t n, std::int32_t k, std::uint8_t* result, - std::int32_t result_stride) const { - gemv_q8(scratch, lhs, rhs, n, k, lhs_offset, rhs_offset, sum_offset, - multiplier, shift, result); - } - - static std::int32_t ScratchPerThread(std::int32_t m, std::int32_t n, - std::int32_t k) { - return 128 * 1024; - } -}; - -class GemvFloatOperation : public FloatOperation { - public: - GemvFloatOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, - float result_offset) - : FloatOperation(lhs_offset, rhs_offset, result_offset) {} - - void ExecuteMatrixMatrix(std::uint8_t* scratch, const std::uint8_t* lhs, - const std::uint8_t* rhs, std::int32_t m, - std::int32_t n, std::int32_t k, float* result, - std::int32_t result_stride) const { - gemv_f(scratch, lhs, rhs, n, k, lhs_offset, rhs_offset, result_offset, - result); - } - - static std::int32_t ScratchPerThread(std::int32_t m, std::int32_t n, - std::int32_t k) { - return 128 * 1024; - } -}; - -class GemvInt32Operation : public Int32Operation { - public: - GemvInt32Operation(std::int32_t lhs_offset, std::int32_t rhs_offset) - : Int32Operation(lhs_offset, rhs_offset) {} - - void ExecuteMatrixMatrix(std::uint8_t* scratch, const std::uint8_t* lhs, - const std::uint8_t* rhs, std::int32_t m, - std::int32_t n, std::int32_t k, std::int32_t* result, - std::int32_t result_stride) const { - gemv_i32(scratch, lhs, rhs, n, k, lhs_offset, rhs_offset, result); - } - - static std::int32_t ScratchPerThread(std::int32_t m, std::int32_t n, - std::int32_t k) { - return 128 * 1024; - } -}; - -} // namespace internal - -std::int32_t gemv_q8_scratch(std::int32_t m, std::int32_t n, std::int32_t k, - std::int32_t max_threads) { - return internal::ResolveMaxThreads(max_threads) * - internal::GemvQuantized8BitOperation::ScratchPerThread(m, n, k); -} - -void multi_thread_gemv_q8(gemmlowp::WorkersPool* pool, std::int32_t max_threads, - std::uint8_t* scratch, const std::uint8_t* lhs, - const std::uint8_t* rhs, std::int32_t n, - std::int32_t k, std::int32_t lhs_offset, - std::int32_t rhs_offset, std::int32_t sum_offset, - std::int32_t multiplier, std::int32_t shift, - std::uint8_t* result) { - max_threads = internal::ResolveMaxThreads(max_threads); - internal::GemvQuantized8BitOperation operation(lhs_offset, rhs_offset, - sum_offset, multiplier, shift); - if (max_threads == 1) { - operation.ExecuteMatrixMatrix(scratch, lhs, rhs, 1, n, k, result, n); - } else { - internal::MultiThreadedMatrixMatrix(pool, max_threads, scratch, lhs, rhs, 1, - n, k, result, n, operation); - } -} - -std::int32_t gemv_f_scratch(std::int32_t m, std::int32_t n, 
std::int32_t k, - std::int32_t max_threads) { - return internal::ResolveMaxThreads(max_threads) * - internal::GemvFloatOperation::ScratchPerThread(m, n, k); -} - -void multi_thread_gemv_f(gemmlowp::WorkersPool* pool, std::int32_t max_threads, - std::uint8_t* scratch, const std::uint8_t* lhs, - const std::uint8_t* rhs, std::int32_t n, - std::int32_t k, std::int32_t lhs_offset, - std::int32_t rhs_offset, float result_offset, - float* result) { - max_threads = internal::ResolveMaxThreads(max_threads); - internal::GemvFloatOperation operation(lhs_offset, rhs_offset, result_offset); - if (max_threads == 1) { - operation.ExecuteMatrixMatrix(scratch, lhs, rhs, 1, n, k, result, n); - } else { - internal::MultiThreadedMatrixMatrix(pool, max_threads, scratch, lhs, rhs, 1, - n, k, result, n, operation); - } -} - -std::int32_t gemv_i32_scratch(std::int32_t m, std::int32_t n, std::int32_t k, - std::int32_t max_threads) { - return internal::ResolveMaxThreads(max_threads) * - internal::GemvInt32Operation::ScratchPerThread(m, n, k); -} - -void multi_thread_gemv_i32(gemmlowp::WorkersPool* pool, - std::int32_t max_threads, std::uint8_t* scratch, - const std::uint8_t* lhs, const std::uint8_t* rhs, - std::int32_t n, std::int32_t k, - std::int32_t lhs_offset, std::int32_t rhs_offset, - std::int32_t* result) { - max_threads = internal::ResolveMaxThreads(max_threads); - internal::GemvInt32Operation operation(lhs_offset, rhs_offset); - if (max_threads == 1) { - operation.ExecuteMatrixMatrix(scratch, lhs, rhs, 1, n, k, result, n); - } else { - internal::MultiThreadedMatrixMatrix(pool, max_threads, scratch, lhs, rhs, 1, - n, k, result, n, operation); - } -} - -} // namespace meta -} // namespace gemmlowp - -#else -#warning "Meta gemm fast-path requires GEMMLOWP_NEON_32!" -#endif - -#endif // GEMMLOWP_META_MULTI_THREAD_GEMV_H_ diff --git a/meta/multi_thread_transform.h b/meta/multi_thread_transform.h index d21aec1..3bd4b3e 100644 --- a/meta/multi_thread_transform.h +++ b/meta/multi_thread_transform.h @@ -86,9 +86,9 @@ inline void MultiThreadTransform1D(MultiThreadingContext* context, auto workers_pool = context->workers_pool(); std::vector tasks; - std::for_each(task_params.begin(), task_params.end(), [tasks](Params* param) { - tasks.push_back(new TaskRunnerType(param)); - }); + for (auto& task_param : task_params) { + tasks.push_back(new TaskRunnerType(task_param)); + } workers_pool->Execute(tasks); } diff --git a/meta/operations_common.h b/meta/operations_common.h deleted file mode 100644 index d80375a..0000000 --- a/meta/operations_common.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright 2015 Google Inc. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifndef GEMMLOWP_META_OPERATIONS_COMMON_H_ -#define GEMMLOWP_META_OPERATIONS_COMMON_H_ - -class Quantized8BitOperation { - public: - Quantized8BitOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, - std::int32_t sum_offset, std::int32_t multiplier, - std::int32_t shift) - : lhs_offset(lhs_offset), - rhs_offset(rhs_offset), - sum_offset(sum_offset), - multiplier(multiplier), - shift(shift) {} - - protected: - std::int32_t lhs_offset; - std::int32_t rhs_offset; - std::int32_t sum_offset; - std::int32_t multiplier; - std::int32_t shift; -}; - -class FloatOperation { - public: - FloatOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, - float result_offset) - : lhs_offset(lhs_offset), - rhs_offset(rhs_offset), - result_offset(result_offset) {} - - protected: - std::int32_t lhs_offset; - std::int32_t rhs_offset; - float result_offset; -}; - -class Int32Operation { - public: - Int32Operation(std::int32_t lhs_offset, std::int32_t rhs_offset) - : lhs_offset(lhs_offset), rhs_offset(rhs_offset) {} - - protected: - std::int32_t lhs_offset; - std::int32_t rhs_offset; -}; - -#endif // GEMMLOWP_META_OPERATIONS_COMMON_H_ diff --git a/profiling/instrumentation.h b/profiling/instrumentation.h index 51b6525..539076a 100644 --- a/profiling/instrumentation.h +++ b/profiling/instrumentation.h @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -52,15 +52,6 @@ using ::uintptr_t; #include #endif -// We should always use C++11 thread_local; unfortunately that -// isn't fully supported on Apple yet. -#ifdef __APPLE__ -#define GEMMLOWP_THREAD_LOCAL static __thread -#define GEMMLOWP_USING_OLD_THREAD_LOCAL -#else -#define GEMMLOWP_THREAD_LOCAL thread_local -#endif - namespace gemmlowp { inline void ReleaseBuildAssertion(bool condition, const char* msg) { @@ -70,41 +61,42 @@ inline void ReleaseBuildAssertion(bool condition, const char* msg) { } } -// To be used as template parameter for GlobalLock. -// GlobalLock is the profiler global lock: -// registering threads, starting profiling, finishing profiling, and -// the profiler itself as it samples threads, all need to lock it. -struct ProfilerLockId; - -// A very plain global lock. Templated in LockId so we can have multiple -// locks, one for each LockId type. 
-template -class GlobalLock { - static pthread_mutex_t* Mutex() { - static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; +class Mutex { + public: + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; + + Mutex() { pthread_mutex_init(&m, NULL); } + ~Mutex() { pthread_mutex_destroy(&m); } + + void Lock() { pthread_mutex_lock(&m); } + void Unlock() { pthread_mutex_unlock(&m); } + + private: + pthread_mutex_t m; +}; + +class GlobalMutexes { + public: + static Mutex* Profiler() { + static Mutex m; return &m; } - public: - static void Lock() { pthread_mutex_lock(Mutex()); } - static void Unlock() { pthread_mutex_unlock(Mutex()); } + static Mutex* EightBitIntGemm() { + static Mutex m; + return &m; + } }; -// A very simple RAII helper to lock and unlock a GlobalLock -template -struct AutoGlobalLock { - AutoGlobalLock() { GlobalLock::Lock(); } - ~AutoGlobalLock() { GlobalLock::Unlock(); } -}; +// A very simple RAII helper to lock and unlock a Mutex +struct ScopedLock { + ScopedLock(Mutex* m) : _m(m) { _m->Lock(); } + ~ScopedLock() { _m->Unlock(); } -// MemoryBarrier is purely a compile-time thing; it tells two things -// to the compiler: -// 1) It prevents reordering code across it -// (thanks to the 'volatile' after 'asm') -// 2) It requires the compiler to assume that any value previously -// read from memory, may have changed. Thus it offers an alternative -// to using 'volatile' variables. -inline void MemoryBarrier() { asm volatile("" ::: "memory"); } + private: + Mutex* _m; +}; // Profiling definitions. Two paths: when profiling is enabled, // and when profiling is disabled. @@ -115,34 +107,31 @@ inline void MemoryBarrier() { asm volatile("" ::: "memory"); } // contains pointers to literal strings that were manually entered // in the instrumented code (see ScopedProfilingLabel). struct ProfilingStack { - static const std::size_t kMaxSize = 15; + static const std::size_t kMaxSize = 14; typedef const char* LabelsArrayType[kMaxSize]; LabelsArrayType labels; std::size_t size; + Mutex* lock; ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); } void Push(const char* label) { - MemoryBarrier(); + ScopedLock sl(lock); ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow"); labels[size] = label; - MemoryBarrier(); size++; - MemoryBarrier(); } void Pop() { - MemoryBarrier(); + ScopedLock sl(lock); ReleaseBuildAssertion(size > 0, "ProfilingStack underflow"); size--; - MemoryBarrier(); } void UpdateTop(const char* new_label) { - MemoryBarrier(); + ScopedLock sl(lock); assert(size); labels[size - 1] = new_label; - MemoryBarrier(); } ProfilingStack& operator=(const ProfilingStack& other) { @@ -174,29 +163,35 @@ struct ThreadInfo { ThreadInfo() { pthread_key_create(&key, ThreadExitCallback); pthread_setspecific(key, this); + stack.lock = new Mutex(); } static void ThreadExitCallback(void* ptr) { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::Profiler()); ThreadInfo* self = static_cast(ptr); ThreadsUnderProfiling().erase(self); pthread_key_delete(self->key); + delete self->stack.lock; } }; inline ThreadInfo& ThreadLocalThreadInfo() { -#ifdef GEMMLOWP_USING_OLD_THREAD_LOCAL - // We're leaking this ThreadInfo structure, because Apple doesn't support - // non-trivial constructors or destructors for their __thread type modifier. 
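The templated GlobalLock/AutoGlobalLock pair is replaced by a plain Mutex wrapper, per-purpose global mutexes, and an RAII ScopedLock, which also removes the dependence on PTHREAD_MUTEX_INITIALIZER (unavailable once the pthread types become pointers under the polyfill introduced later in this patch). Typical use, as it appears throughout the rewritten profiling code:

#include "profiling/instrumentation.h"

// Code touching profiler-global state now takes the profiler mutex via RAII;
// the mutex is released when 'lock' goes out of scope.
void TouchProfilerState() {
  gemmlowp::ScopedLock lock(gemmlowp::GlobalMutexes::Profiler());
  // ... read or modify the set of threads under profiling, etc. ...
}
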
- GEMMLOWP_THREAD_LOCAL ThreadInfo* i = nullptr; - if (i == nullptr) { - i = new ThreadInfo(); + static pthread_key_t key; + static auto DeleteThreadInfo = [](void* threadInfoPtr) { + ThreadInfo* threadInfo = static_cast(threadInfoPtr); + if (threadInfo) { + delete threadInfo; + } + }; + + static int key_result = pthread_key_create(&key, DeleteThreadInfo); + + ThreadInfo* threadInfo = static_cast(pthread_getspecific(key)); + if (!threadInfo) { + threadInfo = new ThreadInfo(); + pthread_setspecific(key, threadInfo); } - return *i; -#else - GEMMLOWP_THREAD_LOCAL ThreadInfo i; - return i; -#endif + return *threadInfo; } // ScopedProfilingLabel is how one instruments code for profiling @@ -221,7 +216,7 @@ class ScopedProfilingLabel { // To be called once on each thread to be profiled. inline void RegisterCurrentThreadForProfiling() { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::Profiler()); ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo()); } diff --git a/profiling/profiler.h b/profiling/profiler.h index a18c036..018da57 100644 --- a/profiling/profiler.h +++ b/profiling/profiler.h @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -306,12 +306,12 @@ inline pthread_t& ProfilerThread() { // In the end, the key atomicity property that we are relying on // here is that pointers are changed atomically, and the labels // are pointers (to literal strings). -inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) { +inline void RecordStack(ThreadInfo* thread, ProfilingStack* dst) { + ScopedLock sl(thread->stack.lock); assert(!dst->size); while (dst->size < thread->stack.size) { dst->labels[dst->size] = thread->stack.labels[dst->size]; dst->size++; - MemoryBarrier(); // thread->stack can change at any time } } @@ -330,7 +330,7 @@ inline void* ProfilerThreadFunc(void*) { while (!ProfilerThreadShouldFinish()) { WaitOneProfilerTick(); { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::Profiler()); for (auto t : ThreadsUnderProfiling()) { ProfilingStack s; RecordStack(t, &s); @@ -347,7 +347,7 @@ inline void* ProfilerThreadFunc(void*) { // Starts recording samples. inline void StartProfiling() { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::Profiler()); ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!"); IsProfiling() = true; ProfilerThreadShouldFinish() = false; @@ -357,7 +357,7 @@ inline void StartProfiling() { // Stops recording samples, and prints a profile tree-view on stdout. inline void FinishProfiling() { { - AutoGlobalLock lock; + ScopedLock sl(GlobalMutexes::Profiler()); ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!"); // The ProfilerThreadShouldFinish() mechanism here is really naive and bad, // as the scary comments below should make clear. diff --git a/profiling/pthread_everywhere.h b/profiling/pthread_everywhere.h new file mode 100644 index 0000000..7e12d66 --- /dev/null +++ b/profiling/pthread_everywhere.h @@ -0,0 +1,88 @@ +// Copyright 2017 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
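With the synchronization now handled by per-thread mutexes rather than compiler barriers, the profiler's public entry points are unchanged. For reference, the intended usage pattern (these are gemmlowp's existing profiler APIs; building with GEMMLOWP_PROFILING defined is assumed):

#define GEMMLOWP_PROFILING  // enable the sampling profiler before any include
#include "profiling/profiler.h"

static void ProfiledWork() {
  gemmlowp::ScopedProfilingLabel label("ProfiledWork");
  // ... the code whose time should be attributed to this label ...
}

int main() {
  gemmlowp::RegisterCurrentThreadForProfiling();
  gemmlowp::StartProfiling();
  ProfiledWork();
  gemmlowp::FinishProfiling();  // prints the tree-view profile on stdout
  return 0;
}
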
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// pthread_everywhere.h: Either includes or implements a +// subset of pthread functionality on top of C++11 for portability. + +#ifndef GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ +#define GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ + +#include "pthread_everywhere.h" + +#ifndef _WIN32 +#define GEMMLOWP_USE_PTHREAD +#endif + +#if defined GEMMLOWP_USE_PTHREAD +#include +#else +// Implement a small subset of pthread on top of C++11 threads. +// The function signatures differ from true pthread functions in two ways: +// - True pthread functions return int error codes, ours return void. +// Rationale: the c++11 equivalent functions return void +// and use exceptions to report errors; we don't want to deal with +// exceptions in this code, so we couldn't meaningfully return errors +// in the polyfill. Also, the gemmlowp code using these pthread functions +// never checks their return values anyway. +// - True pthread *_create/*_init functions take pointers to 'attribute' +// structs; ours take nullptr_t. That is because gemmlowp always passes +// nullptr at the moment, so any support we would code for non-null +// attribs would be unused. +#include +#include +#include +#include +namespace gemmlowp { +using pthread_t = std::thread*; +using pthread_mutex_t = std::mutex*; +using pthread_cond_t = std::condition_variable*; +inline void pthread_create(pthread_t* thread, std::nullptr_t, + void *(*start_routine) (void *), void *arg) { + *thread = new std::thread(start_routine, arg); +} +inline void pthread_join(pthread_t thread, std::nullptr_t) { + thread->join(); +} +inline void pthread_mutex_init(pthread_mutex_t *mutex, std::nullptr_t) { + *mutex = new std::mutex; +} +inline void pthread_mutex_lock(pthread_mutex_t* mutex) { + (*mutex)->lock(); +} +inline void pthread_mutex_unlock(pthread_mutex_t* mutex) { + (*mutex)->unlock(); +} +inline void pthread_mutex_destroy(pthread_mutex_t *mutex) { + delete *mutex; +} +inline void pthread_cond_init(pthread_cond_t *cond, std::nullptr_t) { + *cond = new std::condition_variable; +} +inline void pthread_cond_signal(pthread_cond_t* cond) { + (*cond)->notify_one(); +} +inline void pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { + std::unique_lock lock(**mutex, std::adopt_lock); + (*cond)->wait(lock); + // detach lock from mutex so when we leave this conext + // the lock is not released + lock.release(); +} +inline void pthread_cond_destroy(pthread_cond_t *cond) { + delete *cond; +} +} // end namespace gemmlowp +#endif + +#endif // GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ \ No newline at end of file diff --git a/test/benchmark.cc b/test/benchmark.cc index 20dd369..9a87a41 100644 --- a/test/benchmark.cc +++ b/test/benchmark.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
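pthread_everywhere.h lets the rest of gemmlowp keep calling a small pthread-style API on every platform; on Windows the pthread types become pointers to C++11 objects, which is why attribute arguments must be nullptr and return codes are no longer checked. A sketch of code written once against that subset (it resolves to real pthreads on POSIX and to the C++11-backed polyfill elsewhere):

#include "profiling/pthread_everywhere.h"

namespace gemmlowp {

inline void* SayHello(void*) { return nullptr; }

// Unqualified pthread_* names resolve to the system pthreads when
// GEMMLOWP_USE_PTHREAD is set, and to the polyfill in this header otherwise.
inline void SpawnAndJoin() {
  pthread_t thread;
  pthread_create(&thread, nullptr, SayHello, nullptr);
  pthread_join(thread, nullptr);
}

}  // namespace gemmlowp
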
-#include #ifdef __APPLE__ #include #endif @@ -44,18 +43,6 @@ namespace gemmlowp { -double time() { -#ifdef __APPLE__ - timeval t; - gettimeofday(&t, nullptr); - return t.tv_sec + 1e-6 * t.tv_usec; -#else - timespec t; - clock_gettime(CLOCK_REALTIME, &t); - return t.tv_sec + 1e-9 * t.tv_nsec; -#endif -} - const double min_accurate_duration = 1e-1; const std::size_t min_working_set_size = 16 * 1024 * 1024; @@ -111,10 +98,10 @@ double time_for_gemms(GemmContext* context, const std::vector& gemms) { std::size_t pool_index = 0; while (true) { - double starttime = time(); + double starttime = real_time_in_seconds(); for (int i = 0; i < iters_at_a_time; i++) { for (size_t j = 0; j < gemms.size(); j++) { - int k = pool_index * gemms.size() + j; + size_t k = pool_index * gemms.size() + j; Gemm( context, lhs[k].const_map(), rhs[k].const_map(), &result[k].map(), -75, -91, 74980, 123, 20); @@ -124,7 +111,7 @@ double time_for_gemms(GemmContext* context, const std::vector& gemms) { pool_index = 0; } } - double endtime = time(); + double endtime = real_time_in_seconds(); const float timing = static_cast(endtime - starttime); @@ -228,8 +215,8 @@ void benchmark_gemm_sizes(GemmContext* context, gemmlowp::StartProfiling(); #endif - double starttime = time(); - while (time() < starttime + mintime) { + double starttime = real_time_in_seconds(); + while (real_time_in_seconds() < starttime + mintime) { gemm_times.push_back( time_for_gemms(context, gemms)); } diff --git a/test/benchmark_all_sizes.cc b/test/benchmark_all_sizes.cc new file mode 100644 index 0000000..16cc57c --- /dev/null +++ b/test/benchmark_all_sizes.cc @@ -0,0 +1,357 @@ +// Example command line to build on Android ARM64: +/* +~/android/toolchains/r15c-aarch64/bin/aarch64-linux-android-clang++ \ +test/benchmark_all_sizes.cc -o /tmp/b -O3 --std=c++11 -fPIE -static \ +-DBENCHMARK_QUICK -DBENCHMARK_8bit +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../public/gemmlowp.h" + +#if defined GEMMLOWP_ANDROID && defined GEMMLOWP_ARM_32 +// Compilation workaround +namespace std { + using ::round; +} +#endif + +// Minimum duration of each benchmark measurement. Also, duration +// of sleep time between each two consecutive benchmark measurements to +// prevent over-heating. +const double kBenchmarkSecs = 0.1; + +// Sleep time before each benchmark. +const int kCooldownBeforeBenchmarkSecs = 0; + +// Number of benchmark passes. +const int kPasses = 4; + +#ifdef BENCHMARK_NUM_THREADS +const int kNumThreads = BENCHMARK_NUM_THREADS; +#else +const int kNumThreads = 1; +#endif + +namespace gemmlowp { + +// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class, +// since it only maps existing data. In tests though, we need to +// create our own matrices. 
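Both benchmarks now share the same measurement idiom: repeat the operation, doubling the batch size each round, until the wall-clock time from real_time_in_seconds() exceeds a minimum duration, then report seconds per iteration. The helper below distills that pattern from benchmark_all_sizes.cc (TimePerOp and min_secs are names introduced here only for illustration; the benchmark itself hard-codes kBenchmarkSecs = 0.1):

#include <functional>

#include "internal/platform.h"

inline double TimePerOp(const std::function<void()>& op, double min_secs) {
  const double time_start = gemmlowp::real_time_in_seconds();
  double t = time_start;
  int iters = 0;
  int iters_at_a_time = 1;
  while (t - time_start < min_secs) {
    for (int i = 0; i < iters_at_a_time; i++) {
      op();
      iters++;
    }
    iters_at_a_time *= 2;  // amortizes timer overhead for very fast ops
    t = gemmlowp::real_time_in_seconds();
  }
  return (t - time_start) / iters;
}
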
+template +class Matrix : public MatrixMap { + public: + typedef MatrixMap Map; + typedef MatrixMap ConstMap; + typedef typename Map::Scalar Scalar; + static const MapOrder Order = tOrder; + using Map::cols_; + using Map::data_; + using Map::kOrder; + using Map::rows_; + using Map::stride_; + + public: + Matrix() : Map(nullptr, 0, 0, 0) {} + + Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); } + + Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; } + + Matrix& operator=(const Matrix& other) { + Resize(other.rows_, other.cols_); + std::memcpy(data_, other.data_, size() * sizeof(Scalar)); + return *this; + } + + friend bool operator==(const Matrix& a, const Matrix& b) { + return a.rows_ == b.rows_ && a.cols_ == b.cols_ && + !std::memcmp(a.data_, b.data_, a.size()); + } + + void Resize(int rows, int cols) { + rows_ = rows; + cols_ = cols; + stride_ = kOrder == MapOrder::ColMajor ? rows : cols; + storage.resize(size()); + data_ = storage.data(); + } + + int size() const { return rows_ * cols_; } + + Map& map() { return *static_cast(this); } + + ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); } + + protected: + std::vector storage; +}; + +template +void MakeZero(MatrixType* m) { + for (int c = 0; c < m->cols(); c++) { + for (int r = 0; r < m->rows(); r++) { + (*m)(r, c) = 128; + } + } +} + +} // end namespace gemmlowp + +template +float benchmark_8bit(int rows, int depth, int cols) { + using namespace gemmlowp; + typedef Matrix LhsType; + typedef Matrix RhsType; + typedef Matrix ResultType; + + LhsType lhs; + RhsType rhs; + ResultType result; + lhs.Resize(rows, depth); + rhs.Resize(depth, cols); + result.Resize(rows, cols); + MakeZero(&lhs); + MakeZero(&rhs); + MakeZero(&result); + + typedef std::tuple + Pipeline; + gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint + quantize_down_stage; + quantize_down_stage.result_offset_after_shift = 128; + quantize_down_stage.result_fixedpoint_multiplier = 1234567890; + quantize_down_stage.result_shift = 16; + gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; + const auto output_pipeline = + std::make_tuple(quantize_down_stage, saturating_cast_stage); + GemmContext gemm_context; + gemm_context.set_max_num_threads(kNumThreads); + gemmlowp::GemmWithOutputPipeline( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, output_pipeline); + + double time_start = real_time_in_seconds(); + double t = time_start; + int iters = 0; + int iters_at_a_time = 1; + while (t - time_start < kBenchmarkSecs) { + for (int i = 0; i < iters_at_a_time; i++) { + gemmlowp::GemmWithOutputPipeline( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, output_pipeline); + iters++; + } + iters_at_a_time *= 2; + t = real_time_in_seconds(); + } + return (t - time_start) / iters; +} + +template +float benchmark_8bit_to_32bit(int rows, int depth, int cols) { + using namespace gemmlowp; + typedef Matrix LhsType; + typedef Matrix RhsType; + typedef Matrix ResultType; + + LhsType lhs; + RhsType rhs; + ResultType result; + lhs.Resize(rows, depth); + rhs.Resize(depth, cols); + result.Resize(rows, cols); + MakeZero(&lhs); + MakeZero(&rhs); + MakeZero(&result); + + typedef std::tuple<> EmptyPipeline; + GemmContext gemm_context; + gemm_context.set_max_num_threads(kNumThreads); + gemmlowp::GemmWithOutputPipeline( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, EmptyPipeline()); + + double time_start = 
real_time_in_seconds(); + double t = time_start; + int iters = 0; + int iters_at_a_time = 1; + while (t - time_start < kBenchmarkSecs) { + for (int i = 0; i < iters_at_a_time; i++) { + gemmlowp::GemmWithOutputPipeline( + &gemm_context, lhs.const_map(), rhs.const_map(), &result.map(), -128, + -128, EmptyPipeline()); + iters++; + } + iters_at_a_time *= 2; + t = real_time_in_seconds(); + } + return (t - time_start) / iters; +} + +struct Shape { + int rows; + int depth; + int cols; +}; + +bool operator==(const Shape& s1, const Shape& s2) { + return s1.rows == s2.rows && s1.depth == s2.depth && s1.cols == s2.cols; +} + +bool operator<(const Shape& shape1, const Shape& shape2) { + return shape1.depth < shape2.depth || + (shape1.depth == shape2.depth && + (shape1.rows < shape2.rows || + (shape1.rows == shape2.rows && shape1.cols < shape2.cols))); +}; + +#ifdef _WIN32 +#define sleep(t) Sleep(t) +#endif + +float benchmark(const Shape& shape) { + if (kCooldownBeforeBenchmarkSecs) { + sleep(kCooldownBeforeBenchmarkSecs); + } +#if defined BENCHMARK_8bit + // Benchmark the fast 8bit path, using L8R8WithLhsNonzeroBitDepthParams. + // This is the recommended thing to default to: it's what most applications + // want to use, as it's the fastest. + // The contract is that LHS must take values in [1, 255], while RHS can take + // any value in [0, 255]. + return benchmark_8bit( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_8bit_wide + // Variant benchmarking the slower (mostly legacy) DefaultL8R8BitDepthParams. + // The only contract difference is that both LHS and RHS can take values in + // [0, 255]. + return benchmark_8bit( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_8bit_to_32bit + // Variant of BENCHMARK_8bit where the user asks for getting raw int32 + // accumulators, instead of a 8bit-downscaled result. + return benchmark_8bit_to_32bit( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_8bit_to_32bit_wide + // Variant of BENCHMARK_8bit_wide where the user asks for getting raw int32 + // accumulators, instead of a 8bit-downscaled result. + return benchmark_8bit_to_32bit( + shape.rows, shape.depth, shape.cols); +#elif defined BENCHMARK_float + return benchmark_float(shape.rows, shape.depth, shape.cols); +#else +#error What arithmetic path should we benchmark? (Suggestion: #define BENCHMARK_8bit) +#endif +} + +std::set all_sizes() { + std::set sizes; + for (int i = 1; i <= 2048; i *= 2) { + sizes.insert(i); + } + for (double x = 8; x <= 2048; x *= std::sqrt(2.)) { + sizes.insert(static_cast(std::round(x))); + } + for (double x = 16; x <= 512; x *= std::pow(2., 1. 
/ 4.)) { + sizes.insert(static_cast(std::round(x))); + } + return sizes; +} + +std::mt19937& RandomEngine() { + static std::mt19937 engine; + return engine; +} + +std::vector all_shapes_in_random_order() { + std::vector shapes; + const std::set sizes = all_sizes(); +#if defined BENCHMARK_ROWS + // Benchmark one specific shape + Shape shape; + shape.rows = BENCHMARK_ROWS; + shape.depth = BENCHMARK_DEPTH; + shape.cols = BENCHMARK_COLS; + shapes.push_back(shape); +#elif defined BENCHMARK_QUICK + // Benchmark an assortment of cubic shapes + for (int size : sizes) { + Shape shape; + shape.rows = size; + shape.depth = size; + shape.cols = size; + shapes.push_back(shape); + } +#elif defined BENCHMARK_EXHAUSTIVE + // Benchmark all sorts of shapes + for (int rows : sizes) { + for (int depth : sizes) { + for (int cols : sizes) { + Shape shape; + shape.rows = rows; + shape.depth = depth; + shape.cols = cols; + shapes.push_back(shape); + } + } + } +#else +#error What shapes should we benchmark? (Suggestion: #define BENCHMARK_QUICK) +#endif + std::shuffle(std::begin(shapes), std::end(shapes), RandomEngine()); + return shapes; +} + +void run_benchmarks(std::map* results) { + std::vector shapes; + for (int pass = 0; pass < kPasses; pass++) { + const std::vector pass_shapes = all_shapes_in_random_order(); + shapes.insert(std::end(shapes), std::begin(pass_shapes), + std::end(pass_shapes)); + } + + const double time_start = gemmlowp::real_time_in_seconds(); + for (std::size_t i = 0; i < shapes.size(); i++) { + const double ratio = static_cast(i) / shapes.size(); + const double elapsed = gemmlowp::real_time_in_seconds() - time_start; + const double elapsed_hours = elapsed / 3600.; + const double eta_hours = elapsed_hours * (1. - ratio) / ratio; + fprintf(stderr, + "Benchmarking: %.2f%% done, Elapsed: %.2f hours, ETA: %.2f " + "hours... \r", + 100. * ratio, elapsed_hours, eta_hours); + fflush(stderr); + const Shape& shape = shapes[i]; + float latency = benchmark(shape); + if (results->count(shape)) { + (*results)[shape] = std::min(latency, (*results)[shape]); + } else { + (*results)[shape] = latency; + } + } + fprintf(stderr, "\n"); +} + +int main() { + std::map results; + run_benchmarks(&results); + printf("Using %d thread(s)\n", kNumThreads); + printf("depth,rows,cols,latency(s),Gop/s\n"); + for (const auto& result : results) { + const Shape& shape = result.first; + printf("%d,%d,%d,%.4g,%.4g\n", shape.depth, shape.rows, shape.cols, + result.second, + 2e-9 * shape.depth * shape.rows * shape.cols / result.second); + } +} diff --git a/test/test.cc b/test/test.cc index fdc7bcc..eee16b4 100644 --- a/test/test.cc +++ b/test/test.cc @@ -14,7 +14,6 @@ #include "test.h" -#include #include #include #include diff --git a/test/test.h b/test/test.h index b6a540d..aecd0c1 100644 --- a/test/test.h +++ b/test/test.h @@ -102,6 +102,19 @@ int Random() { return dist(RandomEngine()); } +#ifdef _MSC_VER +// msvc does not support 8bit types in uniform_int_distribution<>. +// Take 32 bit uniform_int_distribution<> and only use the lower 8 bits. 
+template +void MakeRandom(MatrixType* m) { + ScopedProfilingLabel("MakeRandom(matrix)"); + for (int c = 0; c < m->cols(); c++) { + for (int r = 0; r < m->rows(); r++) { + (*m)(r, c) = Random() % OperandRange::kMaxValue; + } + } +} +#else template void MakeRandom(MatrixType* m) { ScopedProfilingLabel("MakeRandom(matrix)"); @@ -114,6 +127,7 @@ void MakeRandom(MatrixType* m) { } } } +#endif template void MakeConstant(MatrixType* m, typename MatrixType::Scalar val) { diff --git a/test/test_allocator.cc b/test/test_allocator.cc index 8a76709..3de50f0 100644 --- a/test/test_allocator.cc +++ b/test/test_allocator.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "../internal/allocator.h" #include "test.h" +#include "../internal/allocator.h" namespace gemmlowp { diff --git a/test/test_blocking_counter.cc b/test/test_blocking_counter.cc index 8260576..d1e0932 100644 --- a/test/test_blocking_counter.cc +++ b/test/test_blocking_counter.cc @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ // limitations under the License. #include "test.h" +#include "../profiling/pthread_everywhere.h" -#include #include #include "../internal/multi_thread_gemm.h" @@ -26,6 +26,7 @@ class Thread { Thread(BlockingCounter* blocking_counter, int number_of_times_to_decrement) : blocking_counter_(blocking_counter), number_of_times_to_decrement_(number_of_times_to_decrement), + finished_(false), made_the_last_decrement_(false) { pthread_create(&thread_, nullptr, ThreadFunc, this); } @@ -33,7 +34,9 @@ class Thread { ~Thread() { Join(); } bool Join() const { - pthread_join(thread_, nullptr); + if (!finished_) { + pthread_join(thread_, nullptr); + } return made_the_last_decrement_; } @@ -45,6 +48,7 @@ class Thread { Check(!made_the_last_decrement_); made_the_last_decrement_ = blocking_counter_->DecrementCount(); } + finished_ = true; } static void* ThreadFunc(void* ptr) { @@ -55,6 +59,7 @@ class Thread { BlockingCounter* const blocking_counter_; const int number_of_times_to_decrement_; pthread_t thread_; + bool finished_; bool made_the_last_decrement_; }; diff --git a/test/test_math_helpers.cc b/test/test_math_helpers.cc index 591bf44..e9d4b84 100644 --- a/test/test_math_helpers.cc +++ b/test/test_math_helpers.cc @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. -- cgit v1.2.3
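For reading the CSV that benchmark_all_sizes.cc prints: the last column follows the usual GEMM operation count, where a rows x depth x cols product performs 2 * rows * depth * cols multiply-adds, so throughput is 2e-9 * depth * rows * cols / latency in Gop/s. As an illustrative (not measured) example, a 512x512x512 GEMM timed at 2.7 ms would be reported as 2e-9 * 512^3 / 0.0027, roughly 99 Gop/s.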