author     Benoit Jacob <benoitjacob@google.com>    2015-06-25 15:50:59 -0400
committer  Benoit Jacob <benoitjacob@google.com>    2015-06-25 15:53:04 -0400
commit     75c4ec0ba4dd86e4f763a54e01002ff29f1d57ae (patch)
tree       c8e35a06c7d959e6ad0a90b4929305055919e3f8
download   gemmlowp-75c4ec0ba4dd86e4f763a54e01002ff29f1d57ae.tar.gz
initial import
35 files changed, 6372 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4ff62a0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.o +*.ii +*.s +**/.DS_Store +? +?? +??? diff --git a/AUTHORS.txt b/AUTHORS.txt new file mode 100644 index 0000000..13a49e0 --- /dev/null +++ b/AUTHORS.txt @@ -0,0 +1,9 @@ +# This is the official list of gemmlowp authors for copyright purposes. +# This file is distinct from the CONTRIBUTORS.txt file. +# See the latter for an explanation. + +# Names should be added to this file as: +# Name or Organization <email address> +# The email address is not required for organizations. + +Google Inc. diff --git a/CONTRIBUTING.txt b/CONTRIBUTING.txt new file mode 100644 index 0000000..e7244b3 --- /dev/null +++ b/CONTRIBUTING.txt @@ -0,0 +1,33 @@ +Want to contribute? Great! First, read this page (including the small print at the end). + +Before you contribute +===================== + +Before we can use your code, you must sign the Google Individual Contributor +License Agreement (CLA), + + https://developers.google.com/open-source/cla/individual?csw=1 + +which you can do online. The CLA is necessary mainly because you own the +copyright to your changes, even after your contribution becomes part of our +codebase, so we need your permission to use and distribute your code. We also +need to be sure of various other things—for instance that you'll tell us if you +know that your code infringes on other people's patents. You don't have to sign +the CLA until after you've submitted your code for review and a member has +approved it, but you must do it before we can put your code into our codebase. +Before you start working on a larger contribution, you should get in touch with +us first through the issue tracker with your idea so that we can help out and +possibly guide you. Coordinating up front makes it much easier to avoid +frustration later on. + +Code reviews +============ + +All submissions, including submissions by project members, require review. We +use Github pull requests for this purpose. + +The small print +=============== + +Contributions made by corporations are covered by a different agreement than +the one above, the Software Grant and Corporate Contributor License Agreement. diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt new file mode 100644 index 0000000..e9396d6 --- /dev/null +++ b/CONTRIBUTORS.txt @@ -0,0 +1,13 @@ +# People who have agreed to one of the CLAs and can contribute patches. +# The AUTHORS.txt file lists the copyright holders; this file +# lists people. For example, Google employees are listed here +# but not in AUTHORS.txt, because Google holds the copyright. +# +# https://developers.google.com/open-source/cla/individual +# https://developers.google.com/open-source/cla/corporate +# +# Names should be added to this file as: +# Name <email address> + +Benoit Jacob <benoitjacob@google.com> +Pete Warden <petewarden@google.com> diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..29fcdc2 --- /dev/null +++ b/README.txt @@ -0,0 +1,126 @@ +gemmlowp: a small self-contained low-precision GEMM library +=========================================================== + +This is not a full linear algebra library, only a GEMM library: it only does +general matrix multiplication ("GEMM"). + + +Disclaimer +========== + +This is not an official Google product (experimental or otherwise), it is just +code that happens to be owned by Google. + + +Portability, target platforms/architectures +=========================================== + +Should be portable to any platform with some C++11 and POSIX support, +while we have optional optimized code paths for specifica architectures. + +Required: + C++11 (a small conservative subset of it) + +Required for some features: + * Some POSIX interfaces: + * pthreads (for multi-threaded operation and for profiling). + * sysconf (for multi-threaded operation to detect number of cores; + may be bypassed). + +Optional optimized code paths: +At the moment, we only have optimized code paths for ARM NEON SIMD. +Some are written in inline assembly, some are written in C++ using +intrinsics. Both GCC and Clang are supported. + + +Public interfaces +================= + +1. gemmlowp public interface +---------------------------- + + gemmlowp's main public interface is in the public/ subdirectory. The + header to include is + public/gemmlowp.h. + This is a headers-only library, so there is nothing to link to. + +2. EightBitIntGemm standard interface +------------------------------------- + + Additionally, the eight_bit_int_gemm/ subdirectory provides an + implementation of the standard EightBitIntGemm interface. The header + to include is + eight_bit_int_gemm/eight_bit_int_gemm.h + This is *NOT* a headers-only library, users need to link to + eight_bit_int_gemm/eight_bit_int_gemm.cc. + + +Testing +======= + +The test/ directory contains unit tests. The primary unit test is + test/test.cc +Since it covers also the EightBitIntGemm interface, it needs to be +linked against + eight_bit_int_gemm/eight_bit_int_gemm.cc + +The scripts/ directory contains a script to build and run a program +on an Android device: + scripts/test-android.sh + +It expects the CXX environment variable to point to an Android toolchain's +C++ compiler, and expects source files (and optionally, cflags) as +command-line parameters. To build and run the above-mentioned main unit test, +first set CXX e.g.: + +$ export CXX=/some/toolchains/arm-linux-androideabi-4.8/bin/arm-linux-androideabi-g++ + +Then run: + +$ ./scripts/test-android.sh test/test.cc eight_bit_int_gemm/eight_bit_int_gemm.cc + + +Profiling +========= + +The profiling/ subdirectory offers a very simple non-interrupting sampling +profiler that only requires pthreads (no signals). + +It relies on source code being instrumented with pseudo-stack labels. +See profiling/instrumentation.h. +A full example of using this profiler is given in profiling/profiler.h. + + +Low-precision? +============== + +"Low-precision" means that the input and output matrix entries are integers +on at most 8 bits. The scalar type is uint8_t. + +This isn't the same as just doing plain matrix arithmetic over uint8_t, +because that would overflow. To avoid overflow, we internally accumulate +results on more than 8 bits, and at the end we keep only some significant +8 bits. 
This relies on the caller providing suitable offset/multiplier/shift +parameters, which effectively govern how we extract some significant 8 bit +from our more-than-8bit temporary accumulators. See the extra function +parameters taken by Gemm() in public/gemmlowp.h or by EightBitIntGemm() in +eight_bit_int_gemm/eight_bit_int_gemm.h. + + +Performance goals +============================ + +Our performance goals differ from typical GEMM performance goals in the +following ways: + +1. We care not only about speed, but also about minimizing power usage. + We specifically care about charge usage in mobile/embedded devices. + This implies that we care doubly about minimizing memory bandwidth usage: + we care about it, like any GEMM, because of the impact on speed, and we + also care about it because it is a key factor of power usage. + +2. Most GEMMs are optimized primarily for large dense matrix sizes (>= 1000). + We do care about large sizes, but we also care specifically about the + typically smaller matrix sizes encountered in various mobile applications. + This means that we have to optimize for all sizes, not just for large enough + sizes. diff --git a/eight_bit_int_gemm/eight_bit_int_gemm.cc b/eight_bit_int_gemm/eight_bit_int_gemm.cc new file mode 100644 index 0000000..4c52dee --- /dev/null +++ b/eight_bit_int_gemm/eight_bit_int_gemm.cc @@ -0,0 +1,89 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "eight_bit_int_gemm/eight_bit_int_gemm.h" + +// gemmlowp symbols should have hidden visibility. +// currently this is ensured in the build system by +// passing -finlines-visibility-hidden. TODO: it would be +// safer to hardcode it here with some #pragma's. +#include "public/gemmlowp.h" + +namespace gemmlowp { + +namespace eight_bit_int_gemm { + +namespace { + +// To be used as template parameter for GlobalLock. +// GlobalLock<EightBitIntGemmLockId> is the global lock +// on EightBitIntGemm entry points, protecting +// EightBitIntGemm's global state. +struct EightBitIntGemmLockId; + +// Global state: consists of one global GemmContext instance. 
+GemmContext* global_context; + +GemmContext* GetOrCreateGlobalContext() { + if (!global_context) { + global_context = new GemmContext; + } + return global_context; +} + +void DestroyGlobalContext() { + delete global_context; + global_context = nullptr; +} + +} // end anonymous namespace + +// Public interface entry points + +void EightBitIntGemm(int m, int n, int k, const std::uint8_t* a, + std::int32_t a_offset, int lda, const std::uint8_t* b, + std::int32_t b_offset, int ldb, std::uint8_t* c, + std::int32_t c_offset, std::int32_t c_mult_int, + std::int32_t c_shift, int ldc) { + AutoGlobalLock<EightBitIntGemmLockId> lock; + GemmContext* context = GetOrCreateGlobalContext(); + + MatrixMap<const std::uint8_t, MapOrder::RowMajor> lhs(b, n, k, ldb); + MatrixMap<const std::uint8_t, MapOrder::ColMajor> rhs(a, k, m, lda); + MatrixMap<std::uint8_t, MapOrder::ColMajor> result(c, n, m, ldc); + + const int lhs_offset = b_offset; + const int rhs_offset = a_offset; + const int result_offset = c_offset; + const int result_mult_int = c_mult_int; + const int result_shift = c_shift; + + Gemm(context, lhs, rhs, &result, lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); +} + +void SetMaxNumThreads(int n) { + AutoGlobalLock<EightBitIntGemmLockId> lock; + GemmContext* context = GetOrCreateGlobalContext(); + context->set_max_num_threads(n); +} + +void FreePersistentResources() { + AutoGlobalLock<EightBitIntGemmLockId> lock; + DestroyGlobalContext(); +} + +} // namespace eight_bit_int_gemm + +} // namespace gemmlowp diff --git a/eight_bit_int_gemm/eight_bit_int_gemm.h b/eight_bit_int_gemm/eight_bit_int_gemm.h new file mode 100644 index 0000000..1d2b1cb --- /dev/null +++ b/eight_bit_int_gemm/eight_bit_int_gemm.h @@ -0,0 +1,66 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// eight_bit_int_gemm.h: exposes the standard EightBitIntGemm interface. + +#ifndef GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ +#define GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ + +#include <cstdint> + +namespace gemmlowp { + +namespace eight_bit_int_gemm { + +// Concurrency / reentrancy notice +// =============================== +// +// This eight_bit_int_gemm has global singleton persistent state. +// A global lock ensures serialization of calls, so this library +// is fully reentrant but only one calling thread gets to actually run +// at a time, while other calling threads would wait. So it is safe +// albeit potentially slow to call the functions exposed here on +// multiple threads concurrently. +// +// Users who prefer a state-less, singleton-less interface, +// should use the main gemmlowp interface (public/gemmlowp.h) instead. + +// The main entry point to compute a Gemm. This is the standard +// EightBitIntGemm interface. 
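For orientation, here is a hedged sketch of what a call to this entry point looks like. The buffer names, sizes, leading dimensions and quantization parameters below are placeholders invented for illustration; the exact storage conventions and the rounding of the offset/multiplier/shift post-processing are defined by the implementation in eight_bit_int_gemm.cc and public/gemmlowp.h, not by this sketch:

    // Assumes #include <cstdint>, #include <vector> and
    // #include "eight_bit_int_gemm/eight_bit_int_gemm.h".
    const int m = 64, n = 64, k = 128;              // placeholder sizes
    std::vector<std::uint8_t> a(m * k), b(k * n), c(m * n);
    // ... fill a and b with quantized 8-bit inputs ...
    const std::int32_t a_offset = 0, b_offset = 0;  // placeholder quantization params
    const std::int32_t c_offset = 0, c_mult_int = 1, c_shift = 0;
    const int lda = k, ldb = k, ldc = n;            // placeholder leading dimensions
    gemmlowp::eight_bit_int_gemm::SetMaxNumThreads(0);  // 0: query the system
    gemmlowp::eight_bit_int_gemm::EightBitIntGemm(
        m, n, k,
        a.data(), a_offset, lda,
        b.data(), b_offset, ldb,
        c.data(), c_offset, c_mult_int, c_shift, ldc);
    gemmlowp::eight_bit_int_gemm::FreePersistentResources();

The offset/multiplier/shift parameters are how the caller tells the library to map its wide internal accumulators back down to 8 bits, as described in the "Low-precision?" section of README.txt.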
+void EightBitIntGemm(int m, int n, int k, const std::uint8_t *a, + std::int32_t a_offset, int lda, const std::uint8_t *b, + std::int32_t b_offset, int ldb, std::uint8_t *c, + std::int32_t c_offset, std::int32_t c_mult_int, + std::int32_t c_shift, int ldc); + +// Frees any persistent resources +// (threads, thread pools, allocators, buffers, ...) +// that gemmlowp might hold. This is called automatically +// on thread exit, but one may also call it earlier, at any time. +void FreePersistentResources(); + +// Allows specifying the number of hardware threads, as a hint as to +// how many worker threads to use for sufficiently large Gemm's. +// We will never use more threads than that, but may use fewer, +// for instance on Gemm's that are too small to benefit from all +// available threads. The value 0 lets the implementation query +// the system to determine the number of hardware threads. +// Default value: 0. +void SetMaxNumThreads(int n); + +} // namespace eight_bit_int_gemm + +} // namespace gemmlowp + +#endif // GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ diff --git a/internal/allocator.h b/internal/allocator.h new file mode 100644 index 0000000..30b2bd8 --- /dev/null +++ b/internal/allocator.h @@ -0,0 +1,211 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// allocator.h: a buffer allocator that allows avoiding most of the +// malloc/free overhead, by: +// 1. Requiring all N allocations to be reserved in advance, and +// then commited at once, turning N allocations into 1. +// 2. Being persistent, the allocated storage is reused across commits, +// and only reallocated as needed when the commit size gets larger. +// +// This is driven by Android-specific needs: +// 1. On Android, the default (Bionic) allocator tends to aggressively +// unmap pages, which means that malloc/free can be surprisingly expensive. +// 2. On Android, stack allocations with alloca() can't be as large as on +// desktop platforms. +// +// General usage: +// 1. Reserve blocks by calling Reserve(), which returns a Handle. +// 2. Call Commit() once. +// 3. Now it is possible to get pointers to allocated buffers by calling +// GetPointer(). +// 4. Call Decommit() once. +// 5. The allocator is now reverted to its original state, except that +// it retained its allocated storage, so the next Commit() will be faster. +// The allocated storage is only freed when the Allocator object is +// destroyed. 
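A minimal sketch of the Reserve/Commit/GetPointer/Decommit pattern described above. Buffer names and sizes are hypothetical; the only assumption is the Allocator class defined below in this header:

    gemmlowp::Allocator allocator;
    // 1. Reserve all blocks up front; no allocation happens yet.
    auto packed_lhs_handle = allocator.Reserve<std::uint8_t>(12 * 1024);
    auto packed_rhs_handle = allocator.Reserve<std::uint8_t>(4 * 1024);
    auto accumulator_handle = allocator.Reserve<std::int32_t>(12 * 4);
    // 2. Commit once: at most one underlying (re)allocation for all blocks.
    allocator.Commit();
    // 3. Get pointers into the committed storage.
    std::uint8_t* packed_lhs = allocator.GetPointer<std::uint8_t>(packed_lhs_handle);
    std::uint8_t* packed_rhs = allocator.GetPointer<std::uint8_t>(packed_rhs_handle);
    std::int32_t* accumulators = allocator.GetPointer<std::int32_t>(accumulator_handle);
    // ... use the buffers ...
    // 4. Decommit: handles become stale, but the storage is retained, so the
    //    next Commit() of similar total size is just a bookkeeping operation.
    allocator.Decommit();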
+ +#ifndef GEMMLOWP_INTERNAL_ALLOCATOR_H_ +#define GEMMLOWP_INTERNAL_ALLOCATOR_H_ + +#include "internal/common.h" + +#if defined ANDROID || defined __ANDROID__ +#include <android/api-level.h> +#if __ANDROID_API__ < 16 +#include <malloc.h> +#define GEMMLOWP_USE_MEMALIGN +#endif +#endif + +namespace gemmlowp { + +enum class TypeId : std::uint8_t { Uint8, Int8, Uint16, Int16, Uint32, Int32 }; + +template <typename T> +struct GetTypeIdImpl {}; + +template <typename T> +inline TypeId GetTypeId() { + return GetTypeIdImpl<T>::Value; +} + +template <typename T> +struct GetTypeIdImpl<const T> : GetTypeIdImpl<T> {}; + +#define GEMMLOWP_REGISTER_TYPEID(type_, id) \ + template <> \ + struct GetTypeIdImpl<type_> { \ + static const TypeId Value = TypeId::id; \ + }; + +GEMMLOWP_REGISTER_TYPEID(std::uint8_t, Uint8) +GEMMLOWP_REGISTER_TYPEID(std::int8_t, Int8) +GEMMLOWP_REGISTER_TYPEID(std::uint16_t, Uint16) +GEMMLOWP_REGISTER_TYPEID(std::int16_t, Int16) +GEMMLOWP_REGISTER_TYPEID(std::uint32_t, Uint32) +GEMMLOWP_REGISTER_TYPEID(std::int32_t, Int32) + +class Allocator { + public: + Allocator() + : committed_(false), + storage_size_(0), + storage_(nullptr), + reserved_blocks_(0), + reserved_bytes_(0), + generation_(0) {} + + ~Allocator() { + assert(!committed_); + assert(!reserved_blocks_); + DeallocateStorage(); + } + + // Alignment of allocated blocks. + static const std::size_t kAlignment = kDefaultCacheLineSize; + + // This is all we need so far, and since the usage pattern is fixed, + // there is no point in allowing more until we need to. + static const std::size_t kMaxBlocks = 5; + + void Commit() { + assert(!committed_); + + if (reserved_bytes_ > storage_size_) { + DeallocateStorage(); + storage_size_ = RoundUpToPowerOfTwo(reserved_bytes_); +#ifdef GEMMLOWP_USE_MEMALIGN + storage_ = memalign(kAlignment, storage_size_); +#else + if (posix_memalign(&storage_, kAlignment, storage_size_)) { + storage_ = nullptr; + } +#endif + } + + ReleaseBuildAssertion(!storage_size_ || storage_, "allocation failure"); + committed_ = true; + } + + void Decommit() { + assert(committed_); + committed_ = false; + generation_++; + + reserved_blocks_ = 0; + reserved_bytes_ = 0; + } + + // See generation_ + typedef std::size_t generation_t; + + // A handle on a reserved block. The user obtains + // one by calling Reserve() and, after committing, + // passes it to GetPointer(). + class Handle { + std::uint8_t index_; + generation_t generation_; + TypeId type_; + + friend class Allocator; + }; + + // Reserves a block sized for n elements of type T, and + // returns a handle to it. Must be called before committing. + template <typename T> + Handle Reserve(std::size_t n) { + assert(!committed_ && "can't reserve blocks while committed"); + assert(reserved_blocks_ < kMaxBlocks && + "didn't expect to allocate this many blocks"); + const std::size_t bytes = RoundUp<kAlignment>(n * sizeof(T)); + const std::size_t offset = reserved_bytes_; + const std::size_t index = reserved_blocks_; + + reserved_blocks_offsets_[index] = offset; + Handle h; + h.index_ = index; + h.generation_ = generation_; + h.type_ = GetTypeId<T>(); + + reserved_blocks_++; + reserved_bytes_ += bytes; + + return h; + } + + // Returns the pointer to the allocated buffer for the given handle. + // Must be called after committing. 
+ template <typename T> + T* GetPointer(const Handle& h) const { + assert(committed_ && "can't get block pointers unless committed"); + assert(h.index_ < reserved_blocks_ && + "bad handle, points to inexistant block"); + assert(h.generation_ == generation_ && + "handle from earlier generation, have decommitted since"); + assert(h.type_ == GetTypeId<T>() && "type mismatch"); + std::size_t offset = reserved_blocks_offsets_[h.index_]; + std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(storage_) + offset; + return reinterpret_cast<T*>(addr); + } + + private: + void DeallocateStorage() { + assert(!committed_); + free(storage_); + storage_size_ = 0; + } + + // Set to true by Commit() and to false by Decommit(). Initially false. + bool committed_; + + // The actually allocated storage size and buffer pointer. + std::size_t storage_size_; + mutable void* storage_; + + // The number of blocks that have been reserved by Reserve(). + std::size_t reserved_blocks_; + // The number of bytes that have been reserved by Reserve(). + std::size_t reserved_bytes_; + // The offsets of reserved blocks into the storage buffer. + std::size_t reserved_blocks_offsets_[kMaxBlocks]; + + // The 'generation' is incremented on Decommit() and allows catching + // bad GetPointer() calls still referring to a previous commit. + generation_t generation_; +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_ALLOCATOR_H_ diff --git a/internal/block_params.h b/internal/block_params.h new file mode 100644 index 0000000..8545f94 --- /dev/null +++ b/internal/block_params.h @@ -0,0 +1,166 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// block_params.h: Logic to choose L1 and L2 block sizes +// to optimize cache-friendliness. + +#ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ +#define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ + +#include "internal/common.h" + +namespace gemmlowp { + +// A BlockParams instance contains a full description of all the block size +// parameters to be used by a Gemm. +// There are two nested levels of block subdivisions: first a subdivision +// into large blocks that should fit in last-level cache (what we call L2 here) +// and then another subdivision into smaller blocks that should fit in +// L1 cache. There is then actually a third level of subdivision to fit +// in registers, but we are not concerned with that here. +struct BlockParams { + // L1 block parameters determine the size of small blocks that should + // fit in L1 cache. + int l1_rows; + int l1_cols; + int l1_depth; + + // L2 block parameters determine the size of larger blocks that should + // fit in L2 cache. 
+ int l2_rows; + int l2_cols; + int l2_depth; + + template <typename KernelFormat> + void Init(int rows, int cols, int depth, int num_threads) { + FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads, &l2_rows, + &l2_cols, &l2_depth); + FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth, &l1_rows, + &l1_cols, &l1_depth); + } + + template <typename KernelFormat> + static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads, + int* out_l2_rows, int* out_l2_cols, + int* out_l2_depth) { + int l2_rows = 0; + int l2_cols = 0; + int l2_depth = 0; + // No L2 blocking in the depth dimension at the moment. + // Too much loss of accuracy due to storing intermediate results in + // low precision. + // However, we still want to round l2_depth up to the next multiple + // of kernel depth, so as to avoid having to special-case unaligned depths. + l2_depth = RoundUp<KernelFormat::kDepth>(depth); + + const int l2_bytes_to_use = kDefaultL2CacheSize; + const float l2_rhs_factor = kDefaultL2RhsFactor; + + { + int max_cache_friendly_l2_cols = std::max( + 1, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth))); + int min_l2_cols_blocks = CeilQuotient(cols, max_cache_friendly_l2_cols); + l2_cols = + RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks)); + } + + { + int max_cache_friendly_l2_rows = + std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) / + (num_threads * (l2_depth + 4 * l2_cols))); + int min_l2_rows_blocks = CeilQuotient(rows, max_cache_friendly_l2_rows); + l2_rows = + RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l2_rows_blocks)); + } + + *out_l2_rows = l2_rows; + *out_l2_cols = l2_cols; + *out_l2_depth = l2_depth; + } + + template <typename KernelFormat> + static void FindL1BlockSizes(int rows, int cols, int depth, int* out_l1_rows, + int* out_l1_cols, int* out_l1_depth) { + int l1_rows = 0; + int l1_cols = 0; + int l1_depth = 0; + + // L2 block sizes should already be multiples of kernel block sizes. + assert(rows % KernelFormat::kRows == 0); + assert(cols % KernelFormat::kCols == 0); + assert(depth % KernelFormat::kDepth == 0); + + // No L1 blocking in the columns dimension at the moment. + // Thought not to be needed. Similar to Eigen. + l1_cols = cols; + + const int l1_bytes_to_use = kDefaultL1CacheSize; + + { + int max_cache_friendly_l1_depth = std::max( + 1, (l1_bytes_to_use - 4 * KernelFormat::kRows * KernelFormat::kCols) / + (KernelFormat::kRows + KernelFormat::kCols)); + int min_l1_depth_blocks = + CeilQuotient(depth, max_cache_friendly_l1_depth); + l1_depth = RoundUp<KernelFormat::kDepth>( + CeilQuotient(depth, min_l1_depth_blocks)); + } + + { + int max_cache_friendly_l1_rows = + std::max(1, l1_bytes_to_use / (l1_depth + 4 * l1_cols)); + int min_l1_rows_blocks = CeilQuotient(rows, max_cache_friendly_l1_rows); + l1_rows = + RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks)); + } + + *out_l1_rows = l1_rows; + *out_l1_cols = l1_cols; + *out_l1_depth = l1_depth; + } +}; + +// A SideBlockParams instance contains only the block params relevant to +// one side (LHS or RHS), expressed in terms of 'width' instead of +// rows/colums. See the explanation in kernel.h: in the LHS, 'width' means +// the number of rows, while in the RHS, 'width' means the number of columns. +// That allows us to write generic code that applies to either LHS or RHS. +struct SideBlockParams { + // L1 block parameters determine the size of small blocks that should + // fit in L1 cache. 
+ int l1_width; + int l1_depth; + + // L2 block parameters determine the size of larger blocks that should + // fit in L2 cache. + int l2_width; + int l2_depth; +}; + +enum class Side { Lhs, Rhs }; + +inline void GetSideBlockParams(Side side, SideBlockParams* side_block_params, + const BlockParams& block_params) { + side_block_params->l1_width = + side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols; + side_block_params->l2_width = + side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols; + + side_block_params->l1_depth = block_params.l1_depth; + side_block_params->l2_depth = block_params.l2_depth; +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ diff --git a/internal/common.h b/internal/common.h new file mode 100644 index 0000000..f7e643e --- /dev/null +++ b/internal/common.h @@ -0,0 +1,117 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// common.h: contains stuff that's used throughout gemmlowp +// and should always be available. + +#ifndef GEMMLOWP_INTERNAL_COMMON_H_ +#define GEMMLOWP_INTERNAL_COMMON_H_ + +#include <pthread.h> +#include <cstdint> +#include <cassert> +#include <cmath> +#include <cstdlib> +#include <algorithm> + +#include "profiling/instrumentation.h" + +#ifdef GEMMLOWP_PROFILING +#include <set> +#include <cstdio> +#include <cstring> +#endif + +// Detect NEON. It's important to check for both tokens. +#if (defined __ARM_NEON) || (defined __ARM_NEON__) +#define GEMMLOWP_NEON +#endif + +namespace gemmlowp { + +// Standard cache line size. Useful to optimize alignment and +// prefetches. Ideally we would query this at runtime, however +// 64 byte cache lines are the vast majority, and even if it's +// wrong on some device, it will be wrong by no more than a 2x factor, +// which should be acceptable. +const int kDefaultCacheLineSize = 64; + +// Default L1 and L2 data cache sizes. On x86, we should ideally query this at +// runtime. On ARM, the instruction to query this is privileged and +// Android kernels do not expose it to userspace. Fortunately, the majority +// of ARM devices have roughly comparable values: +// Nexus 5: L1 16k, L2 1M +// Android One: L1 32k, L2 512k +// The following values are equal to or somewhat lower than that, and were +// found to perform well on both the Nexus 5 and Android One. +// Of course, they would be too low for typical x86 CPUs where we would want +// to set the L2 value to (L3 cache size / number of cores) at least. +const int kDefaultL1CacheSize = 16 * 1024; +const int kDefaultL2CacheSize = 256 * 1024; + +// The proportion of the cache that we intend to use for storing +// RHS blocks. This should be between 0 and 1, and typically closer to 1, +// as we typically want to use most of the L2 cache for storing a large +// RHS block. +const float kDefaultL2RhsFactor = 0.90f; + +// Hints the CPU to prefetch the cache line containing ptr. 
+inline void Prefetch(const void* ptr) { +#ifdef __GNUC__ // Clang and GCC define __GNUC__ and have __builtin_prefetch. + __builtin_prefetch(ptr); +#else + (void)ptr; +#endif +} + +// Returns the runtime argument rounded down to the nearest multiple of +// the fixed Modulus. +template <int Modulus> +int RoundDown(int i) { + return i - (i % Modulus); +} + +// Returns the runtime argument rounded up to the nearest multiple of +// the fixed Modulus. +template <int Modulus> +int RoundUp(int i) { + return RoundDown<Modulus>(i + Modulus - 1); +} + +// Returns the quotient a / b rounded up ('ceil') to the nearest integer. +template <typename Integer> +Integer CeilQuotient(Integer a, Integer b) { + return (a + b - 1) / b; +} + +// Returns the argument rounded up to the nearest power of two. +template <typename Integer> +Integer RoundUpToPowerOfTwo(Integer n) { + Integer i = n - 1; + i |= i >> 1; + i |= i >> 2; + i |= i >> 4; + i |= i >> 8; + i |= i >> 16; + return i + 1; +} + +template <int N> +struct IsPowerOfTwo { + static const bool value = !(N & (N - 1)); +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_COMMON_H_ diff --git a/internal/compute.h b/internal/compute.h new file mode 100644 index 0000000..4b2f56a --- /dev/null +++ b/internal/compute.h @@ -0,0 +1,103 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// compute.h: the central stage of the Gemm computation, operates +// on already-packed LHS and RHS blocks and calls the Gemm kernel +// to compute a block of the product. 
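Before moving on to compute.h: the integer rounding helpers just defined in common.h drive all of the blocking arithmetic in block_params.h. A few hand-checked values, given as an illustrative snippet rather than part of the commit, show what they do:

    static_assert(gemmlowp::IsPowerOfTwo<64>::value, "64 is a power of two");
    const int down = gemmlowp::RoundDown<8>(37);        // 32: largest multiple of 8 <= 37
    const int up = gemmlowp::RoundUp<8>(37);            // 40: smallest multiple of 8 >= 37
    const int blocks = gemmlowp::CeilQuotient(37, 8);   // 5:  ceil(37 / 8)
    const int pot = gemmlowp::RoundUpToPowerOfTwo(37);  // 64: next power of two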
+ +#ifndef GEMMLOWP_INTERNAL_COMPUTE_H_ +#define GEMMLOWP_INTERNAL_COMPUTE_H_ + +#include "internal/kernel.h" +#include "internal/block_params.h" +#include "internal/pack.h" + +namespace gemmlowp { + +template <typename KernelFormat, typename PackedResult> +class ComputeImpl { + typedef typename KernelFormat::Lhs KernelLhsFormat; + typedef typename KernelFormat::Rhs KernelRhsFormat; + + const KernelBase& kernel_; + const BlockParams& block_params_; + + PackedResult* const packed_result_; + const PackedSideBlock<KernelLhsFormat>& packed_lhs_; + const PackedSideBlock<KernelRhsFormat>& packed_rhs_; + + public: + ComputeImpl(const KernelBase& _kernel, const BlockParams& _block_params, + PackedResult* _packed_result, + const PackedSideBlock<KernelLhsFormat>& _packed_lhs, + const PackedSideBlock<KernelRhsFormat>& _packed_rhs) + : kernel_(_kernel), + block_params_(_block_params), + packed_result_(_packed_result), + packed_lhs_(_packed_lhs), + packed_rhs_(_packed_rhs) {} + + void Compute() { + for (int d = 0; d < block_params_.l2_depth; d += block_params_.l1_depth) { + int ds = std::min(block_params_.l1_depth, block_params_.l2_depth - d); + + for (int r = 0; r < block_params_.l2_rows; r += block_params_.l1_rows) { + int rs = std::min(block_params_.l1_rows, block_params_.l2_rows - r); + + ComputeL1(r, rs, 0, block_params_.l2_cols, d, ds); + } + } + } + + private: + void ComputeRun(int start_row, int start_col, int start_depth, int depth) { + packed_lhs_.seek_run(start_row, start_depth); + packed_rhs_.seek_run(start_col, start_depth); + auto packed_result_block = packed_result_->Map().block( + start_row, start_col, KernelFormat::kRows, KernelFormat::kCols); + kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(), + packed_result_block.cols_stride(), packed_lhs_.current_data(), + packed_rhs_.current_data(), start_depth, depth); + } + + void ComputeL1(int start_row, int rows, int start_col, int cols, + int start_depth, int depth) { + assert(rows % KernelFormat::kRows == 0); + assert(cols % KernelFormat::kCols == 0); + assert(depth % KernelFormat::kDepth == 0); + + for (int c = 0; c < cols; c += KernelFormat::kCols) { + for (int r = 0; r < rows; r += KernelFormat::kRows) { + ComputeRun(start_row + r, start_col + c, start_depth, depth); + } + } + } +}; + +template <typename PackedResult, typename KernelLhsFormat, + typename KernelRhsFormat> +void Compute(const KernelBase& kernel, const BlockParams& block_params, + PackedResult* packed_result, + const PackedSideBlock<KernelLhsFormat>& packed_lhs, + const PackedSideBlock<KernelRhsFormat>& packed_rhs) { + ScopedProfilingLabel label("compute"); + ComputeImpl<KernelFormat<KernelLhsFormat, KernelRhsFormat>, PackedResult> + impl(kernel, block_params, packed_result, packed_lhs, packed_rhs); + + impl.Compute(); +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_COMPUTE_H_ diff --git a/internal/kernel.h b/internal/kernel.h new file mode 100644 index 0000000..5cb5e69 --- /dev/null +++ b/internal/kernel.h @@ -0,0 +1,217 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// kernel.h: general definitions for kernels. + +#ifndef GEMMLOWP_INTERNAL_KERNEL_H_ +#define GEMMLOWP_INTERNAL_KERNEL_H_ + +#include "internal/common.h" + +namespace gemmlowp { + +// Explanation of general gemmlowp terminology +// =========================================== +// +// We use the following abbreviations: +// LHS = "left-hand side" +// RHS = "right-hand side" +// Sometimes when referring to either LHS or RHS, we just say a "Side". +// +// In a matrix product of a MxK matrix times a KxN matrix, +// we call K the 'depth'. Note that M is the number of rows +// of the result (and of the LHS), and N is the number of columns +// of the result (and of the RHS). +// +// In each of the LHS and RHS matrices, we call 'width' the +// other dimension, besides the depth. So in the LHS, 'width' +// is the number of rows, while in the RHS, 'width' is the number +// of columns. +// +// So in the LHS MxK matrix, the depth is K and the width in M. +// And in the RHS KxN matrix, the depth is K and the width in N. +// +// This is illustrated in this picture: +// +// RHS width +// <-----------------> +// +-----------------+ ^ +// | RHS | | Depth +// +-----------------+ v +// ^ +--+ +-----------------+ +// | |L | | | +// LHS width | |H | | Result | +// | |S | | | +// v +--+ +-----------------+ +// <--> +// Depth + +// Explanation of gemmlowp kernel formats and "cells" +// ================================================== +// +// Kernels operate on small LHS and RHS blocks that fit in registers. +// These blocks are stored contiguously in memory, but not always +// in a traditional column-major or row-major order; instead, +// they consist of a number of sub-blocks, which we call "cells", +// that are stored in column-major or row-major order. However, +// what really matters to us is not so much rows vs columns, but +// rather width vs depth. So we refer to "width-major" and "depth-major" +// storage orders. In the LHS, width-major means row-major, +// while in the RHS, width-major means column-major. +// There is also a third possibility, "diagonal order", +// which is unused at the moment. +// +// We aim to treat both sides, LHS and RHS, on an equal footing, +// so we call them both 'sides'. A KernelFormat thus is just a pair +// of KernelSideFormat's, one for LHS and one for RHS; each KernelSideFormat +// contains a CellFormat and a number of cells; cells are only ever +// stacked in the width dimension, which means stacked vertically in the +// LHS and stacked horizondally in the RHS. +// +// Example +// ======= +// +// Let's work out the data layout expected by a kernel having the +// following format (the struct names here are defined below in this file): +// +// KernelFormat< +// KernelSideFormat<CellFormat<3, 4>, 3>, +// KernelSideFormat<CellFormat<5, 4>, 2> +// > +// +// The LHS format, KernelSideFormat<CellFormat<3, 4>, 3>, means: +// 3 cells, each cell having dimensions (width=3, depth=4), laid out in +// DepthMajor order (the default value, see CellFormat). 
In the LHS, +// DepthMajor means column-major, so the LHS cells are of size 3x4 in +// column-major order, so the LHS layout is: +// +// 0 3 6 9 +// 1 4 8 10 +// 2 5 9 11 +// 12 15 18 21 +// 13 16 19 22 +// 14 17 20 23 +// 24 27 30 33 +// 25 28 31 34 +// 26 29 32 35 +// +// The RHS format, KernelSideFormat<CellFormat<5, 4>, 2>, means: +// 2 cells each having dimensions (width=5, depth=4), laid out in +// DepthMajor order (the default value, see CellFormat). In the RHS, +// DepthMajor means row-major, so the RHS cells are of size 4x5 in +// row-major order, so the RHS layout is: +// +// 0 1 2 3 4 20 21 22 23 24 +// 5 6 7 8 9 25 26 27 28 29 +// 10 11 12 13 14 30 31 32 33 34 +// 15 16 17 18 19 35 36 37 38 39 + +// CellOrder enumerates the possible storage orders (=layouts) for +// a cell (see explanation above). +enum class CellOrder { DepthMajor, WidthMajor, Diagonal }; + +// CellFormat describes how data is laid +// out in a cell. That is, a CellOrder together with actual dimensions. +template <int tWidth, int tDepth, CellOrder tOrder = CellOrder::DepthMajor> +struct CellFormat { + static const int kWidth = tWidth; + static const int kDepth = tDepth; + static const CellOrder kOrder = tOrder; + + static const int kSize = kWidth * kDepth; +}; + +// KernelSideFormat describes how data is laid out in a kernel side +// (i.e. LHS or RHS). That is, a CellFormat together with a number of +// cells. These cells are always stacked in the Width dimension. +// For example, in the LHS case, the Width dimension is the rows dimension, +// se we're saying that in the LHS, cells are stacked vertically. +// We never stack cells in the Depth dimension. +template <typename tCellFormat, int tCells> +struct KernelSideFormat { + typedef tCellFormat Cell; + static const int kCells = tCells; + static const int kWidth = kCells * Cell::kWidth; + static const int kDepth = Cell::kDepth; +}; + +// KernelFormat describes fully the input data layout that a kernel expects. +// It consists of two KernelSideFormat's, one for LHS and one for RHS. +template <typename tLhs, typename tRhs> +struct KernelFormat { + typedef tLhs Lhs; + typedef tRhs Rhs; + + static_assert(Lhs::Cell::kDepth == Rhs::Cell::kDepth, ""); + static const int kDepth = Lhs::Cell::kDepth; + static const int kRows = Lhs::Cell::kWidth * Lhs::kCells; + static const int kCols = Rhs::Cell::kWidth * Rhs::kCells; +}; + +inline const char* CellOrderName(CellOrder o) { + switch (o) { + case CellOrder::DepthMajor: + return "DepthMajor"; + case CellOrder::WidthMajor: + return "WidthMajor"; + case CellOrder::Diagonal: + return "Diagonal"; + default: + assert(false); + return nullptr; + } +} + +// Returns the offset into a cell, at which a given coefficient is stored. +template <typename CellFormat> +inline int OffsetIntoCell(int w, int d) { + switch (CellFormat::kOrder) { + case CellOrder::DepthMajor: + return w + d * CellFormat::kWidth; + case CellOrder::WidthMajor: + return d + w * CellFormat::kDepth; + case CellOrder::Diagonal: + assert(CellFormat::kWidth == CellFormat::kDepth); + static const int cell_width = CellFormat::kWidth; + return w + ((d + cell_width - w) % cell_width) * cell_width; + default: + assert(false); + return 0; + } +} + +// KernelBase is the virtual base class below all kernels. +// The idea is that we don't need to templatize all our code on the exact +// kernel type; we only need to templatize on kernel format. Kernels +// sharing the same format can thus share the same packing/unpacking code. 
+struct KernelBase { + virtual const char* Name() const = 0; + + // This is the kernel implementation. We use the word 'run' consistently + // throughout gemmlowp to mean an inner loop, the implementation of which + // is to be provided by a separate optimized function. + virtual void Run(std::int32_t* dst_ptr, int dst_row_stride, + int dst_col_stride, const std::uint8_t* lhs_ptr, + const std::uint8_t* rhs_ptr, int start_depth, + int run_depth) const = 0; + + static const int kLhsBitDepth = 8; + static const int kRhsBitDepth = 8; + + virtual ~KernelBase() {} +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_KERNEL_H_ diff --git a/internal/kernel_default.h b/internal/kernel_default.h new file mode 100644 index 0000000..3e7cd5d --- /dev/null +++ b/internal/kernel_default.h @@ -0,0 +1,41 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// kernel_default.h: Chooses default GEMM and GEMV kernels for the +// host platform. + +#ifndef GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_ +#define GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_ + +#include "internal/common.h" + +#ifdef GEMMLOWP_NEON +#include "internal/kernel_neon.h" +namespace gemmlowp { +typedef NEONKernel12x4Depth2 DefaultKernelForGEMM; +typedef NEONKernel8x1Depth4 DefaultKernelForGEMV; +} +#else +#include "internal/kernel_reference.h" +namespace gemmlowp { +typedef ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 4>, 2>, + KernelSideFormat<CellFormat<4, 4>, 2> > > + DefaultKernelForGEMM; +typedef ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 4>, 2>, + KernelSideFormat<CellFormat<1, 4>, 1> > > + DefaultKernelForGEMV; +} +#endif + +#endif // GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_ diff --git a/internal/kernel_neon.h b/internal/kernel_neon.h new file mode 100644 index 0000000..2177fc2 --- /dev/null +++ b/internal/kernel_neon.h @@ -0,0 +1,570 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// kernel_neon.h: a collection of NEON optimized kernels. +// Check in kernel_default.h which one(s) are actually used by default. +// Others are mere experiments; they are still covered by tests +// in case they might be useful some day. + +#ifndef GEMMLOWP_INTERNAL_KERNEL_NEON_H_ +#define GEMMLOWP_INTERNAL_KERNEL_NEON_H_ + +#include "internal/kernel.h" + +#include <cassert> + +namespace gemmlowp { + +// Our main GEMM kernel. 
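As a quick sanity check of how the format machinery from internal/kernel.h composes, the 12x4, depth-2 kernel defined just below advertises a Format whose dimensions match its name. This is an illustrative snippet, not part of the commit:

    static_assert(gemmlowp::NEONKernel12x4Depth2::Format::kRows == 12,
                  "3 LHS cells of width 4, stacked in the width dimension");
    static_assert(gemmlowp::NEONKernel12x4Depth2::Format::kCols == 4,
                  "1 RHS cell of width 4");
    static_assert(gemmlowp::NEONKernel12x4Depth2::Format::kDepth == 2,
                  "cell depth shared by the LHS and RHS cell formats");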
+struct NEONKernel12x4Depth2 : KernelBase { + typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>, + KernelSideFormat<CellFormat<4, 2>, 1> > Format; + + const char* Name() const override { return "NEON, 12x4, straight, depth 2"; } + + // TODO(benoitjacob): reorder function arguments so dst comes last + void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride, + const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, + int start_depth, int run_depth) const override { + ScopedProfilingLabel label("optimized kernel"); + + assert(dst_row_stride == 1); + + asm volatile( + // Clear accumulator registers (see layout below) + "vmov.s32 q4, #0\n" + "vmov.s32 q8, q4\n" + "vmov.s32 q12, q4\n" + "vmov.s32 q5, q4\n" + "vmov.s32 q9, q4\n" + "vmov.s32 q13, q4\n" + "vmov.s32 q6, q4\n" + "vmov.s32 q10, q4\n" + "vmov.s32 q14, q4\n" + "vmov.s32 q7, q4\n" + "vmov.s32 q11, q4\n" + "vmov.s32 q15, q4\n" + + /* Main loop */ + + "loop_NEONKernel12x4Depth2_%=:\n" + + // Overview of register layout: + // + // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0). + // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7 + // (q1--q3). + // A 12x4 block of accumulators is stored in 32bit in q4--q15. + // + // +-----+-----+-----+-----+ + // |d0[0]|d0[1]|d0[2]|d0[3]| + // Rhs +-----+-----+-----+-----+ + // |d1[0]|d1[1]|d1[2]|d1[3]| + // +-----+-----+-----+-----+ + // + // | | | | | + // + // Lhs | | | | | + // + // +--+--+ - - - - +-----+-----+-----+-----+ + // |d2|d3| | q4 | q5 | q6 | q7 | + // |d2|d3| | q4 | q5 | q6 | q7 | + // |d2|d3| | q4 | q5 | q6 | q7 | + // |d2|d3| | q4 | q5 | q6 | q7 | + // +--+--+ - - - - +-----+-----+-----+-----+ + // |d4|d5| | q8 | q9 | q10 | q11 | + // |d4|d5| | q8 | q9 | q10 | q11 | + // |d4|d5| | q8 | q9 | q10 | q11 | + // |d4|d5| | q8 | q9 | q10 | q11 | + // +--+--+ - - - - +-----+-----+-----+-----+ + // |d6|d7| | q12 | q13 | q14 | q15 | + // |d6|d7| | q12 | q13 | q14 | q15 | + // |d6|d7| | q12 | q13 | q14 | q15 | + // |d6|d7| | q12 | q13 | q14 | q15 | + // +--+--+ - - - - +-----+-----+-----+-----+ + // + // Accumulator + + // Load 1 Rhs cell of size 2x4 + "vld1.8 {d0}, [%[rhs_ptr]:64]!\n" + + // Load 3 Lhs cells of size 4x2 each + "vld1.8 {d2}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d4}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d6}, [%[lhs_ptr]:64]!\n" + + // Expand Lhs/Rhs cells to 16 bit. + "vmovl.u8 q0, d0\n" + "vmovl.u8 q1, d2\n" + "vmovl.u8 q2, d4\n" + "vmovl.u8 q3, d6\n" + + // Multiply-accumulate, level of depth 0 + "vmlal.u16 q4, d2, d0[0]\n" + "vmlal.u16 q5, d2, d0[1]\n" + "vmlal.u16 q6, d2, d0[2]\n" + "vmlal.u16 q7, d2, d0[3]\n" + "vmlal.u16 q8, d4, d0[0]\n" + "vmlal.u16 q9, d4, d0[1]\n" + "vmlal.u16 q10, d4, d0[2]\n" + "vmlal.u16 q11, d4, d0[3]\n" + "vmlal.u16 q12, d6, d0[0]\n" + "vmlal.u16 q13, d6, d0[1]\n" + "vmlal.u16 q14, d6, d0[2]\n" + "vmlal.u16 q15, d6, d0[3]\n" + + // Multiply-accumulate, level of depth 1 + "vmlal.u16 q4, d3, d1[0]\n" + "vmlal.u16 q5, d3, d1[1]\n" + "vmlal.u16 q6, d3, d1[2]\n" + "vmlal.u16 q7, d3, d1[3]\n" + "vmlal.u16 q8, d5, d1[0]\n" + "vmlal.u16 q9, d5, d1[1]\n" + "vmlal.u16 q10, d5, d1[2]\n" + "vmlal.u16 q11, d5, d1[3]\n" + "vmlal.u16 q12, d7, d1[0]\n" + "vmlal.u16 q13, d7, d1[1]\n" + "vmlal.u16 q14, d7, d1[2]\n" + "vmlal.u16 q15, d7, d1[3]\n" + + // Loop. Decrement loop index (depth) by 2, since we just handled 2 + // levels of depth (Kernel::kDepth=2). 
+ "subs %[run_depth], #2\n" + "bne loop_NEONKernel12x4Depth2_%=\n" + + /* end of main loop */ + + /* Accumulate our local accumulator registers into the destination block + */ + + // Compute stride between consecutive columns, in bytes + "mov r0, #4\n" // multiply by 4 = sizeof(int32) + "mul %[dst_col_stride], r0\n" + + // If start_depth == 0, then there is no preexisting accumulator + // to accumulate, so we can simply store our result. + "cmp %[start_depth], #0\n" + "beq store_result_NEONKernel12x4Depth2_%=\n" + + "mov r0, %[dst_ptr]\n" + + // Load a column + "mov r1, r0\n" + "vld1.32 {d0, d1}, [r1]!\n" + "vld1.32 {d2, d3}, [r1]!\n" + "vld1.32 {d4, d5}, [r1]!\n" + // Accumulate a column + "vadd.s32 q4, q4, q0\n" + "vadd.s32 q8, q8, q1\n" + "vadd.s32 q12, q12, q2\n" + + "add r0, %[dst_col_stride]\n" + // Load a column + "mov r1, r0\n" + "vld1.32 {d0, d1}, [r1]!\n" + "vld1.32 {d2, d3}, [r1]!\n" + "vld1.32 {d4, d5}, [r1]!\n" + // Accumulate a column + "vadd.s32 q5, q5, q0\n" + "vadd.s32 q9, q9, q1\n" + "vadd.s32 q13, q13, q2\n" + + "add r0, %[dst_col_stride]\n" + // Load a column + "mov r1, r0\n" + "vld1.32 {d0, d1}, [r1]!\n" + "vld1.32 {d2, d3}, [r1]!\n" + "vld1.32 {d4, d5}, [r1]!\n" + // Accumulate a column + "vadd.s32 q6, q6, q0\n" + "vadd.s32 q10, q10, q1\n" + "vadd.s32 q14, q14, q2\n" + + "add r0, %[dst_col_stride]\n" + // Load a column + "mov r1, r0\n" + "vld1.32 {d0, d1}, [r1]!\n" + "vld1.32 {d2, d3}, [r1]!\n" + "vld1.32 {d4, d5}, [r1]!\n" + // Accumulate a column + "vadd.s32 q7, q7, q0\n" + "vadd.s32 q11, q11, q1\n" + "vadd.s32 q15, q15, q2\n" + + "store_result_NEONKernel12x4Depth2_%=:\n" + + "mov r0, %[dst_ptr]\n" + // Store a column + "mov r1, r0\n" + "vst1.32 {d8, d9}, [r1]!\n" + "vst1.32 {d16, d17}, [r1]!\n" + "vst1.32 {d24, d25}, [r1]!\n" + // Store a column + "add r0, %[dst_col_stride]\n" + "mov r1, r0\n" + "vst1.32 {d10, d11}, [r1]!\n" + "vst1.32 {d18, d19}, [r1]!\n" + "vst1.32 {d26, d27}, [r1]!\n" + // Store a column + "add r0, %[dst_col_stride]\n" + "mov r1, r0\n" + "vst1.32 {d12, d13}, [r1]!\n" + "vst1.32 {d20, d21}, [r1]!\n" + "vst1.32 {d28, d29}, [r1]!\n" + // Store a column + "add r0, %[dst_col_stride]\n" + "mov r1, r0\n" + "vst1.32 {d14, d15}, [r1]!\n" + "vst1.32 {d22, d23}, [r1]!\n" + "vst1.32 {d30, d31}, [r1]!\n" + : // outputs + [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), + [dst_ptr] "+r"(dst_ptr), + [run_depth] "+r"(run_depth) + : // inputs + [start_depth] "r"(start_depth), + [dst_col_stride] "r"(dst_col_stride) + : // clobbers + "cc", "memory", "r0", "r1", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + } +}; + +// Our main GEMV kernel. 
+struct NEONKernel8x1Depth4 : KernelBase { + typedef KernelFormat<KernelSideFormat<CellFormat<8, 4>, 1>, + KernelSideFormat<CellFormat<1, 4>, 1> > Format; + + const char* Name() const override { return "NEON, 8x1, straight, depth 4"; } + + void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride, + const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, + int start_depth, int run_depth) const override { + ScopedProfilingLabel label("optimized kernel"); + + assert(dst_row_stride == 1); + + asm volatile( + // Clear accumulator registers (see layout below) + "vmov.s32 q11, #0\n" + "vmov.s32 q12, q11\n" + + /* Main loop */ + + "loop_NEONKernel8x1Depth4_%=:\n" + + // Overview of register layout: + // + // A 4x1 cell of Rhs is stored in 16bit in d0. + // A 8x4 cell Lhs is stored in 16bit in d2--d9 (q1--q4). + // A block of accumulators of size 8x1 is stored in 32bit in q11--q12 + // + // +-----+ + // |d0[0]| + // +-----+ + // |d0[1]| + // +-----+ Rhs + // |d0[2]| + // +-----+ + // |d0[3]| + // +-----+ + // | | + // + // Lhs | | + // + // +--+--+--+--+ - - - - +-----+ + // |d2|d4|d6|d8| | q11 | + // |d2|d4|d6|d8| | q11 | + // |d2|d4|d6|d8| | q11 | + // |d2|d4|d6|d8| | q11 | + // +--+--+--+--+ - - - - +-----+ Accumulator + // |d3|d5|d7|d9| | q12 | + // |d3|d5|d7|d9| | q12 | + // |d3|d5|d7|d9| | q12 | + // |d3|d5|d7|d9| | q12 | + // +--+--+--+--+ - - - - +-----+ + + // Load 1 Rhs cell of size 4x1. + "vldr.32 d0, [%[rhs_ptr]]\n" + "add %[rhs_ptr], #4\n" + + // Load 1 Lhs cell of size 8x4. Each vld1 instruction loads 1 col. + "vld1.8 {d2}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d4}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d6}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d8}, [%[lhs_ptr]:64]!\n" + + // Expand Lhs/Rhs cells to 16 bit. + "vmovl.u8 q0, d0\n" // d1 is unused. + "vmovl.u8 q1, d2\n" + "vmovl.u8 q2, d4\n" + "vmovl.u8 q3, d6\n" + "vmovl.u8 q4, d8\n" + + // Multiply-accumulate, level of depth 0 + "vmlal.u16 q11, d2, d0[0]\n" + "vmlal.u16 q12, d3, d0[0]\n" + + // Multiply-accumulate, level of depth 1 + "vmlal.u16 q11, d4, d0[1]\n" + "vmlal.u16 q12, d5, d0[1]\n" + + // Multiply-accumulate, level of depth 2 + "vmlal.u16 q11, d6, d0[2]\n" + "vmlal.u16 q12, d7, d0[2]\n" + + // Multiply-accumulate, level of depth 3 + "vmlal.u16 q11, d8, d0[3]\n" + "vmlal.u16 q12, d9, d0[3]\n" + + // Loop. Decrement loop index (depth) by 4, since we just handled 4 + // levels + // of depth (Kernel::kDepth=4). + "subs %[run_depth], #4\n" + "bne loop_NEONKernel8x1Depth4_%=\n" + + /* end of main loop */ + + /* Accumulate our local accumulator registers into the destination block + */ + + "cmp %[start_depth], #0\n" + "beq store_result_NEONKernel8x1Depth4_%=\n" + + "mov r0, %[dst_ptr]\n" + // Load a column + "vld1.32 {d0, d1}, [r0]!\n" + "vld1.32 {d2, d3}, [r0]!\n" + + // Accumulate a column + "vadd.s32 q11, q11, q0\n" + "vadd.s32 q12, q12, q1\n" + + "store_result_NEONKernel8x1Depth4_%=:\n" + + "mov r0, %[dst_ptr]\n" + "vst1.32 {d22, d23}, [r0]!\n" + "vst1.32 {d24, d25}, [r0]!\n" + : // outputs + [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), + [run_depth] "+r"(run_depth) + : // inputs + [dst_col_stride] "r"(dst_col_stride), [start_depth] "r"(start_depth), + [dst_ptr] "r"(dst_ptr) + : // clobbers + "cc", "memory", "r0", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! 
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + } +}; + +// Another GEMV kernel, that performs best by itself; +// unfortunately GEMV is dominated by LHS packing and we haven't succeeded +// so far in making efficient packing code for this very wide (width 20) +// format. +struct NEONKernel20x1Depth4 : KernelBase { + typedef KernelFormat<KernelSideFormat<CellFormat<4, 4>, 5>, + KernelSideFormat<CellFormat<1, 4>, 1> > Format; + + const char* Name() const override { return "NEON, 20x1, straight, depth 4"; } + + void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride, + const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, + int start_depth, int run_depth) const override { + ScopedProfilingLabel label("optimized kernel"); + + assert(dst_row_stride == 1); + + asm volatile( + // Clear accumulator registers (see layout below) + "vmov.s32 q11, #0\n" + "vmov.s32 q12, q11\n" + "vmov.s32 q13, q11\n" + "vmov.s32 q14, q11\n" + "vmov.s32 q15, q11\n" + + /* Main loop */ + + "loop_NEONKernel20x1Depth4_%=:\n" + + // Overview of register layout: + // + // A 4x1 cell of Rhs is stored in 16bit in d0. + // 5 cells, of size 4x4 each, are stored in 16bit in d2--d21. + // A block of accumulators of size 20x1 is stored in 32bit + // in q11--q15. + // + // +-----+ + // |d0[0]| + // +-----+ + // |d0[1]| + // +-----+ Rhs + // |d0[2]| + // +-----+ + // |d0[3]| + // +-----+ + // | | + // + // Lhs | | + // + // +---+---+---+---+ - - - - +-----+ + // |d2 |d3 |d4 |d5 | | q11 | + // |d2 |d3 |d4 |d5 | | q11 | + // |d2 |d3 |d4 |d5 | | q11 | + // |d2 |d3 |d4 |d5 | | q11 | + // +---+---+---+---+ - - - - +-----+ + // |d6 |d7 |d8 |d9 | | q12 | + // |d6 |d7 |d8 |d9 | | q12 | + // |d6 |d7 |d8 |d9 | | q12 | + // |d6 |d7 |d8 |d9 | | q12 | + // +---+---+---+---+ - - - - +-----+ + // |d10|d11|d12|d13| | q13 | + // |d10|d11|d12|d13| | q13 | Accumulator + // |d10|d11|d12|d13| | q13 | + // |d10|d11|d12|d13| | q13 | + // +---+---+---+---+ - - - - +-----+ + // |d14|d15|d16|d17| | q14 | + // |d14|d15|d16|d17| | q14 | + // |d14|d15|d16|d17| | q14 | + // |d14|d15|d16|d17| | q14 | + // +---+---+---+---+ - - - - +-----+ + // |d18|d19|d20|d21| | q15 | + // |d18|d19|d20|d21| | q15 | + // |d18|d19|d20|d21| | q15 | + // |d18|d19|d20|d21| | q15 | + // +---+---+---+---+ - - - - +-----+ + + // Load 1 Rhs cell + "vldr.32 d0, [%[rhs_ptr]]\n" + "add %[rhs_ptr], #4\n" + + // Load 10 Lhs cells + "vld1.8 {d2}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d4}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d6}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d8}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d10}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d12}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d14}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d16}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d18}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d20}, [%[lhs_ptr]:64]!\n" + + // Expand Lhs/Rhs cells to 16 bit. 
+ "vmovl.u8 q0, d0\n" + "vmovl.u8 q1, d2\n" + "vmovl.u8 q2, d4\n" + "vmovl.u8 q3, d6\n" + "vmovl.u8 q4, d8\n" + "vmovl.u8 q5, d10\n" + "vmovl.u8 q6, d12\n" + "vmovl.u8 q7, d14\n" + "vmovl.u8 q8, d16\n" + "vmovl.u8 q9, d18\n" + "vmovl.u8 q10, d20\n" + + // Multiply-accumulate, level of depth 0 + "vmlal.u16 q11, d2, d0[0]\n" + "vmlal.u16 q12, d6, d0[0]\n" + "vmlal.u16 q13, d10, d0[0]\n" + "vmlal.u16 q14, d14, d0[0]\n" + "vmlal.u16 q15, d18, d0[0]\n" + + // Multiply-accumulate, level of depth 1 + "vmlal.u16 q11, d3, d0[1]\n" + "vmlal.u16 q12, d7, d0[1]\n" + "vmlal.u16 q13, d11, d0[1]\n" + "vmlal.u16 q14, d15, d0[1]\n" + "vmlal.u16 q15, d19, d0[1]\n" + + // Multiply-accumulate, level of depth 2 + "vmlal.u16 q11, d4, d0[2]\n" + "vmlal.u16 q12, d8, d0[2]\n" + "vmlal.u16 q13, d12, d0[2]\n" + "vmlal.u16 q14, d16, d0[2]\n" + "vmlal.u16 q15, d20, d0[2]\n" + + // Multiply-accumulate, level of depth 3 + "vmlal.u16 q11, d5, d0[3]\n" + "vmlal.u16 q12, d9, d0[3]\n" + "vmlal.u16 q13, d13, d0[3]\n" + "vmlal.u16 q14, d17, d0[3]\n" + "vmlal.u16 q15, d21, d0[3]\n" + + // Loop. Decrement loop index (depth) by 4, since we just handled 4 + // levels + // of depth (Kernel::kDepth=4). + "subs %[run_depth], #4\n" + "bne loop_NEONKernel20x1Depth4_%=\n" + + /* end of main loop */ + + /* Accumulate our local accumulator registers into the destination block + */ + + "cmp %[start_depth], #0\n" + "beq store_result_NEONKernel20x1Depth4_%=\n" + + "mov r0, %[dst_ptr]\n" + // Load a column + "vld1.32 {d0, d1}, [r0]!\n" + "vld1.32 {d2, d3}, [r0]!\n" + "vld1.32 {d4, d5}, [r0]!\n" + "vld1.32 {d6, d7}, [r0]!\n" + "vld1.32 {d8, d9}, [r0]!\n" + + // Accumulate a column + "vadd.s32 q11, q11, q0\n" + "vadd.s32 q12, q12, q1\n" + "vadd.s32 q13, q13, q2\n" + "vadd.s32 q14, q14, q3\n" + "vadd.s32 q15, q15, q4\n" + + "store_result_NEONKernel20x1Depth4_%=:\n" + + "mov r0, %[dst_ptr]\n" + "vst1.32 {d22, d23}, [r0]!\n" + "vst1.32 {d24, d25}, [r0]!\n" + "vst1.32 {d26, d27}, [r0]!\n" + "vst1.32 {d28, d29}, [r0]!\n" + "vst1.32 {d30, d31}, [r0]!\n" + : // outputs + [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), + [run_depth] "+r"(run_depth) + : // inputs + [dst_ptr] "r"(dst_ptr), [dst_col_stride] "r"(dst_col_stride), + [start_depth] "r"(start_depth) + : // clobbers + "cc", "memory", "r0", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + } +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_KERNEL_NEON_H_ diff --git a/internal/kernel_reference.h b/internal/kernel_reference.h new file mode 100644 index 0000000..2c65087 --- /dev/null +++ b/internal/kernel_reference.h @@ -0,0 +1,119 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// kernel_reference.h: a reference kernel for CPU architectures where we don't +// have optimized kernels yet. Also useful for testing, as it's templatized +// to have any arbitrary format, allowing tests to cover all sorts of corner +// cases. + +#ifndef GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_ +#define GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_ + +#include "internal/kernel.h" + +#include <cstring> +#include <cstdio> + +namespace gemmlowp { + +// This kernel is templatized in an arbitrary Format template parameter, +// allowing it to have any arbitrary format. +template <typename tFormat> +struct ReferenceKernel : KernelBase { + typedef tFormat Format; + + const char* Name() const override { + static char buf[256]; + snprintf(buf, sizeof(buf), + "reference(Lhs: %d cells %dx%d %s, Rhs: %d cells %dx%d %s)", + Format::Lhs::kCells, Format::Lhs::Cell::kWidth, + Format::Lhs::Cell::kDepth, + CellOrderName(Format::Lhs::Cell::kOrder), Format::Rhs::kCells, + Format::Rhs::Cell::kDepth, Format::Rhs::Cell::kWidth, + CellOrderName(Format::Rhs::Cell::kOrder)); + return buf; + } + + void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride, + const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, + int start_depth, int run_depth) const override { + std::int32_t accumulator[Format::kRows * Format::kCols]; + memset(accumulator, 0, sizeof(accumulator)); + + const int run_depth_cells = run_depth / Format::kDepth; + + // The outer loop is over the depth dimension. + for (int dc = 0; dc < run_depth_cells; dc++) { + // The next two loops are over cells of the Lhs (stacked vertically), + // and over cells of the Rhs (stacked horizontally). + for (int rc = 0; rc < Format::Lhs::kCells; rc++) { + const std::uint8_t* lhs_cell_ptr = lhs_ptr + + (dc * Format::Lhs::kCells + rc) * + Format::Lhs::Cell::kWidth * + Format::kDepth; + for (int cc = 0; cc < Format::Rhs::kCells; cc++) { + const std::uint8_t* rhs_cell_ptr = rhs_ptr + + (dc * Format::Rhs::kCells + cc) * + Format::Rhs::Cell::kWidth * + Format::kDepth; + + // Now we are inside one cell of the Lhs and inside one cell + // of the Rhs, so the remaining inner loops are just + // traditional three loops of matrix multiplication. + for (int di = 0; di < Format::kDepth; di++) { + for (int ri = 0; ri < Format::Lhs::Cell::kWidth; ri++) { + for (int ci = 0; ci < Format::Rhs::Cell::kWidth; ci++) { + const std::uint8_t* lhs_coeff_ptr = + lhs_cell_ptr + + OffsetIntoCell<typename Format::Lhs::Cell>(ri, di); + const std::uint8_t* rhs_coeff_ptr = + rhs_cell_ptr + + OffsetIntoCell<typename Format::Rhs::Cell>(ci, di); + std::int32_t* accumulator_coeff_ptr = + accumulator + (ri + rc * Format::Lhs::Cell::kWidth) + + (ci + cc * Format::Rhs::Cell::kWidth) * Format::kRows; + *accumulator_coeff_ptr += + std::int32_t(*lhs_coeff_ptr) * std::int32_t(*rhs_coeff_ptr); + } + } + } + } + } + } + + if (start_depth == 0) { + // start_depth == 0 means we haven't accumulated anything yet, so we need + // to overwrite the accumulator, as it hasn't been initialized to zero. + for (int r = 0; r < Format::kRows; r++) { + for (int c = 0; c < Format::kCols; c++) { + dst_ptr[r * dst_row_stride + c * dst_col_stride] = + accumulator[r + c * Format::kRows]; + } + } + } else { + // We have already accumulated stuff, so we need to continue accumulating + // instead of just overwriting. 
+ for (int r = 0; r < Format::kRows; r++) { + for (int c = 0; c < Format::kCols; c++) { + dst_ptr[r * dst_row_stride + c * dst_col_stride] += + accumulator[r + c * Format::kRows]; + } + } + } + } +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_ diff --git a/internal/multi_thread_gemm.h b/internal/multi_thread_gemm.h new file mode 100644 index 0000000..b008034 --- /dev/null +++ b/internal/multi_thread_gemm.h @@ -0,0 +1,496 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// multi_thread_gemm.h: Multi-threaded GEMM entry point. +// Readers note: To understand this file, it is useful to first +// read and understand the much simpler single_thread_gemm.h. + +#ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ +#define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ + +#include <pthread.h> +#include <unistd.h> +#include <vector> + +#include "internal/single_thread_gemm.h" + +namespace gemmlowp { + +// A BlockingCounter lets one thread to wait for N events to occur. +// This is how the master thread waits for all the worker threads +// to have finished working. +class BlockingCounter { + public: + BlockingCounter() + : cond_(PTHREAD_COND_INITIALIZER), + mutex_(PTHREAD_MUTEX_INITIALIZER), + count_(0) {} + + // Sets/resets the counter; initial_count is the number of + // decrementing events that the Wait() call will be waiting for. + void Reset(int initial_count) { + pthread_mutex_lock(&mutex_); + assert(count_ == 0); + count_ = initial_count; + pthread_mutex_unlock(&mutex_); + } + + // Decrements the counter; if the counter hits zero, signals + // the thread that was waiting for that, and returns true. + // Otherwise (if the decremented count is still nonzero), + // returns false. + bool DecrementCount() { + pthread_mutex_lock(&mutex_); + assert(count_ > 0); + count_--; + if (count_ == 0) { + pthread_cond_signal(&cond_); + } + bool retval = count_ == 0; + pthread_mutex_unlock(&mutex_); + return retval; + } + + // Waits for the N other threads (N having been set by Reset()) + // to hit the BlockingCounter. + void Wait() { + ScopedProfilingLabel label("BlockingCounter::Wait"); + pthread_mutex_lock(&mutex_); + while (count_) { + pthread_cond_wait(&cond_, &mutex_); + } + pthread_mutex_unlock(&mutex_); + } + + private: + pthread_cond_t cond_; + pthread_mutex_t mutex_; + int count_; +}; + +// A workload for a worker. +struct Task { + Task() : local_allocator(nullptr) {} + virtual ~Task() {} + virtual void Run() const = 0; + Allocator* local_allocator; +}; + +// A worker thread. +class Worker { + public: + enum class State { + ThreadStartup, // The initial state before the thread main loop runs. + Ready, // Is not working, has not yet received new work to do. + HasWork, // Has work to do. + ExitAsSoonAsPossible // Should exit at earliest convenience. 
+  };
+
+  explicit Worker(BlockingCounter* counter_to_decrement_when_ready)
+      : task_(nullptr),
+        state_cond_(PTHREAD_COND_INITIALIZER),
+        state_mutex_(PTHREAD_MUTEX_INITIALIZER),
+        state_(State::ThreadStartup),
+        counter_to_decrement_when_ready_(counter_to_decrement_when_ready) {
+    pthread_create(&thread_, nullptr, ThreadFunc, this);
+  }
+
+  ~Worker() {
+    ChangeState(State::ExitAsSoonAsPossible);
+    pthread_join(thread_, nullptr);
+  }
+
+  // Changes State; may be called from either the worker thread
+  // or the master thread; however, not all state transitions are legal,
+  // which is guarded by assertions.
+  void ChangeState(State new_state) {
+    ScopedProfilingLabel label("Worker::ChangeState");
+    pthread_mutex_lock(&state_mutex_);
+    assert(new_state != state_);
+    switch (state_) {
+      case State::ThreadStartup:
+        assert(new_state == State::Ready);
+        break;
+      case State::Ready:
+        assert(new_state == State::HasWork ||
+               new_state == State::ExitAsSoonAsPossible);
+        break;
+      case State::HasWork:
+        assert(new_state == State::Ready ||
+               new_state == State::ExitAsSoonAsPossible);
+        break;
+      default:
+        abort();
+    }
+    state_ = new_state;
+    pthread_cond_signal(&state_cond_);
+    if (state_ == State::Ready) {
+      counter_to_decrement_when_ready_->DecrementCount();
+    }
+    pthread_mutex_unlock(&state_mutex_);
+  }
+
+  // Thread entry point.
+  void ThreadFunc() {
+    ScopedProfilingLabel label("Worker::ThreadFunc");
+    RegisterCurrentThreadForProfiling();
+
+    ChangeState(State::Ready);
+
+    // Thread main loop
+    while (true) {
+      // Get a state to act on
+      pthread_mutex_lock(&state_mutex_);
+      switch (state_) {
+        case State::ExitAsSoonAsPossible:
+        case State::HasWork:
+          break;
+        case State::Ready:
+          // In the 'Ready' state, we have nothing to do but to wait until
+          // we switch to another state.
+          while (state_ == State::Ready) {
+            ScopedProfilingLabel label("Worker::ThreadFunc waiting");
+            pthread_cond_wait(&state_cond_, &state_mutex_);
+          }
+          break;
+        default:
+          abort();
+      }
+      State state_to_act_upon = state_;
+      pthread_mutex_unlock(&state_mutex_);
+
+      // We now have a state to act on, so act.
+      switch (state_to_act_upon) {
+        case State::HasWork:
+          // Got work to do! So do it, and then revert to 'Ready' state.
+          assert(task_);
+          task_->Run();
+          delete task_;
+          task_ = nullptr;
+          ChangeState(State::Ready);
+          break;
+        case State::ExitAsSoonAsPossible:
+          return;
+        default:
+          abort();
+      }
+    }
+  }
+
+  static void* ThreadFunc(void* arg) {
+    static_cast<Worker*>(arg)->ThreadFunc();
+    return nullptr;
+  }
+
+  // Called by the master thread to give this worker work to do.
+  // It is only legal to call this if the worker is currently in the
+  // 'Ready' state.
+  void StartWork(Task* task) {
+    assert(!task_);
+    task->local_allocator = &local_allocator_;
+    task_ = task;
+    assert(state_ == State::Ready);
+    ChangeState(State::HasWork);
+  }
+
+ private:
+  // The underlying thread.
+  pthread_t thread_;
+
+  // The task to be worked on.
+  const Task* task_;
+
+  // The condition variable and mutex guarding state changes.
+  pthread_cond_t state_cond_;
+  pthread_mutex_t state_mutex_;
+
+  // The state enum tells if we're currently working, waiting for work, etc.
+  State state_;
+
+  // Each thread has a local allocator so it can allocate temporary
+  // buffers without blocking the other threads.
+  Allocator local_allocator_;
+
+  // Pointer to the master thread's BlockingCounter object, used to notify
+  // the master thread when this worker switches to the 'Ready' state.
+ BlockingCounter* const counter_to_decrement_when_ready_; +}; + +// A very simple pool of workers, that only allows the very +// specific parallelization pattern that we use here: +// a fixed number of workers can be given work, and one then +// waits for all of them to finish. +class WorkersPool { + public: + WorkersPool() {} + + ~WorkersPool() { + for (auto w : workers_) { + delete w; + } + } + + BlockingCounter& counter_to_decrement_when_ready() { + return counter_to_decrement_when_ready_; + } + + // Give work to a specific worker. + void StartWorker(int index, Task* task_) { + assert(static_cast<std::size_t>(index) < workers_.size()); + workers_[index]->StartWork(task_); + } + + // Ensures that the pool has at least the given count of workers. + // If any new worker has to be created, this function waits for it to + // be ready. + void CreateWorkers(std::size_t workers_count) { + if (workers_.size() >= workers_count) { + return; + } + counter_to_decrement_when_ready_.Reset(workers_count - workers_.size()); + while (workers_.size() < workers_count) { + workers_.push_back(new Worker(&counter_to_decrement_when_ready_)); + } + counter_to_decrement_when_ready_.Wait(); + } + + private: + // copy construction disallowed + WorkersPool(const WorkersPool&) = delete; + + // The workers in this pool. They are owned by the pool: + // the pool creates workers and destroys them in its destructor. + std::vector<Worker*> workers_; + + // The BlockingCounter used to wait for the workers. + BlockingCounter counter_to_decrement_when_ready_; +}; + +// The task we use to implement a multi-threaded Gemm: a block of the +// RHS has been packed by the master thread; each worker thread +// then has to pack a block of the LHS and accumulate the Gemm of these +// packed LHS and RHS blocks. 
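As a minimal sketch of how the pieces above fit together (the helper DispatchAndWait and the DummyTask type here are hypothetical; the real driver is MultiThreadGemm further down in this file), the master thread's handshake with the pool might look like this:

  struct DummyTask : Task {
    void Run() const override { /* one worker's share of the work */ }
  };

  void DispatchAndWait(WorkersPool* pool, int n) {
    pool->CreateWorkers(n);                            // ensure n workers exist and are Ready
    pool->counter_to_decrement_when_ready().Reset(n);  // expect n 'Ready' notifications
    for (int i = 0; i < n; i++) {
      pool->StartWorker(i, new DummyTask);             // the worker deletes the task after running it
    }
    pool->counter_to_decrement_when_ready().Wait();    // block until every worker is Ready again
  }

GemmWithPackedRhsTask, defined next, is the concrete Task that carries one worker's share of the Gemm.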
+template <typename KernelFormat, typename Scalar, MapOrder LhsOrder,
+          MapOrder RhsOrder, MapOrder ResultOrder>
+struct GemmWithPackedRhsTask : Task {
+  GemmWithPackedRhsTask(
+      const KernelBase& _kernel, const MatrixMap<const Scalar, LhsOrder>& _lhs,
+      const PackedSideBlock<typename KernelFormat::Rhs>& _packed_rhs,
+      MatrixMap<Scalar, ResultOrder>* _result, int _lhs_offset, int _rhs_offset,
+      int _result_offset, int _result_mult_int, int _result_shift)
+      : kernel(_kernel),
+        lhs(_lhs),
+        packed_rhs(_packed_rhs),
+        result(*_result),
+        lhs_offset(_lhs_offset),
+        rhs_offset(_rhs_offset),
+        result_offset(_result_offset),
+        result_mult_int(_result_mult_int),
+        result_shift(_result_shift) {}
+
+  void Run() const override {
+    ScopedProfilingLabel label("GemmWithPackedRhsTask");
+
+    const int rows = result.rows();
+    const int cols = result.cols();
+    const int depth = lhs.cols();
+
+    BlockParams block_params;
+    block_params.Init<KernelFormat>(rows, cols, depth, 1);
+
+    PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(
+        Side::Lhs, local_allocator, block_params, rhs_offset);
+
+    PackedResultInt32 packed_result(local_allocator, block_params);
+
+    local_allocator->Commit();
+
+    for (int c = 0; c < cols; c += block_params.l2_cols) {
+      int cs = std::min(block_params.l2_cols, cols - c);
+
+      for (int r = 0; r < rows; r += block_params.l2_rows) {
+        int rs = std::min(block_params.l2_rows, rows - r);
+
+        PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));
+
+        Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs);
+
+        auto result_block = result.block(r, c, rs, cs);
+        UnpackResult(&result_block, packed_result, packed_lhs, packed_rhs,
+                     depth, result_offset, result_mult_int, result_shift);
+      }
+    }
+
+    local_allocator->Decommit();
+  }
+
+  const KernelBase& kernel;
+  const MatrixMap<const Scalar, LhsOrder> lhs;
+  const PackedSideBlock<typename KernelFormat::Rhs> packed_rhs;
+  MatrixMap<Scalar, ResultOrder> result;
+  int lhs_offset;
+  int rhs_offset;
+  int result_offset;
+  int result_mult_int;
+  int result_shift;
+};
+
+class MultiThreadGemmContext : public SingleThreadGemmContext {
+ public:
+  MultiThreadGemmContext() : max_num_threads_(0) {}
+
+  void set_max_num_threads(int n) { max_num_threads_ = n; }
+
+  int max_num_threads() const { return max_num_threads_; }
+
+  WorkersPool* workers_pool() { return &workers_pool_; }
+
+ protected:
+  // The workers pool used by MultiThreadGemm. Making
+  // this part of the context allows it to be persistent,
+  // avoiding recreating threads on every Gemm.
+  WorkersPool workers_pool_;
+
+  // The maximum number of worker threads to use (in addition
+  // to the master thread).
+  // The default value 0 means the default behavior of
+  // detecting the number of hardware threads. Nonzero values mean
+  // skipping and overriding hardware detection.
+  int max_num_threads_;
+};
+
+// Determines how many worker threads should be used for a given Gemm
+// operation.
+template <int KernelRows>
+inline int HowManyWorkers(MultiThreadGemmContext* context, int rows, int cols,
+                          int depth) {
+  // First check if the user set an explicit maximum number of threads.
+  int max_count = context->max_num_threads();
+  if (!max_count) {
+    // No user-set maximum number of threads, so we need to
+    // do some hardware detection.
+    // This is expensive to query so we do it only once.
+    // Too bad for dynamicness. Also, we don't use the c++11 standard getter
+    // because Google's coding style currently bans #include <thread>.
+ static const int hardware_threads_count = + static_cast<int>(sysconf(_SC_NPROCESSORS_CONF)); + + max_count = hardware_threads_count; + } + + // Basic calculation: take into account max pool size, and + // how many rows we have to feed our kernel. + int workers_count = std::min(max_count, CeilQuotient(rows, KernelRows)); + + // At this point for small products we already have workers_count==1 so + // we can avoid doing more work; otherwise, we still want to check + // that the cubic size (rows*cols*depth) is big enough to keep + // workers_ busy. + if (workers_count > 1) { + // Empirically determined value. + static const int min_cubic_size_per_thread = 32 * 1024; + + // We can only multiply two out of three sizes without risking overflow + int cols_times_depth = cols * depth; + + if (cols_times_depth < min_cubic_size_per_thread) { + // in that case, we can multiply by rows without risking overflow + int cubic_size = rows * cols_times_depth; + workers_count = std::min( + workers_count, CeilQuotient(cubic_size, min_cubic_size_per_thread)); + } + } + + assert(workers_count > 0 && workers_count <= max_count); + return workers_count; +} + +// The main multi-threaded Gemm function. +// To understand it, first read the code of SingleThreadedGemm(). +// The parallelization scheme used here is to have this master function +// pack a block of RHS and then start worker threads to pack a block of LHS +// each, and accumulate the corresponding products. +template <typename KernelFormat, typename Scalar, MapOrder LhsOrder, + MapOrder RhsOrder, MapOrder ResultOrder> +void MultiThreadGemm(MultiThreadGemmContext* context, const KernelBase& kernel, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + ScopedProfilingLabel label("gemmlowp::MultiThreadGemm"); + + assert(lhs.cols() == rhs.rows()); + + int rows = result->rows(); + int cols = result->cols(); + int depth = lhs.cols(); + + const int workers_count = + HowManyWorkers<KernelFormat::kRows>(context, rows, cols, depth); + if (workers_count == 1) { + return SingleThreadGemm<KernelFormat, Scalar, LhsOrder, RhsOrder, + ResultOrder>(context, kernel, lhs, rhs, result, + lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); + } + assert(workers_count > 1); + + Allocator* allocator = context->allocator(); + WorkersPool* workers_pool = context->workers_pool(); + + workers_pool->CreateWorkers(workers_count); + + BlockParams block_params; + block_params.Init<KernelFormat>(rows, cols, depth, workers_count); + + PackedSideBlock<typename KernelFormat::Rhs> packed_rhs( + Side::Rhs, allocator, block_params, lhs_offset); + allocator->Commit(); + + // We loop over large blocks of the RHS. + for (int c = 0; c < cols; c += block_params.l2_cols) { + int cs = std::min(block_params.l2_cols, cols - c); + + // Pack a large block of the RHS. + PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); + + // Give work to each worker. 
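      // For illustration (hypothetical numbers): with rows = 100,
      // KernelFormat::kRows = 12 and workers_count = 3, the partition computed
      // below is next_start_row = 36, 72, 100, i.e. row blocks of 36, 36 and
      // 28 rows. Every boundary is rounded up to a multiple of the kernel
      // height, so only the last block can have a ragged number of rows.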
+ int next_start_row = 0; + workers_pool->counter_to_decrement_when_ready().Reset(workers_count); + for (int w = 0; w < workers_count; w++) { + int start_row = next_start_row; + next_start_row = std::min( + rows, RoundUp<KernelFormat::kRows>(rows * (w + 1) / workers_count)); + + int block_rows = next_start_row - start_row; + auto lhs_block = lhs.block(start_row, 0, block_rows, depth); + auto result_block = result->block(start_row, c, block_rows, cs); + typedef GemmWithPackedRhsTask<KernelFormat, Scalar, LhsOrder, RhsOrder, + ResultOrder> TaskType; + + auto task = new TaskType(kernel, lhs_block, packed_rhs, &result_block, + lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); + workers_pool->StartWorker(w, task); + } + // Wait for the workers. + workers_pool->counter_to_decrement_when_ready().Wait(); + } + + allocator->Decommit(); +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ diff --git a/internal/pack.h b/internal/pack.h new file mode 100644 index 0000000..b37d505 --- /dev/null +++ b/internal/pack.h @@ -0,0 +1,350 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// pack.h: packing blocks of the LHS and RHS into the data layout +// that is expected by compute.h and eventually by kernels. +// Because this data layout depends on the kernel format, code here +// is templated in KernelLhsFormat/KernelRhsFormat. +// +// Readers note: an important theme around here is that we try hard +// to handle both Lhs and Rhs with a single piece of code. We indifferently +// refer to the Lhs and Rhs as a 'Side'. Instead of addressing matrices +// by (row, column) indices, we address them by (width, depth), as explained +// in kernel.h. This allows us to handle both Lhs and Rhs on an equal footing, +// at once. + +#ifndef GEMMLOWP_INTERNAL_PACK_H_ +#define GEMMLOWP_INTERNAL_PACK_H_ + +#include <cstring> + +#include "internal/block_params.h" +#include "internal/kernel.h" +#include "internal/common.h" +#include "internal/allocator.h" + +namespace gemmlowp { + +// A PackedSideBlock instance is a packed block of either the LHS or RHS +// (whence the generic 'Side' name). +// +// 'Packed' means that it is laid out in the storage order that +// is expected by the specified kernel format. From a block of the input +// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs() +// or PackRhs(). 
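One remark on the 'rank one update' buffers that travel with each packed block (inferred from how they are filled here and from the multipliers passed in multi_thread_gemm.h; their consumer, UnpackResult, is not part of this file): the packing stage also accumulates, for each width index, the sum of that row's (Lhs) or column's (Rhs) entries, scaled by the other side's offset (rhs_offset for the packed Lhs, lhs_offset for the packed Rhs). That is enough to apply the quantization offsets cheaply later, because

  sum_d (lhs(r,d) + lhs_offset) * (rhs(d,c) + rhs_offset)
    = sum_d lhs(r,d) * rhs(d,c)
      + rhs_offset * sum_d lhs(r,d)       // the packed Lhs block's rank_one_update[r]
      + lhs_offset * sum_d rhs(d,c)       // the packed Rhs block's rank_one_update[c]
      + lhs_offset * rhs_offset * depth

and the two middle terms only need the per-row / per-column sums that PackUnalignedCell below (and the NEON paths in pack_neon.h) compute on the fly.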
+template <typename KernelSideFormat> +class PackedSideBlock { + public: + PackedSideBlock(Side side, Allocator* allocator, + const BlockParams& block_params, + int rank_one_update_multiplier) + : allocator_(allocator), + rank_one_update_multiplier_(rank_one_update_multiplier), + pos_(0) { + GetSideBlockParams(side, ¶ms_, block_params); + data_handle_ = + allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth); + rank_one_update_handle_ = + allocator_->Reserve<std::int32_t>(params_.l2_width); + } + + ~PackedSideBlock() {} + + void seek_run(int start_width, int start_depth) const { + int kernel_run_depth = + std::min<int>(params_.l1_depth, params_.l2_depth - start_depth); + pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth; + } + + void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; } + + void seek_forward_n_cells(int n) const { + pos_ += n * KernelSideFormat::Cell::kSize; + } + + const std::uint8_t* current_data() const { + return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_; + } + + std::uint8_t* current_data() { + return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_; + } + + std::int32_t* rank_one_update() { + return allocator_->GetPointer<std::int32_t>(rank_one_update_handle_); + } + + const std::int32_t* rank_one_update() const { + return allocator_->GetPointer<const std::int32_t>(rank_one_update_handle_); + } + + std::int32_t rank_one_update_multiplier() const { + return rank_one_update_multiplier_; + } + + const SideBlockParams& params() const { return params_; } + + private: + // The block size parameters that this PackedSizeBlock follows. + // The L2 parameters determine its overall size, while the L1 parameters, + // together with the kernel format template parameter, determine + // the fine details of the storage/traversal order. + SideBlockParams params_; + + // Pointer to the allocator provided by the caller. Not owned. + // The Allocator is assumed to outlive the PackedSideBlock. + Allocator* const allocator_; + + // Handle on the buffer backing this packed block. Owned. + Allocator::Handle data_handle_; + + // Handle on the additional buffer backing the rank-one-update vector + // associated with this block. Owned. + Allocator::Handle rank_one_update_handle_; + + // The constant multiplier of the rank one update vector. + std::int32_t rank_one_update_multiplier_; + + // pos_ is the current position in the buffer, which we access + // sequentially, like a file. + // The idea is that we pack data in the same order as it is + // going to be traversed during the computation, which for + // cache-friendliness reasons is complicated to random-access, + // as the offsets calculations would be intricate. So we + // give up random-access addressing, and instead content ourselves + // with sequential access. + // + // pos_ is mutable because during the computation we will want to + // be able to iterate on the data in a const PackedSideBlock. + mutable int pos_; +}; + +// WidthMajor and DepthMajor are custom phrases modelled after the +// standard terminology 'row-major' and 'column-major'. Their meaning +// should be transparent once one has read the explanation in kernel.h: +// for example, in the Lhs, the 'width' dimension is the rows dimension, +// so there WidthMajor means RowMajor, while in the Rhs it is the opposite. +// Another way to put it: WidthMajor means that contiguous storage is used +// for entries having the same 'width' index. 
+enum class SideMapOrder { WidthMajor, DepthMajor }; + +// Similar to MatrixMap from map.h, but in terms of width/depth instead of +// rows/columns. Used to address blocks of the input LHS/RHS matrices when +// packing them. +template <typename tScalar, SideMapOrder tOrder> +class SideMap { + public: + typedef tScalar Scalar; + static const SideMapOrder kOrder = tOrder; + + SideMap(Scalar* data, int width, int depth, int stride) + : data_(data), width_(width), depth_(depth), stride_(stride) {} + + SideMap(const SideMap& other) + : data_(other.data_), + width_(other.width_), + depth_(other.depth_), + stride_(other.stride_) {} + + int width() const { return width_; } + int depth() const { return depth_; } + int stride() const { return stride_; } + int width_stride() const { + return kOrder == SideMapOrder::DepthMajor ? 1 : stride_; + } + int depth_stride() const { + return kOrder == SideMapOrder::WidthMajor ? 1 : stride_; + } + Scalar* data() const { return data_; } + Scalar* data(int w, int d) const { + return data_ + w * width_stride() + d * depth_stride(); + } + Scalar operator()(int w, int d) const { return *data(w, d); } + Scalar& operator()(int w, int d) { return *data(w, d); } + + SideMap block(int start_width, int start_depth, int block_width, + int block_depth) const { + assert(start_width >= 0); + assert(start_width + block_width <= width_); + assert(start_depth >= 0); + assert(start_depth + block_depth <= depth_); + + return SideMap(data(start_width, start_depth), block_width, block_depth, + stride_); + } + + private: + Scalar* data_; // not owned. + int width_, depth_, stride_; +}; + +// Generic (slow) packing code. +template <typename SrcMapType, typename KernelSideFormat> +class PackSideBlockImplGeneric { + public: + typedef typename KernelSideFormat::Cell CellFormat; + static const int kLhsCells = KernelSideFormat::kCells; + static const int kCellWidth = CellFormat::kWidth; + static const int kKernelWidth = CellFormat::kWidth * kLhsCells; + static const int kCellDepth = CellFormat::kDepth; + + virtual ~PackSideBlockImplGeneric() {} + + PackSideBlockImplGeneric(PackedSideBlock<KernelSideFormat>* packed_side_block, + const SrcMapType& src_map) + : packed_side_block_(packed_side_block), src_map_(src_map) {} + + // The public entry point to pack a block. + void PackL2() { + memset(packed_side_block_->rank_one_update(), 0, + sizeof(std::int32_t) * packed_side_block_->params().l2_width); + for (int d = 0; d < src_map_.depth(); + d += packed_side_block_->params().l1_depth) { + int ds = std::min<int>(packed_side_block_->params().l1_depth, + src_map_.depth() - d); + + for (int w = 0; w < src_map_.width(); + w += packed_side_block_->params().l1_width) { + int ws = std::min<int>(packed_side_block_->params().l1_width, + src_map_.width() - w); + + PackL1(w, ws, d, ds); + } + } + } + + PackedSideBlock<KernelSideFormat>* packed_side_block() const { + return packed_side_block_; + } + + const SrcMapType& src_map() const { return src_map_; } + + protected: + // PackRun packs only a run i.e. is the inner loop in the depth dimension. + // This is what subclasses may override to provide optimized code paths. + // Optimized implementations may still fall back to this generic code + // to handle unaligned boundaries. 
+ virtual void PackRun(int start_width, int width, int start_depth, int depth) { + for (int d = 0; d < depth; d += kDefaultCacheLineSize) { + for (int w = 0; w < width; w++) { + Prefetch(src_map_.data(start_width + w, start_depth + d)); + } + } + for (int d = 0; d < depth; d += kCellDepth) { + // The next loop's boundary is kKernelWidth, not width, + // because we always pack whole kernels so that the + // compute stage doesn't need to worry about unaligned kernel sizes. + for (int w = 0; w < +kKernelWidth; w += kCellWidth) { + PackUnalignedCell(start_width + w, start_depth + d); + } + } + } + + private: + // The intermediate-level loops, between PackL2 and PackRun. + void PackL1(int start_width, int width, int start_depth, int depth) { + for (int w = 0; w < width; w += kKernelWidth) { + int ws = std::min(+kKernelWidth, width - w); + packed_side_block_->seek_run(start_width + w, start_depth); + PackRun(start_width + w, ws, start_depth, depth); + } + } + + // Reference un-optimized implementation of the packing of a cell; + // also serves as a fallback to handle unaligned edges. + void PackUnalignedCell(int start_width, int start_depth) { + std::uint8_t* dst_ptr = packed_side_block_->current_data(); + std::int32_t* dst_rank_one_update = + packed_side_block_->rank_one_update() + start_width; + std::int32_t dst_rank_one_update_multiplier = + packed_side_block_->rank_one_update_multiplier(); + + memset(dst_ptr, 0, sizeof(std::uint8_t) * CellFormat::kSize); + + if (start_width < src_map_.width() && start_depth < src_map_.depth()) { + int width = std::min<int>(+kCellWidth, src_map_.width() - start_width); + int depth = std::min<int>(+kCellDepth, src_map_.depth() - start_depth); + auto src_block = src_map_.block(start_width, start_depth, width, depth); + + for (int w = 0; w < width; w++) { + for (int d = 0; d < depth; d++) { + std::uint8_t s = src_block(w, d); + dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = s; + dst_rank_one_update[w] += s * dst_rank_one_update_multiplier; + } + } + } + + packed_side_block_->seek_next_cell(); + } + + // The PackedSideBlock being packed, i.e. the 'destination'. + PackedSideBlock<KernelSideFormat>* const packed_side_block_; + + // A map on the block of the original matrix block being packed, + // i.e. the 'source'. + const SrcMapType& src_map_; +}; + +// The packing code that we actually use. Defaults to using the above +// generic code; optimized paths can be inserted by specializing this +// template. See e.g. pack_neon.h. 
+template <typename SrcMapType, typename KernelSideFormat> +class PackSideBlockImpl + : public PackSideBlockImplGeneric<SrcMapType, KernelSideFormat> { + public: + typedef PackSideBlockImplGeneric<SrcMapType, KernelSideFormat> Base; + + PackSideBlockImpl(PackedSideBlock<KernelSideFormat>* packed_side_block, + const SrcMapType& src_map) + : Base(packed_side_block, src_map) {} +}; + +// Packs a block of the input LHS matrix, into a PackedSideBlock +template <typename KernelSideFormat, typename MatrixMapType> +void PackLhs(PackedSideBlock<KernelSideFormat>* dst, const MatrixMapType& src) { + ScopedProfilingLabel label("pack LHS"); + static_assert(MatrixMapType::kOrder == MapOrder::RowMajor, + "only row-major LHS is supported at the moment."); + static const SideMapOrder kSideMapOrder = SideMapOrder::WidthMajor; + typedef typename MatrixMapType::Scalar Scalar; + typedef SideMap<Scalar, kSideMapOrder> SideMapType; + SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride()); + typedef PackSideBlockImpl<SideMapType, KernelSideFormat> ImplType; + ImplType impl(dst, src_side_map); + impl.PackL2(); +} + +// Packs a block of the input RHS matrix, into a PackedSideBlock +template <typename KernelSideFormat, typename MatrixMapType> +void PackRhs(PackedSideBlock<KernelSideFormat>* dst, const MatrixMapType& src) { + ScopedProfilingLabel label("pack RHS"); + static_assert(MatrixMapType::kOrder == MapOrder::ColMajor, + "only col-major RHS is supported at the moment."); + static const SideMapOrder kSideMapOrder = SideMapOrder::WidthMajor; + typedef typename MatrixMapType::Scalar Scalar; + typedef SideMap<Scalar, kSideMapOrder> SideMapType; + SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride()); + typedef PackSideBlockImpl<SideMapType, KernelSideFormat> ImplType; + ImplType impl(dst, src_side_map); + impl.PackL2(); +} + +} // namespace gemmlowp + +#ifdef GEMMLOWP_NEON +#include "internal/pack_neon.h" +#endif + +#endif // GEMMLOWP_INTERNAL_PACK_H_ diff --git a/internal/pack_neon.h b/internal/pack_neon.h new file mode 100644 index 0000000..c437489 --- /dev/null +++ b/internal/pack_neon.h @@ -0,0 +1,938 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// pack_neon.h: optimized NEON specializations of the templates in pack.h. + +#ifndef GEMMLOWP_INTERNAL_PACK_NEON_H_ +#define GEMMLOWP_INTERNAL_PACK_NEON_H_ + +#include "internal/pack.h" + +#include <arm_neon.h> + +namespace gemmlowp { + +// Specialization for 3 Cells of width 4, depth 2. +// This is the LHS format used by NEONKernel12x4Depth2. 
+typedef KernelSideFormat<CellFormat<4, 2>, 3> SideFormat3Cells4x2; +template <typename SrcMapType> +class PackSideBlockImpl<SrcMapType, SideFormat3Cells4x2> + : public PackSideBlockImplGeneric<SrcMapType, SideFormat3Cells4x2> { + public: + typedef SideFormat3Cells4x2 SideFormat; + typedef PackSideBlockImplGeneric<SrcMapType, SideFormat> Base; + + PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block, + const SrcMapType& src_map) + : Base(packed_side_block, src_map) {} + + protected: + static const int KernelRows = SideFormat::kWidth; + + virtual void PackRun(int start_width, int width, int start_depth, int depth) { + // Fall back to generic path for packing too narrow runs. + if (width < SideFormat::kWidth) { + Base::PackRun(start_width, width, start_depth, depth); + return; + } + + const std::uint8_t* src_ptr = + Base::src_map().data(start_width, start_depth); + const int stride = Base::src_map().stride(); + assert(src_ptr + stride == + Base::src_map().data(start_width + 1, start_depth)); + assert(src_ptr + 1 == Base::src_map().data(start_width, start_depth + 1)); + + // Prefetch data. + for (int d = 0; d < depth; d += kDefaultCacheLineSize) { + for (int i = 0; i < KernelRows; i++) { + Prefetch(src_ptr + i * stride + d); + } + } + + const int AlignedDepth16 = RoundDown<16>(depth); + if (AlignedDepth16) { + // Fast inner loop for handling multiples of 16 levels of depth + ScopedProfilingLabel label("optimized kernel"); + + std::int32_t* rank_one_update_ptr = + Base::packed_side_block()->rank_one_update() + start_width; + + std::uint8_t* dst_ptr = Base::packed_side_block()->current_data(); + const std::uint8_t* dst_end_ptr = dst_ptr + KernelRows * AlignedDepth16; + + __attribute__((aligned(32))) std::uint8_t buf[KernelRows * 16]; + __attribute__((aligned(16))) std::int32_t sumsbuf[12]; + asm volatile( + "mov r4, %[src_ptr]\n" + "mov r3, %[dst_ptr]\n" + + // We will accumulate the rank_one_update sums in q12--q14. + "vmov.s32 q12, #0\n" + "vmov.s32 q13, q12\n" + "vmov.s32 q14, q12\n" + + // Main loop. + "loop_PackSideBlockImplNEON4x3x2_%=:\n" + + // Load a 12x16 block into the 12 registers q0--q11. + // So each of these 12 registers contains 16 entries of + // one line, and there are 12 lines being processed. + "mov r0, r4\n" + "add r4, #16\n" + "vld1.8 {d0,d1}, [r0], %[stride]\n" + "vld1.8 {d2,d3}, [r0], %[stride]\n" + "vld1.8 {d4,d5}, [r0], %[stride]\n" + "vld1.8 {d6,d7}, [r0], %[stride]\n" + "vld1.8 {d8,d9}, [r0], %[stride]\n" + "vld1.8 {d10,d11}, [r0], %[stride]\n" + "vld1.8 {d12,d13}, [r0], %[stride]\n" + "vld1.8 {d14,d15}, [r0], %[stride]\n" + "vld1.8 {d16,d17}, [r0], %[stride]\n" + "vld1.8 {d18,d19}, [r0], %[stride]\n" + "vld1.8 {d20,d21}, [r0], %[stride]\n" + "vld1.8 {d22,d23}, [r0], %[stride]\n" + + // The CellOrder is the opposite of the MapOrder here + // so we need to transpose the data in each cell. + // We do so using an auxiliary buffer and the vst4 instruction, + // which takes 4 registers and stores them interleaved. + "mov r1, %[buf]\n" + "vst4.8 {d0, d2, d4, d6}, [r1:256]!\n" + "vst4.8 {d1, d3, d5, d7}, [r1:256]!\n" + "vst4.8 {d8, d10, d12, d14}, [r1:256]!\n" + "vst4.8 {d9, d11, d13, d15}, [r1:256]!\n" + "vst4.8 {d16, d18, d20, d22}, [r1:256]!\n" + "vst4.8 {d17, d19, d21, d23}, [r1:256]!\n" + + // Reload the data from our auxiliary buffer back into + // the same regisers; it is now transposed so each of the + // 24 d-registers from d0 to d23 now contains one cell, + // just not yet in the right sequence. We will still have + // to permute those cells. 
+ "mov r1, %[buf]\n" + "vld1.8 {d0,d1,d2,d3}, [r1:256]!\n" + "vld1.8 {d4,d5,d6,d7}, [r1:256]!\n" + "vld1.8 {d8,d9,d10,d11}, [r1:256]!\n" + "vld1.8 {d12,d13,d14,d15}, [r1:256]!\n" + "vld1.8 {d16,d17,d18,d19}, [r1:256]!\n" + "vld1.8 {d20,d21,d22,d23}, [r1:256]!\n" + + // Store these cells back to memory, now in the right + // sequence. + "vst1.8 d0, [r3:64]!\n" + "vst1.8 d8, [r3:64]!\n" + "vst1.8 d16, [r3:64]!\n" + "vst1.8 d1, [r3:64]!\n" + "vst1.8 d9, [r3:64]!\n" + "vst1.8 d17, [r3:64]!\n" + "vst1.8 d2, [r3:64]!\n" + "vst1.8 d10, [r3:64]!\n" + "vst1.8 d18, [r3:64]!\n" + "vst1.8 d3, [r3:64]!\n" + "vst1.8 d11, [r3:64]!\n" + "vst1.8 d19, [r3:64]!\n" + "vst1.8 d4, [r3:64]!\n" + "vst1.8 d12, [r3:64]!\n" + "vst1.8 d20, [r3:64]!\n" + "vst1.8 d5, [r3:64]!\n" + "vst1.8 d13, [r3:64]!\n" + "vst1.8 d21, [r3:64]!\n" + "vst1.8 d6, [r3:64]!\n" + "vst1.8 d14, [r3:64]!\n" + "vst1.8 d22, [r3:64]!\n" + "vst1.8 d7, [r3:64]!\n" + "vst1.8 d15, [r3:64]!\n" + "vst1.8 d23, [r3:64]!\n" + + // Now we are done packing this 12x16 block. We still have + // to accumulate the rank-one-update sums for it. + + // Add 8-bit values pair-wise into 16-bit values. + "vaddl.u8 q0, d0, d1\n" + "vaddl.u8 q1, d2, d3\n" + "vaddl.u8 q2, d4, d5\n" + "vaddl.u8 q3, d6, d7\n" + "vaddl.u8 q4, d8, d9\n" + "vaddl.u8 q5, d10, d11\n" + "vaddl.u8 q6, d12, d13\n" + "vaddl.u8 q7, d14, d15\n" + "vaddl.u8 q8, d16, d17\n" + "vaddl.u8 q9, d18, d19\n" + "vaddl.u8 q10, d20, d21\n" + "vaddl.u8 q11, d22, d23\n" + + // Add 16-bit values pair-wise into 32-bit values. + "vaddl.u16 q0, d0, d1\n" + "vaddl.u16 q1, d2, d3\n" + "vaddl.u16 q2, d4, d5\n" + "vaddl.u16 q3, d6, d7\n" + "vaddl.u16 q4, d8, d9\n" + "vaddl.u16 q5, d10, d11\n" + "vaddl.u16 q6, d12, d13\n" + "vaddl.u16 q7, d14, d15\n" + "vaddl.u16 q8, d16, d17\n" + "vaddl.u16 q9, d18, d19\n" + "vaddl.u16 q10, d20, d21\n" + "vaddl.u16 q11, d22, d23\n" + + // Accumulate the 32-bit sums into our accumulators q12--q14. + "vadd.s32 q12, q12, q0\n" + "vadd.s32 q13, q13, q4\n" + "vadd.s32 q14, q14, q8\n" + "vadd.s32 q12, q12, q1\n" + "vadd.s32 q13, q13, q5\n" + "vadd.s32 q14, q14, q9\n" + "vadd.s32 q12, q12, q2\n" + "vadd.s32 q13, q13, q6\n" + "vadd.s32 q14, q14, q10\n" + "vadd.s32 q12, q12, q3\n" + "vadd.s32 q13, q13, q7\n" + "vadd.s32 q14, q14, q11\n" + + // End of main loop. + "cmp r3, %[dst_end_ptr]\n" + "bne loop_PackSideBlockImplNEON4x3x2_%=\n" + + // Store our rank-one-update accumulator registers to the + // sums buffer. + "mov r0, %[sumsbuf]\n" + "vst1.32 {d24, d25}, [r0:128]!\n" + "vst1.32 {d26, d27}, [r0:128]!\n" + "vst1.32 {d28, d29}, [r0:128]!\n" + + : // no outputs + : // inputs + [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), + [dst_end_ptr] "r"(dst_end_ptr), [stride] "r"(stride), [buf] "r"(buf), + [sumsbuf] "r"(sumsbuf) + : // clobbers + "cc", "memory", "r0", "r1", "r3", "r4", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + + // Accumulate the final rank_one_update vector. 
+ int32x4x3_t sums; + sums.val[0] = vld1q_s32(sumsbuf); + sums.val[1] = vld1q_s32(sumsbuf + 4); + sums.val[2] = vld1q_s32(sumsbuf + 8); + sums.val[0] = vmulq_n_s32( + sums.val[0], Base::packed_side_block()->rank_one_update_multiplier()); + sums.val[1] = vmulq_n_s32( + sums.val[1], Base::packed_side_block()->rank_one_update_multiplier()); + sums.val[2] = vmulq_n_s32( + sums.val[2], Base::packed_side_block()->rank_one_update_multiplier()); + + int32x4x3_t old_sums; + old_sums.val[0] = vld1q_s32(rank_one_update_ptr + 0); + old_sums.val[1] = vld1q_s32(rank_one_update_ptr + 4); + old_sums.val[2] = vld1q_s32(rank_one_update_ptr + 8); + sums.val[0] = vaddq_s32(sums.val[0], old_sums.val[0]); + sums.val[1] = vaddq_s32(sums.val[1], old_sums.val[1]); + sums.val[2] = vaddq_s32(sums.val[2], old_sums.val[2]); + vst1q_s32(rank_one_update_ptr + 0, sums.val[0]); + vst1q_s32(rank_one_update_ptr + 4, sums.val[1]); + vst1q_s32(rank_one_update_ptr + 8, sums.val[2]); + } + + // We are done handling groups of 16 levels of depth; there may be + // a leftover for which we use the generic path. + Base::packed_side_block()->seek_forward_n_cells( + SideFormat::kCells * AlignedDepth16 / SideFormat::kDepth); + Base::PackRun(start_width, width, start_depth + AlignedDepth16, + depth - AlignedDepth16); + } +}; + +// Specialization for 5 Cells of width 4, depth 4. +// This is the LHS format used by NEONKernel20x1Depth4. +typedef KernelSideFormat<CellFormat<4, 4>, 5> SideFormat5Cells4x4; +template <typename LhsMapType> +class PackSideBlockImpl<LhsMapType, SideFormat5Cells4x4> + : public PackSideBlockImplGeneric<LhsMapType, SideFormat5Cells4x4> { + public: + typedef SideFormat5Cells4x4 SideFormat; + typedef PackSideBlockImplGeneric<LhsMapType, SideFormat> Base; + + PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block, + const LhsMapType& src_map) + : Base(packed_side_block, src_map) {} + + protected: + virtual void PackRun(int start_width, int width, int start_depth, int depth) { + // Fall back to generic path for packing too narrow runs. + if (width < SideFormat::kWidth) { + Base::PackRun(start_width, width, start_depth, depth); + return; + } + + const std::uint8_t* src_ptr = + Base::src_map().data(start_width, start_depth); + const int stride = Base::src_map().stride(); + assert(src_ptr + stride == + Base::src_map().data(start_width + 1, start_depth)); + assert(src_ptr + 1 == Base::src_map().data(start_width, start_depth + 1)); + + // Prefetch data. + for (int d = 0; d < depth; d += kDefaultCacheLineSize) { + for (int i = 0; i < SideFormat::kWidth; i++) { + Prefetch(src_ptr + i * stride + d); + } + } + + const int AlignedDepth8 = RoundDown<8>(depth); + if (AlignedDepth8) { + // Fast inner loop for handling multiples of 8 levels of depth + ScopedProfilingLabel label("optimized kernel"); + + std::int32_t* rank_one_update_ptr = + Base::packed_side_block()->rank_one_update() + start_width; + + std::uint8_t* dst_ptr = Base::packed_side_block()->current_data(); + const std::uint8_t* dst_end_ptr = + dst_ptr + SideFormat::kWidth * AlignedDepth8; + + __attribute__((aligned(32))) std::uint8_t buf[SideFormat::kWidth * 8]; + __attribute__((aligned(16))) std::int32_t sumsbuf[20]; + asm volatile( + "mov r4, %[src_ptr]\n" + "mov r3, %[dst_ptr]\n" + + // We will accumulate the rank_one_update sums in q10--q14. + "vmov.s32 q10, #0\n" + "vmov.s32 q11, q10\n" + "vmov.s32 q12, q10\n" + "vmov.s32 q13, q10\n" + "vmov.s32 q14, q10\n" + + // Main loop. 
+ "loop_PackSideBlockImplNEON4x5x4_%=:\n" + + // Load a 20x8 block into the 20 registers d0--d19. + // So each of these 20 registers contains 8 entries of + // one line, and there are 20 lines being processed. + "mov r0, r4\n" + "add r4, #8\n" + "vld1.8 d0, [r0], %[stride]\n" + "vld1.8 d1, [r0], %[stride]\n" + "vld1.8 d2, [r0], %[stride]\n" + "vld1.8 d3, [r0], %[stride]\n" + "vld1.8 d4, [r0], %[stride]\n" + "vld1.8 d5, [r0], %[stride]\n" + "vld1.8 d6, [r0], %[stride]\n" + "vld1.8 d7, [r0], %[stride]\n" + "vld1.8 d8, [r0], %[stride]\n" + "vld1.8 d9, [r0], %[stride]\n" + "vld1.8 d10, [r0], %[stride]\n" + "vld1.8 d11, [r0], %[stride]\n" + "vld1.8 d12, [r0], %[stride]\n" + "vld1.8 d13, [r0], %[stride]\n" + "vld1.8 d14, [r0], %[stride]\n" + "vld1.8 d15, [r0], %[stride]\n" + "vld1.8 d16, [r0], %[stride]\n" + "vld1.8 d17, [r0], %[stride]\n" + "vld1.8 d18, [r0], %[stride]\n" + "vld1.8 d19, [r0], %[stride]\n" + + // The CellOrder is the opposite of the MapOrder here + // so we need to transpose the data in each cell. + // We do so using an auxiliary buffer and the vst4 instruction, + // which takes 4 registers and stores them interleaved. + "mov r1, %[buf]\n" + "vst4.8 {d0, d1, d2, d3}, [r1:256]!\n" + "vst4.8 {d4, d5, d6, d7}, [r1:256]!\n" + "vst4.8 {d8, d9, d10, d11}, [r1:256]!\n" + "vst4.8 {d12, d13, d14, d15}, [r1:256]!\n" + "vst4.8 {d16, d17, d18, d19}, [r1:256]!\n" + + // Reload the data from our auxiliary buffer back into + // the same regisers; it is now transposed so each of the + // 20 d-registers from d0 to d19 now contains half of one cell, + // just not yet in the right sequence. We will still have + // to permute those cells. + "mov r1, %[buf]\n" + "vld1.8 {d0,d1,d2,d3}, [r1:256]!\n" + "vld1.8 {d4,d5,d6,d7}, [r1:256]!\n" + "vld1.8 {d8,d9,d10,d11}, [r1:256]!\n" + "vld1.8 {d12,d13,d14,d15}, [r1:256]!\n" + "vld1.8 {d16,d17,d18,d19}, [r1:256]!\n" + + // Store these cells back to memory, now in the right + // sequence. + "vst1.8 d0, [r3:64]!\n" + "vst1.8 d1, [r3:64]!\n" + "vst1.8 d4, [r3:64]!\n" + "vst1.8 d5, [r3:64]!\n" + "vst1.8 d8, [r3:64]!\n" + "vst1.8 d9, [r3:64]!\n" + "vst1.8 d12, [r3:64]!\n" + "vst1.8 d13, [r3:64]!\n" + "vst1.8 d16, [r3:64]!\n" + "vst1.8 d17, [r3:64]!\n" + "vst1.8 d2, [r3:64]!\n" + "vst1.8 d3, [r3:64]!\n" + "vst1.8 d6, [r3:64]!\n" + "vst1.8 d7, [r3:64]!\n" + "vst1.8 d10, [r3:64]!\n" + "vst1.8 d11, [r3:64]!\n" + "vst1.8 d14, [r3:64]!\n" + "vst1.8 d15, [r3:64]!\n" + "vst1.8 d18, [r3:64]!\n" + "vst1.8 d19, [r3:64]!\n" + + // Now we are done packing this 20x8 block. We still have + // to accumulate the rank-one-update sums for it. + + // Add 8-bit values pair-wise into 16-bit values. + "vaddl.u8 q0, d0, d1\n" + "vaddl.u8 q1, d2, d3\n" + "vaddl.u8 q2, d4, d5\n" + "vaddl.u8 q3, d6, d7\n" + "vaddl.u8 q4, d8, d9\n" + "vaddl.u8 q5, d10, d11\n" + "vaddl.u8 q6, d12, d13\n" + "vaddl.u8 q7, d14, d15\n" + "vaddl.u8 q8, d16, d17\n" + "vaddl.u8 q9, d18, d19\n" + + // Add 16-bit values pair-wise into 32-bit values. + "vaddl.u16 q0, d0, d1\n" + "vaddl.u16 q1, d2, d3\n" + "vaddl.u16 q2, d4, d5\n" + "vaddl.u16 q3, d6, d7\n" + "vaddl.u16 q4, d8, d9\n" + "vaddl.u16 q5, d10, d11\n" + "vaddl.u16 q6, d12, d13\n" + "vaddl.u16 q7, d14, d15\n" + "vaddl.u16 q8, d16, d17\n" + "vaddl.u16 q9, d18, d19\n" + + // Accumulate the 32-bit sums into our accumulators q10--q14. 
+ "vadd.s32 q10, q10, q0\n" + "vadd.s32 q11, q11, q2\n" + "vadd.s32 q12, q12, q4\n" + "vadd.s32 q13, q13, q6\n" + "vadd.s32 q14, q14, q8\n" + "vadd.s32 q10, q10, q1\n" + "vadd.s32 q11, q11, q3\n" + "vadd.s32 q12, q12, q5\n" + "vadd.s32 q13, q13, q7\n" + "vadd.s32 q14, q14, q9\n" + + // End of main loop. + "cmp r3, %[dst_end_ptr]\n" + "bne loop_PackSideBlockImplNEON4x5x4_%=\n" + + // Store our rank-one-update accumulator registers to the + // sums buffer. + "mov r0, %[sumsbuf]\n" + "vst1.32 {d20, d21}, [r0:128]!\n" + "vst1.32 {d22, d23}, [r0:128]!\n" + "vst1.32 {d24, d25}, [r0:128]!\n" + "vst1.32 {d26, d27}, [r0:128]!\n" + "vst1.32 {d28, d29}, [r0:128]!\n" + : // no outputs + : // inputs + [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), + [dst_end_ptr] "r"(dst_end_ptr), [stride] "r"(stride), [buf] "r"(buf), + [sumsbuf] "r"(sumsbuf) + : // clobbers + "cc", "memory", "r0", "r1", "r3", "r4", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + + // Accumulate the final rank_one_update vector. + int32x4_t sums[5]; + sums[0] = vld1q_s32(sumsbuf); + sums[1] = vld1q_s32(sumsbuf + 4); + sums[2] = vld1q_s32(sumsbuf + 8); + sums[3] = vld1q_s32(sumsbuf + 12); + sums[4] = vld1q_s32(sumsbuf + 16); + + sums[0] = vmulq_n_s32( + sums[0], Base::packed_side_block()->rank_one_update_multiplier()); + sums[1] = vmulq_n_s32( + sums[1], Base::packed_side_block()->rank_one_update_multiplier()); + sums[2] = vmulq_n_s32( + sums[2], Base::packed_side_block()->rank_one_update_multiplier()); + sums[3] = vmulq_n_s32( + sums[3], Base::packed_side_block()->rank_one_update_multiplier()); + sums[4] = vmulq_n_s32( + sums[4], Base::packed_side_block()->rank_one_update_multiplier()); + + int32x4_t old_sums[5]; + old_sums[0] = vld1q_s32(rank_one_update_ptr + 0); + old_sums[1] = vld1q_s32(rank_one_update_ptr + 4); + old_sums[2] = vld1q_s32(rank_one_update_ptr + 8); + old_sums[3] = vld1q_s32(rank_one_update_ptr + 12); + old_sums[4] = vld1q_s32(rank_one_update_ptr + 16); + + sums[0] = vaddq_s32(sums[0], old_sums[0]); + sums[1] = vaddq_s32(sums[1], old_sums[1]); + sums[2] = vaddq_s32(sums[2], old_sums[2]); + sums[3] = vaddq_s32(sums[3], old_sums[3]); + sums[4] = vaddq_s32(sums[4], old_sums[4]); + + vst1q_s32(rank_one_update_ptr + 0, sums[0]); + vst1q_s32(rank_one_update_ptr + 4, sums[1]); + vst1q_s32(rank_one_update_ptr + 8, sums[2]); + vst1q_s32(rank_one_update_ptr + 12, sums[3]); + vst1q_s32(rank_one_update_ptr + 16, sums[4]); + } + + // We are done handling groups of 8 levels of depth; there may be + // a leftover for which we use the generic path. + Base::packed_side_block()->seek_forward_n_cells( + SideFormat::kCells * AlignedDepth8 / SideFormat::kDepth); + Base::PackRun(start_width, width, start_depth + AlignedDepth8, + depth - AlignedDepth8); + } +}; + +// Specialization for 1 Cell of width 8, depth 4. +// This is the LHS format used by NEONKernel8x1Depth4. 
+typedef KernelSideFormat<CellFormat<8, 4>, 1> SideFormat1Cell8x4;
+template <typename LhsMapType>
+class PackSideBlockImpl<LhsMapType, SideFormat1Cell8x4>
+    : public PackSideBlockImplGeneric<LhsMapType, SideFormat1Cell8x4> {
+ public:
+  typedef SideFormat1Cell8x4 SideFormat;
+
+  typedef PackSideBlockImplGeneric<LhsMapType, SideFormat> Base;
+
+  PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block,
+                    const LhsMapType& src_map)
+      : Base(packed_side_block, src_map) {}
+
+ protected:
+  virtual void PackRun(int start_width, int width, int start_depth, int depth) {
+    // Fall back to generic path for packing too narrow runs.
+    if (width < SideFormat::kWidth) {
+      Base::PackRun(start_width, width, start_depth, depth);
+      return;
+    }
+
+    const std::uint8_t* src_ptr =
+        Base::src_map().data(start_width, start_depth);
+    const int stride = Base::src_map().stride();
+    assert(src_ptr + stride ==
+           Base::src_map().data(start_width + 1, start_depth));
+    assert(src_ptr + 1 == Base::src_map().data(start_width, start_depth + 1));
+
+    // Prefetch data.
+    for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
+      for (int i = 0; i < SideFormat::kWidth; i++) {
+        Prefetch(src_ptr + i * stride + d);
+      }
+    }
+
+    const int AlignedDepth8 = RoundDown<8>(depth);
+    if (AlignedDepth8) {
+      // Fast inner loop for handling multiples of 8 levels of depth
+      ScopedProfilingLabel label("optimized kernel");
+
+      std::int32_t* rank_one_update_ptr =
+          Base::packed_side_block()->rank_one_update() + start_width;
+
+      std::uint8_t* dst_ptr = Base::packed_side_block()->current_data();
+      const std::uint8_t* dst_end_ptr =
+          dst_ptr + SideFormat::kWidth * AlignedDepth8;
+
+      __attribute__((aligned(16))) std::int32_t sumsbuf[8];
+      asm volatile(
+          "mov r4, %[src_ptr]\n"
+          "mov r3, %[dst_ptr]\n"
+
+          // We will accumulate the rank_one_update sums in q10--q11.
+          "vmov.s32 q10, #0\n"
+          "vmov.s32 q11, q10\n"
+
+          // Main loop.
+          "loop_PackSideBlockImplNEON8x1x4_%=:\n"
+
+          // Load an 8x8 block into the 8 registers d0--d7.
+          // So each of these 8 registers contains 8 entries of
+          // one line, and there are 8 lines being processed.
+          "mov r0, r4\n"
+          "add r4, #8\n"
+          "vld1.8 d0, [r0], %[stride]\n"
+          "vld1.8 d1, [r0], %[stride]\n"
+          "vld1.8 d2, [r0], %[stride]\n"
+          "vld1.8 d3, [r0], %[stride]\n"
+          "vld1.8 d4, [r0], %[stride]\n"
+          "vld1.8 d5, [r0], %[stride]\n"
+          "vld1.8 d6, [r0], %[stride]\n"
+          "vld1.8 d7, [r0], %[stride]\n"
+
+          // The CellOrder is the opposite of the MapOrder here
+          // so we need to transpose the data in each cell.
+          // Fortunately in this case, since what we have to transpose
+          // is an 8x8 block of 8-bit values, we can do so in-place with
+          // vtrn instructions; no need for an auxiliary buffer here.
+          "vtrn.8 d0, d1\n"
+          "vtrn.8 d2, d3\n"
+          "vtrn.8 d4, d5\n"
+          "vtrn.8 d6, d7\n"
+          "vtrn.16 q0, q1\n"
+          "vtrn.16 q2, q3\n"
+          "vtrn.32 q0, q2\n"
+          "vtrn.32 q1, q3\n"
+
+          // Store the packed data to memory.
+          "vst1.8 {d0, d1, d2, d3}, [r3:64]!\n"
+          "vst1.8 {d4, d5, d6, d7}, [r3:64]!\n"
+
+          // Now we are done packing this 8x8 block. We still have
+          // to accumulate the rank-one-update sums for it.
+
+          // Add 8-bit values pair-wise into 16-bit values.
+          "vaddl.u8 q0, d0, d1\n"
+          "vaddl.u8 q1, d2, d3\n"
+          "vaddl.u8 q2, d4, d5\n"
+          "vaddl.u8 q3, d6, d7\n"
+
+          // Add these 16-bit values into the 32-bit accumulators q10--q11.
+ "vaddw.u16 q10, q10, d0\n" + "vaddw.u16 q11, q11, d1\n" + "vaddw.u16 q10, q10, d2\n" + "vaddw.u16 q11, q11, d3\n" + "vaddw.u16 q10, q10, d4\n" + "vaddw.u16 q11, q11, d5\n" + "vaddw.u16 q10, q10, d6\n" + "vaddw.u16 q11, q11, d7\n" + + // End of main loop. + "cmp r3, %[dst_end_ptr]\n" + "bne loop_PackSideBlockImplNEON8x1x4_%=\n" + + // Store our rank-one-update accumulator registers to the + // sums buffer. + "mov r0, %[sumsbuf]\n" + "vst1.32 {d20, d21}, [r0:128]!\n" + "vst1.32 {d22, d23}, [r0:128]!\n" + : // no outputs + : // inputs + [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), + [dst_end_ptr] "r"(dst_end_ptr), [stride] "r"(stride), + [sumsbuf] "r"(sumsbuf) + : // clobbers + "cc", "memory", "r0", "r3", "r4", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + + // Accumulate the final rank_one_update vector. + int32x4_t sums[2]; + sums[0] = vld1q_s32(sumsbuf); + sums[1] = vld1q_s32(sumsbuf + 4); + + sums[0] = vmulq_n_s32( + sums[0], Base::packed_side_block()->rank_one_update_multiplier()); + sums[1] = vmulq_n_s32( + sums[1], Base::packed_side_block()->rank_one_update_multiplier()); + + int32x4_t old_sums[2]; + old_sums[0] = vld1q_s32(rank_one_update_ptr + 0); + old_sums[1] = vld1q_s32(rank_one_update_ptr + 4); + + sums[0] = vaddq_s32(sums[0], old_sums[0]); + sums[1] = vaddq_s32(sums[1], old_sums[1]); + + vst1q_s32(rank_one_update_ptr + 0, sums[0]); + vst1q_s32(rank_one_update_ptr + 4, sums[1]); + } + + // We are done handling groups of 8 levels of depth; there may be + // a leftover for which we use the generic path. + Base::packed_side_block()->seek_forward_n_cells( + SideFormat::kCells * AlignedDepth8 / SideFormat::kDepth); + Base::PackRun(start_width, width, start_depth + AlignedDepth8, + depth - AlignedDepth8); + } +}; + +// Partial specialization for 1 Cell of width 4, and any Depth. +// This is the RHS format used by NEONKernel12x4Depth2. +template <int Depth> +using SideFormat1Cell4xD = KernelSideFormat<CellFormat<4, Depth>, 1>; +template <typename RhsMapType, int Depth> +class PackSideBlockImpl<RhsMapType, SideFormat1Cell4xD<Depth>> + : public PackSideBlockImplGeneric<RhsMapType, SideFormat1Cell4xD<Depth>> { + public: + typedef SideFormat1Cell4xD<Depth> SideFormat; + + typedef PackSideBlockImplGeneric<RhsMapType, SideFormat> Base; + + PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block, + const RhsMapType& src_map) + : Base(packed_side_block, src_map) {} + + protected: + virtual void PackRun(int start_width, int width, int start_depth, int depth) { + ScopedProfilingLabel label("optimized kernel"); + + std::int32_t* rank_one_update_ptr = + Base::packed_side_block()->rank_one_update() + start_width; + const int AlignedDepth16 = RoundDown<16>(depth); + const std::uint8_t* src_line0ptr = + width < 1 ? nullptr : Base::src_map().data(start_width, start_depth); + const std::uint8_t* src_line1ptr = + width < 2 ? nullptr + : Base::src_map().data(start_width + 1, start_depth); + const std::uint8_t* src_line2ptr = + width < 3 ? nullptr + : Base::src_map().data(start_width + 2, start_depth); + const std::uint8_t* src_line3ptr = + width < 4 ? 
nullptr + : Base::src_map().data(start_width + 3, start_depth); + std::uint8_t* dst_ptr = Base::packed_side_block()->current_data(); + const std::uint8_t* dst_end_ptr = + dst_ptr + SideFormat::kWidth * AlignedDepth16; + int32x4x4_t local_col_sums; + local_col_sums.val[0] = vdupq_n_s32(0); + local_col_sums.val[1] = vdupq_n_s32(0); + local_col_sums.val[2] = vdupq_n_s32(0); + local_col_sums.val[3] = vdupq_n_s32(0); + + switch (width) { + case 4: + while (dst_ptr != dst_end_ptr) { + uint8x16x4_t src_regs; + src_regs.val[0] = vld1q_u8(src_line0ptr); + src_regs.val[1] = vld1q_u8(src_line1ptr); + src_regs.val[2] = vld1q_u8(src_line2ptr); + src_regs.val[3] = vld1q_u8(src_line3ptr); + uint16x8x4_t s; + s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]), + vget_high_u8(src_regs.val[0])); + s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]), + vget_high_u8(src_regs.val[1])); + s.val[2] = vaddl_u8(vget_low_u8(src_regs.val[2]), + vget_high_u8(src_regs.val[2])); + s.val[3] = vaddl_u8(vget_low_u8(src_regs.val[3]), + vget_high_u8(src_regs.val[3])); + local_col_sums.val[0] = + vaddq_s32(local_col_sums.val[0], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[0]), vget_high_u16(s.val[0])))); + local_col_sums.val[1] = + vaddq_s32(local_col_sums.val[1], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[1]), vget_high_u16(s.val[1])))); + local_col_sums.val[2] = + vaddq_s32(local_col_sums.val[2], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[2]), vget_high_u16(s.val[2])))); + local_col_sums.val[3] = + vaddq_s32(local_col_sums.val[3], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[3]), vget_high_u16(s.val[3])))); + vst4q_u8(dst_ptr, src_regs); + src_line0ptr += 16; + src_line1ptr += 16; + src_line2ptr += 16; + src_line3ptr += 16; + dst_ptr += SideFormat::kWidth * 16; + } + break; + case 3: + while (dst_ptr != dst_end_ptr) { + uint8x16x4_t src_regs; + src_regs.val[0] = vld1q_u8(src_line0ptr); + src_regs.val[1] = vld1q_u8(src_line1ptr); + src_regs.val[2] = vld1q_u8(src_line2ptr); + src_regs.val[3] = vdupq_n_u8(0); + uint16x8x3_t s; + s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]), + vget_high_u8(src_regs.val[0])); + s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]), + vget_high_u8(src_regs.val[1])); + s.val[2] = vaddl_u8(vget_low_u8(src_regs.val[2]), + vget_high_u8(src_regs.val[2])); + local_col_sums.val[0] = + vaddq_s32(local_col_sums.val[0], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[0]), vget_high_u16(s.val[0])))); + local_col_sums.val[1] = + vaddq_s32(local_col_sums.val[1], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[1]), vget_high_u16(s.val[1])))); + local_col_sums.val[2] = + vaddq_s32(local_col_sums.val[2], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[2]), vget_high_u16(s.val[2])))); + vst4q_u8(dst_ptr, src_regs); + src_line0ptr += 16; + src_line1ptr += 16; + src_line2ptr += 16; + dst_ptr += SideFormat::kWidth * 16; + } + break; + case 2: + while (dst_ptr != dst_end_ptr) { + uint8x16x4_t src_regs; + src_regs.val[0] = vld1q_u8(src_line0ptr); + src_regs.val[1] = vld1q_u8(src_line1ptr); + src_regs.val[2] = vdupq_n_u8(0); + src_regs.val[3] = vdupq_n_u8(0); + uint16x8x2_t s; + s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]), + vget_high_u8(src_regs.val[0])); + s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]), + vget_high_u8(src_regs.val[1])); + local_col_sums.val[0] = + vaddq_s32(local_col_sums.val[0], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[0]), vget_high_u16(s.val[0])))); + local_col_sums.val[1] = + vaddq_s32(local_col_sums.val[1], 
+ vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[1]), vget_high_u16(s.val[1])))); + vst4q_u8(dst_ptr, src_regs); + src_line0ptr += 16; + src_line1ptr += 16; + dst_ptr += SideFormat::kWidth * 16; + } + break; + case 1: + while (dst_ptr != dst_end_ptr) { + uint8x16x4_t src_regs; + src_regs.val[0] = vld1q_u8(src_line0ptr); + src_regs.val[1] = vdupq_n_u8(0); + src_regs.val[2] = vdupq_n_u8(0); + src_regs.val[3] = vdupq_n_u8(0); + uint16x8x2_t s; + s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]), + vget_high_u8(src_regs.val[0])); + local_col_sums.val[0] = + vaddq_s32(local_col_sums.val[0], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[0]), vget_high_u16(s.val[0])))); + vst4q_u8(dst_ptr, src_regs); + src_line0ptr += 16; + dst_ptr += SideFormat::kWidth * 16; + } + break; + default: + abort(); + } + + int extra = depth - AlignedDepth16; + + if (extra) { + __attribute__((aligned(16))) std::uint8_t line0extra[16]; + __attribute__((aligned(16))) std::uint8_t line1extra[16]; + __attribute__((aligned(16))) std::uint8_t line2extra[16]; + __attribute__((aligned(16))) std::uint8_t line3extra[16]; + __attribute__((aligned(16))) std::uint8_t dstextra[64]; + + for (int i = 0; i < extra; i++) { + line0extra[i] = width >= 1 ? src_line0ptr[i] : 0; + line1extra[i] = width >= 2 ? src_line1ptr[i] : 0; + line2extra[i] = width >= 3 ? src_line2ptr[i] : 0; + line3extra[i] = width >= 4 ? src_line3ptr[i] : 0; + } + for (int i = extra; i < 16; i++) { + line0extra[i] = 0; + line1extra[i] = 0; + line2extra[i] = 0; + line3extra[i] = 0; + } + src_line0ptr = line0extra; + src_line1ptr = line1extra; + src_line2ptr = line2extra; + src_line3ptr = line3extra; + + { + uint8x16x4_t src_regs; + src_regs.val[0] = vld1q_u8(src_line0ptr); + src_regs.val[1] = vld1q_u8(src_line1ptr); + src_regs.val[2] = vld1q_u8(src_line2ptr); + src_regs.val[3] = vld1q_u8(src_line3ptr); + uint16x8x4_t s; + s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]), + vget_high_u8(src_regs.val[0])); + s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]), + vget_high_u8(src_regs.val[1])); + s.val[2] = vaddl_u8(vget_low_u8(src_regs.val[2]), + vget_high_u8(src_regs.val[2])); + s.val[3] = vaddl_u8(vget_low_u8(src_regs.val[3]), + vget_high_u8(src_regs.val[3])); + local_col_sums.val[0] = + vaddq_s32(local_col_sums.val[0], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[0]), vget_high_u16(s.val[0])))); + local_col_sums.val[1] = + vaddq_s32(local_col_sums.val[1], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[1]), vget_high_u16(s.val[1])))); + local_col_sums.val[2] = + vaddq_s32(local_col_sums.val[2], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[2]), vget_high_u16(s.val[2])))); + local_col_sums.val[3] = + vaddq_s32(local_col_sums.val[3], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[3]), vget_high_u16(s.val[3])))); + vst4q_u8(dstextra, src_regs); + } + + for (int i = 0; i < 4 * extra; i++) { + dst_ptr[i] = dstextra[i]; + } + } + + // Accumulate the final rank_one_update vector. 
+ std::int32_t r[4]; + if (width >= 1) { + vst1q_s32(r, local_col_sums.val[0]); + rank_one_update_ptr[0] += + Base::packed_side_block()->rank_one_update_multiplier() * + (r[0] + r[1] + r[2] + r[3]); + } + if (width >= 2) { + vst1q_s32(r, local_col_sums.val[1]); + rank_one_update_ptr[1] += + Base::packed_side_block()->rank_one_update_multiplier() * + (r[0] + r[1] + r[2] + r[3]); + } + if (width >= 3) { + vst1q_s32(r, local_col_sums.val[2]); + rank_one_update_ptr[2] += + Base::packed_side_block()->rank_one_update_multiplier() * + (r[0] + r[1] + r[2] + r[3]); + } + if (width >= 4) { + vst1q_s32(r, local_col_sums.val[3]); + rank_one_update_ptr[3] += + Base::packed_side_block()->rank_one_update_multiplier() * + (r[0] + r[1] + r[2] + r[3]); + } + } +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_PACK_NEON_H_ diff --git a/internal/single_thread_gemm.h b/internal/single_thread_gemm.h new file mode 100644 index 0000000..0ec4201 --- /dev/null +++ b/internal/single_thread_gemm.h @@ -0,0 +1,103 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// single_thread_gemm.h: Single-threaded GEMM implementation. +// This is a good place to start reading code, as it shows the overall +// structure of a GEMM and is much simpler than multi_thread_gemm.h. 
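The NEON packing fast paths above all end the same way: per-line byte sums are gathered (in NEON registers or in a sums buffer), scaled by rank_one_update_multiplier(), and accumulated into the block's rank_one_update vector. Before moving on to the single-threaded driver, here is a plain scalar restatement of that accumulation, using the same pointer-plus-stride layout that the asserts at the top of each PackRun establish. The helper name and signature below are illustrative only and are not part of the library; the cell re-layout of the packed data is deliberately omitted.

#include <cstdint>

// Scalar equivalent of the rank-one-update accumulation performed by the
// assembly fast paths: for each of the 'width' source lines of the run,
// sum its 'depth' uint8 entries and add multiplier * sum into the
// corresponding rank_one_update slot.
inline void AccumulateRankOneUpdateSums(const std::uint8_t* src_ptr, int stride,
                                        int width, int depth,
                                        std::int32_t multiplier,
                                        std::int32_t* rank_one_update) {
  for (int w = 0; w < width; w++) {
    std::int32_t sum = 0;
    for (int d = 0; d < depth; d++) {
      sum += src_ptr[w * stride + d];  // one uint8 entry of line w
    }
    rank_one_update[w] += multiplier * sum;  // mirrors vmulq_n_s32 + vaddq_s32
  }
}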
+ +#ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ +#define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ + +#include <cassert> + +#include "public/map.h" +#include "internal/allocator.h" +#include "internal/pack.h" +#include "internal/unpack.h" +#include "internal/compute.h" +#include "internal/kernel.h" + +namespace gemmlowp { + +class SingleThreadGemmContext { + public: + Allocator* allocator() { return &allocator_; } + + protected: + Allocator allocator_; +}; + +template <typename KernelFormat, typename Scalar, MapOrder LhsOrder, + MapOrder RhsOrder, MapOrder ResultOrder> +void SingleThreadGemm(SingleThreadGemmContext* context, + const KernelBase& kernel, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + ScopedProfilingLabel label("gemmlowp::SingleThreadGemm"); + + assert(lhs.cols() == rhs.rows()); + + int rows = result->rows(); + int cols = result->cols(); + int depth = lhs.cols(); + + Allocator* allocator = context->allocator(); + + BlockParams block_params; + block_params.Init<KernelFormat>(rows, cols, depth, 1); + + PackedSideBlock<typename KernelFormat::Lhs> packed_lhs( + Side::Lhs, allocator, block_params, rhs_offset); + PackedSideBlock<typename KernelFormat::Rhs> packed_rhs( + Side::Rhs, allocator, block_params, lhs_offset); + + PackedResultInt32 packed_result(allocator, block_params); + + allocator->Commit(); + + const bool pack_rhs_once = block_params.l2_cols == cols; + + if (pack_rhs_once) { + PackRhs(&packed_rhs, rhs); + } + + for (int r = 0; r < rows; r += block_params.l2_rows) { + int rs = std::min(block_params.l2_rows, rows - r); + + PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth)); + + for (int c = 0; c < cols; c += block_params.l2_cols) { + int cs = std::min(block_params.l2_cols, cols - c); + + if (!pack_rhs_once) { + PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); + } + + Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs); + + auto result_block = result->block(r, c, rs, cs); + UnpackResult(&result_block, packed_result, packed_lhs, packed_rhs, depth, + result_offset, result_mult_int, result_shift); + } + } + + allocator->Decommit(); +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ diff --git a/internal/unpack.h b/internal/unpack.h new file mode 100644 index 0000000..55ca204 --- /dev/null +++ b/internal/unpack.h @@ -0,0 +1,100 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// unpack.h: unpacking the result blocks computed by compute.h, +// storing them into the destination matrix. 
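The unpack stage that follows is where the offsets come back together with the rank-one updates gathered during packing. Note that SingleThreadGemm above passes rhs_offset when constructing packed_lhs and lhs_offset when constructing packed_rhs; the reason is the expansion of the offset product, written out here as a reader's aid (this derivation is not part of the original sources):

    sum_d (lhs(r,d) + lhs_offset) * (rhs(d,c) + rhs_offset)
        =   sum_d lhs(r,d) * rhs(d,c)         // the kernel's raw int32 accumulator
          + rhs_offset * sum_d lhs(r,d)       // packed_lhs.rank_one_update()[r]
          + lhs_offset * sum_d rhs(d,c)       // packed_rhs.rank_one_update()[c]
          + lhs_offset * rhs_offset * depth   // rank0update in unpack.h

The final uint8 destination entry is then

    clamp to [0, 255] of
        ((above + result_offset) * result_mult_int + (1 << (result_shift - 1)))
            >> result_shift

which is exactly the computation carried out per entry in UnpackResultImpl below.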
+ +#ifndef GEMMLOWP_INTERNAL_UNPACK_H_ +#define GEMMLOWP_INTERNAL_UNPACK_H_ + +#include "internal/allocator.h" +#include "internal/block_params.h" +#include "internal/pack.h" + +#ifdef GEMMLOWP_NEON +#include "internal/unpack_neon.h" +#endif + +namespace gemmlowp { + +class PackedResultInt32 { + Allocator* allocator_; + Allocator::Handle matrix_handle_; + const BlockParams& block_params_; + + public: + PackedResultInt32(Allocator* _allocator, const BlockParams& _block_params) + : allocator_(_allocator), block_params_(_block_params) { + matrix_handle_ = allocator_->Reserve<std::int32_t>(block_params_.l2_rows * + block_params_.l2_cols); + } + + ~PackedResultInt32() {} + + MatrixMap<std::int32_t, MapOrder::ColMajor> Map() { + return MatrixMap<std::int32_t, MapOrder::ColMajor>( + allocator_->GetPointer<std::int32_t>(matrix_handle_), + block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); + } + + MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const { + return MatrixMap<const std::int32_t, MapOrder::ColMajor>( + allocator_->GetPointer<const std::int32_t>(matrix_handle_), + block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); + } +}; + +template <typename ResultBlockType, typename PackedResult, + typename KernelLhsFormat, typename KernelRhsFormat> +void UnpackResultImpl(ResultBlockType* dst, const PackedResult& src, + const PackedSideBlock<KernelLhsFormat>& packed_lhs, + const PackedSideBlock<KernelRhsFormat>& packed_rhs, + int depth, std::int32_t result_offset, + std::int32_t result_mult_int, std::int32_t result_shift) { + std::int32_t rank0update = packed_lhs.rank_one_update_multiplier() * + packed_rhs.rank_one_update_multiplier() * depth; + // No top-level blocking in the depth dimension at the moment. + // Too much loss of precision. + for (int c = 0; c < dst->cols(); c++) { + for (int r = 0; r < dst->rows(); r++) { + std::int32_t q = *src.data(r, c); + q += packed_lhs.rank_one_update()[r] + packed_rhs.rank_one_update()[c] + + rank0update; + q = ((q + result_offset) * result_mult_int + (1 << (result_shift - 1))) >> + result_shift; + (*dst)(r, c) = q > 255 ? 255 : q < 0 ? 0 : q; + } + } +} + +template <typename ResultBlockType, typename PackedResult, + typename KernelLhsFormat, typename KernelRhsFormat> +void UnpackResult(ResultBlockType* dst, const PackedResult& src, + const PackedSideBlock<KernelLhsFormat>& packed_lhs, + const PackedSideBlock<KernelRhsFormat>& packed_rhs, int depth, + std::int32_t result_offset, std::int32_t result_mult_int, + std::int32_t result_shift) { + ScopedProfilingLabel label("unpack"); +#ifdef GEMMLOWP_NEON + UnpackResultImplNEON(dst, src.Map(), packed_lhs, packed_rhs, depth, + result_offset, result_mult_int, result_shift); +#else + UnpackResultImpl(dst, src.Map(), packed_lhs, packed_rhs, depth, result_offset, + result_mult_int, result_shift); +#endif +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_UNPACK_H_ diff --git a/internal/unpack_neon.h b/internal/unpack_neon.h new file mode 100644 index 0000000..17bfe78 --- /dev/null +++ b/internal/unpack_neon.h @@ -0,0 +1,170 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// unpack_neon.h: optimized NEON specializations of the templates in unpack.h. + +#ifndef GEMMLOWP_INTERNAL_UNPACK_NEON_H_ +#define GEMMLOWP_INTERNAL_UNPACK_NEON_H_ + +#include "internal/unpack.h" + +#include <arm_neon.h> + +namespace gemmlowp { + +template <typename ResultBlockType, typename PackedResult, + typename KernelLhsFormat, typename KernelRhsFormat> +void UnpackResultImplNEON(ResultBlockType* dst, const PackedResult& src, + const PackedSideBlock<KernelLhsFormat>& packed_lhs, + const PackedSideBlock<KernelRhsFormat>& packed_rhs, + int depth, std::int32_t result_offset, + std::int32_t result_mult_int, + std::int32_t result_shift) { + ScopedProfilingLabel label("optimized kernel"); + + std::int32_t rank0update = packed_lhs.rank_one_update_multiplier() * + packed_rhs.rank_one_update_multiplier() * depth; + std::int32_t preshift_offset = 1 << (result_shift - 1); + int32x4_t shift_reg = vdupq_n_s32(-result_shift); + for (int c = 0; c < dst->cols(); c++) { + std::uint8_t* dst_ptr = dst->data(0, c); + const std::int32_t* src_ptr = src.data(0, c); + const std::int32_t* rank_one_update_ptr = packed_lhs.rank_one_update(); + std::int32_t rank1update = packed_rhs.rank_one_update()[c]; + std::int32_t constant_offset = rank1update + rank0update + result_offset; + + int dst_rows_aligned4 = RoundDown<4>(dst->rows()); + int dst_rows_aligned16 = RoundDown<16>(dst->rows()); + + if (dst_rows_aligned16) { + std::uint8_t* dst_end_ptr = dst_ptr + dst_rows_aligned16; + + asm volatile( + "vdup.32 q12, %[constant_offset]\n" + "vdup.32 q13, %[preshift_offset]\n" + "neg r3, %[result_shift]\n" + "vdup.32 q14, r3\n" + "vdup.32 q15, %[result_mult_int]\n" + + "loop_UnpackResultImplNEON_%=:\n" + + // Load a 16x1 block of the packed result matrix + // (so 16 contiguous entries in one column). + "vld1.32 {d0, d1, d2, d3}, [%[src_ptr]]!\n" + "vld1.32 {d4, d5, d6, d7}, [%[src_ptr]]!\n" + + // Load entries the LHS rank one update vector. + "vld1.32 {d8, d9, d10, d11}, " + "[%[rank_one_update_ptr]:256]!\n" + "vld1.32 {d12, d13, d14, d15}, " + "[%[rank_one_update_ptr]:256]!\n" + + // Apply the LHS rank one update. + "vadd.s32 q0, q0, q4\n" + "vadd.s32 q1, q1, q5\n" + "vadd.s32 q2, q2, q6\n" + "vadd.s32 q3, q3, q7\n" + + // Add the constant offset + // (which includes the RHS rank one update, see above). + "vadd.s32 q0, q0, q12\n" + "vadd.s32 q1, q1, q12\n" + "vadd.s32 q2, q2, q12\n" + "vadd.s32 q3, q3, q12\n" + + // Multiply by the result multiplier + "vmul.s32 q0, q0, q15\n" + "vmul.s32 q1, q1, q15\n" + "vmul.s32 q2, q2, q15\n" + "vmul.s32 q3, q3, q15\n" + + // Add the pre-shift offset (so that the shift is rounding) + "vadd.s32 q0, q0, q13\n" + "vadd.s32 q1, q1, q13\n" + "vadd.s32 q2, q2, q13\n" + "vadd.s32 q3, q3, q13\n" + + // Shift right (shift left by negative offset). + "vshl.s32 q0, q0, q14\n" + "vshl.s32 q1, q1, q14\n" + "vshl.s32 q2, q2, q14\n" + "vshl.s32 q3, q3, q14\n" + + // So far we had signed 32bit values; now we cast them down + // to unsigned 8bit, saturating. 
+ "vqmovn.s32 d8, q0\n" + "vqmovn.s32 d9, q1\n" + "vqmovn.s32 d10, q2\n" + "vqmovn.s32 d11, q3\n" + "vqmovun.s16 d0, q4\n" + "vqmovun.s16 d1, q5\n" + + // Store result into the destination matrix. + "vst1.8 {d0, d1}, [%[dst_ptr]]!\n" + + // End of the loop. + "cmp %[dst_ptr], %[dst_end_ptr]\n" + "bne loop_UnpackResultImplNEON_%=\n" + + : // outputs + [dst_ptr] "+r"(dst_ptr), [src_ptr] "+r"(src_ptr), + [rank_one_update_ptr] "+r"(rank_one_update_ptr) + : // inputs + [dst_end_ptr] "r"(dst_end_ptr), + [constant_offset] "r"(constant_offset), + [result_mult_int] "r"(result_mult_int), + [preshift_offset] "r"(preshift_offset), + [result_shift] "r"(result_shift) + : // clobbers + "cc", "memory", "r3", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + } + + // We have finished handling groups of 16 entries at once; now + // try to handle 4 entries at once. + for (int r = dst_rows_aligned16; r < dst_rows_aligned4; r += 4) { + int32x4_t q = vld1q_s32(src_ptr); + q = vaddq_s32(q, vld1q_s32(rank_one_update_ptr)); + q = vaddq_s32(q, vdupq_n_s32(constant_offset)); + q = vmulq_n_s32(q, result_mult_int); + q = vaddq_s32(q, vdupq_n_s32(preshift_offset)); + q = vshlq_s32(q, shift_reg); + int16x4_t q_16 = vqmovn_s32(q); + uint8x8_t q_8 = vqmovun_s16(vcombine_s16(q_16, q_16)); + vst1_lane_u32(reinterpret_cast<std::uint32_t*>(dst_ptr), + vreinterpret_u32_u8(q_8), 0); + dst_ptr += 4; + src_ptr += 4; + rank_one_update_ptr += 4; + } + // We have finished handling 4 entries at once; now handle + // remaining entries one by one. + for (int r = dst_rows_aligned4; r < dst->rows(); r++) { + std::int32_t q = src(r, c); + q += packed_lhs.rank_one_update()[r] + rank1update + rank0update; + q = ((q + result_offset) * result_mult_int + (1 << (result_shift - 1))) >> + result_shift; + (*dst)(r, c) = q > 255 ? 255 : q < 0 ? 0 : q; + } + } +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_UNPACK_NEON_H_ diff --git a/profiling/instrumentation.h b/profiling/instrumentation.h new file mode 100644 index 0000000..b1592c8 --- /dev/null +++ b/profiling/instrumentation.h @@ -0,0 +1,217 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// instrumentation.h: contains the definitions needed to +// instrument code for profiling: +// ScopedProfilingLabel, RegisterCurrentThreadForProfiling. +// +// profiler.h is only needed to drive the profiler: +// StartProfiling, FinishProfiling. +// +// See the usage example in profiler.h. 
+ +#ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_ +#define GEMMLOWP_PROFILING_INSTRUMENTATION_H_ + +#include <pthread.h> +#include <cstdint> +#include <cassert> +#include <cstdlib> +#include <algorithm> + +#ifdef GEMMLOWP_PROFILING +#include <set> +#include <cstdio> +#include <cstring> +#endif + +// We should always use C++11 thread_local; unfortunately that +// isn't fully supported on Apple yet. +#ifdef __APPLE__ +#define GEMMLOWP_THREAD_LOCAL static __thread +#else +#define GEMMLOWP_THREAD_LOCAL thread_local +#endif + +namespace gemmlowp { + +inline void ReleaseBuildAssertion(bool condition, const char* msg) { + if (!condition) { + fprintf(stderr, "gemmlowp error: %s\n", msg); + abort(); + } +} + +// To be used as template parameter for GlobalLock. +// GlobalLock<ProfilerLockId> is the profiler global lock: +// registering threads, starting profiling, finishing profiling, and +// the profiler itself as it samples threads, all need to lock it. +struct ProfilerLockId; + +// A very plain global lock. Templated in LockId so we can have multiple +// locks, one for each LockId type. +template <typename LockId> +class GlobalLock { + static pthread_mutex_t* Mutex() { + static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; + return &m; + } + + public: + static void Lock() { pthread_mutex_lock(Mutex()); } + static void Unlock() { pthread_mutex_unlock(Mutex()); } +}; + +// A very simple RAII helper to lock and unlock a GlobalLock +template <typename LockId> +struct AutoGlobalLock { + AutoGlobalLock() { GlobalLock<LockId>::Lock(); } + ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); } +}; + +// MemoryBarrier is purely a compile-time thing; it tells two things +// to the compiler: +// 1) It prevents reordering code across it +// (thanks to the 'volatile' after 'asm') +// 2) It requires the compiler to assume that any value previously +// read from memory, may have changed. Thus it offers an alternative +// to using 'volatile' variables. +inline void MemoryBarrier() { asm volatile("" ::: "memory"); } + +// Profiling definitions. Two paths: when profiling is enabled, +// and when profiling is disabled. +#ifdef GEMMLOWP_PROFILING +// This code path is when profiling is enabled. + +// A pseudo-call-stack. Contrary to a real call-stack, this only +// contains pointers to literal strings that were manually entered +// in the instrumented code (see ScopedProfilingLabel). 
+struct ProfilingStack { + static const std::size_t kMaxSize = 15; + typedef const char* LabelsArrayType[kMaxSize]; + LabelsArrayType labels; + std::size_t size; + + ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); } + + void Push(const char* label) { + MemoryBarrier(); + ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow"); + labels[size] = label; + MemoryBarrier(); + size++; + MemoryBarrier(); + } + + void Pop() { + MemoryBarrier(); + ReleaseBuildAssertion(size > 0, "ProfilingStack underflow"); + size--; + MemoryBarrier(); + } + + void UpdateTop(const char* new_label) { + MemoryBarrier(); + assert(size); + labels[size - 1] = new_label; + MemoryBarrier(); + } + + ProfilingStack& operator=(const ProfilingStack& other) { + memcpy(this, &other, sizeof(ProfilingStack)); + return *this; + } + + bool operator==(const ProfilingStack& other) const { + return !memcmp(this, &other, sizeof(ProfilingStack)); + } +}; + +static_assert( + !(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)), + "ProfilingStack should have power-of-two size to fit in cache lines"); + +struct ThreadInfo; + +// The global set of threads being profiled. +inline std::set<ThreadInfo*>& ThreadsUnderProfiling() { + static std::set<ThreadInfo*> v; + return v; +} + +struct ThreadInfo { + pthread_key_t key; // used only to get a callback at thread exit. + ProfilingStack stack; + + ThreadInfo() { + pthread_key_create(&key, ThreadExitCallback); + pthread_setspecific(key, this); + } + + static void ThreadExitCallback(void* ptr) { + AutoGlobalLock<ProfilerLockId> lock; + ThreadInfo* self = static_cast<ThreadInfo*>(ptr); + ThreadsUnderProfiling().erase(self); + pthread_key_delete(self->key); + } +}; + +inline ThreadInfo& ThreadLocalThreadInfo() { + GEMMLOWP_THREAD_LOCAL ThreadInfo i; + return i; +} + +// ScopedProfilingLabel is how one instruments code for profiling +// with this profiler. Construct local ScopedProfilingLabel variables, +// passing a literal string describing the local code. Profile +// samples will then be annotated with this label, while it is in scope +// (whence the name --- also known as RAII). +// See the example in profiler.h. +class ScopedProfilingLabel { + ProfilingStack* profiling_stack_; + + public: + explicit ScopedProfilingLabel(const char* label) + : profiling_stack_(&ThreadLocalThreadInfo().stack) { + profiling_stack_->Push(label); + } + + ~ScopedProfilingLabel() { profiling_stack_->Pop(); } + + void Update(const char* new_label) { profiling_stack_->UpdateTop(new_label); } +}; + +// To be called once on each thread to be profiled. +inline void RegisterCurrentThreadForProfiling() { + AutoGlobalLock<ProfilerLockId> lock; + ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo()); +} + +#else // not GEMMLOWP_PROFILING +// This code path is when profiling is disabled. + +// This empty definition of ScopedProfilingLabel ensures that +// it has zero runtime overhead when profiling is disabled. +struct ScopedProfilingLabel { + explicit ScopedProfilingLabel(const char*) {} + void Update(const char*) {} +}; + +inline void RegisterCurrentThreadForProfiling() {} + +#endif + +} // end namespace gemmlowp + +#endif // GEMMLOWP_PROFILING_INSTRUMENTATION_H_ diff --git a/profiling/profiler.h b/profiling/profiler.h new file mode 100644 index 0000000..9ea7a9f --- /dev/null +++ b/profiling/profiler.h @@ -0,0 +1,373 @@ +// Copyright 2014 Google Inc. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// profiler.h: a simple sampling profiler that's always just one #include away!
+//
+// Overview
+// ========
+//
+// This profiler only samples a pseudo-stack, not the actual call stack.
+// The code to be profiled needs to be instrumented with
+// pseudo-stack "labels", see ScopedProfilingLabel.
+// Using pseudo-stacks allows this profiler to be very simple, low-overhead,
+// portable, and independent of compilation details such as function inlining
+// and frame pointers. The granularity of instrumentation can be freely chosen,
+// and it is possible to get some annotate-like detail, i.e. detail within one
+// function without splitting it into multiple functions.
+//
+// This profiler should remain small and simple; its key feature is to fit in
+// a single header file so that there should never be a reason to refrain
+// from profiling. More complex and feature-rich alternatives are
+// readily available; for example, https://github.com/bgirard/GeckoProfiler
+// offers a strict superset of this profiler's functionality, including
+// intertwining pseudo-stacks with real call stacks, more annotation options,
+// and advanced visualization.
+//
+// Usage
+// =====
+//
+// 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is
+// not enabled, the profiling instrumentation from instrumentation.h
+// (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
+// is still defined but does nothing. On the other hand,
+// when profiling is not enabled, it is an error to #include the
+// present file.
+//
+// 1. Each thread can opt in to profiling by calling
+// RegisterCurrentThreadForProfiling() defined in instrumentation.h.
+// This can be done at any time, before or during profiling.
+// No sample will be collected from a thread until
+// it has called RegisterCurrentThreadForProfiling().
+//
+// 2. Instrument your code to be profiled with ScopedProfilingLabel,
+// which is a RAII helper defined in instrumentation.h. The identifier
+// names (some_label, etc) do not matter; what will show up
+// in the profile is the string passed to the constructor, which
+// must be a literal string. See the full example below.
+//
+// Note: the overhead of ScopedProfilingLabel is zero when not
+// enabling profiling (when not defining GEMMLOWP_PROFILING).
+//
+// 3. Use the profiler.h interface to control profiling. There are two
+// functions: StartProfiling() and FinishProfiling(). They must be
+// called on the same thread. FinishProfiling() prints the profile
+// on stdout.
+// +// Full example +// ============ +/* + #define GEMMLOWP_PROFILING + #include "profiling/instrumentation.h" + using namespace gemmlowp; + + const int iters = 100000000; + volatile int i; + + void Bar() { + ScopedProfilingLabel label("Bar"); + for (i = 0; i < iters; i++) {} + } + + void Foo() { + ScopedProfilingLabel label("Foo"); + for (i = 0; i < iters; i++) {} + Bar(); + } + + void Init() { + RegisterCurrentThreadForProfiling(); + } + + #include "profiling/profiler.h" + + int main() { + Init(); + StartProfiling(); + Foo(); + FinishProfiling(); + } +* +* Output: +* + gemmlowp profile (1 threads, 304 samples) + 100.00% Foo + 51.32% other + 48.68% Bar + 0.00% other (outside of any label) +*/ +// +// Interpreting results +// ==================== +// +// Each node shows the absolute percentage, among all the samples, +// of the number of samples that recorded the given pseudo-stack. +// The percentages are *NOT* relative to the parent node. In addition +// to your own labels, you will also see 'other' nodes that collect +// the remainder of samples under the parent node that didn't fall into +// any of the labelled child nodes. Example: +// +// 20% Foo +// 12% Bar +// 6% Xyz +// 2% other +// +// This means that 20% of all labels were under Foo, of which 12%/20%==60% +// were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not +// under either Bar or Xyz. +// +// Typically, one wants to keep adding ScopedProfilingLabel's until +// the 'other' nodes show low percentages. +// +// Interpreting results with multiple threads +// ========================================== +// +// At each sample, each thread registered for profiling gets sampled once. +// So if there is one "main thread" spending its time in MainFunc() and +// 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the +// samples will be in WorkerFunc, so the profile will look like this: +// +// 80% WorkerFunc +// 20% MainFunc + +#ifndef GEMMLOWP_PROFILING_PROFILER_H_ +#define GEMMLOWP_PROFILING_PROFILER_H_ + +#ifndef GEMMLOWP_PROFILING +#error Profiling is not enabled! +#endif + +#include <vector> + +#include "profiling/instrumentation.h" + +namespace gemmlowp { + +// A tree view of a profile. 
+class ProfileTreeView { + struct Node { + std::vector<Node*> children; + const char* label; + std::size_t weight; + Node() : label(nullptr), weight(0) {} + ~Node() { + for (auto child : children) { + delete child; + } + } + }; + + static bool CompareNodes(Node* n1, Node* n2) { + return n1->weight > n2->weight; + } + + Node root_; + + void PrintNode(const Node* node, int level) const { + if (level) { + for (int i = 1; i < level; i++) { + printf(" "); + } + printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label); + } + for (auto child : node->children) { + PrintNode(child, level + 1); + } + } + + static void AddStackToNode(const ProfilingStack& stack, Node* node, + std::size_t level) { + node->weight++; + if (stack.size == level) { + return; + } + Node* child_to_add_to = nullptr; + for (auto child : node->children) { + if (child->label == stack.labels[level]) { + child_to_add_to = child; + break; + } + } + if (!child_to_add_to) { + child_to_add_to = new Node; + child_to_add_to->label = stack.labels[level]; + node->children.push_back(child_to_add_to); + } + AddStackToNode(stack, child_to_add_to, level + 1); + return; + } + + void AddStack(const ProfilingStack& stack) { + AddStackToNode(stack, &root_, 0); + } + + void AddOtherChildrenToNode(Node* node) { + std::size_t top_level_children_weight = 0; + for (auto c : node->children) { + AddOtherChildrenToNode(c); + top_level_children_weight += c->weight; + } + if (top_level_children_weight) { + Node* other_child = new Node; + other_child->label = + node == &root_ ? "other (outside of any label)" : "other"; + other_child->weight = node->weight - top_level_children_weight; + node->children.push_back(other_child); + } + } + + void AddOtherNodes() { AddOtherChildrenToNode(&root_); } + + void SortNode(Node* node) { + std::sort(node->children.begin(), node->children.end(), CompareNodes); + for (auto child : node->children) { + SortNode(child); + } + } + + void Sort() { SortNode(&root_); } + + public: + explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) { + for (auto stack : stacks) { + AddStack(stack); + } + AddOtherNodes(); + Sort(); + } + + void Print() const { + printf("\n"); + printf("gemmlowp profile (%d threads, %d samples)\n", + static_cast<int>(ThreadsUnderProfiling().size()), + static_cast<int>(root_.weight)); + PrintNode(&root_, 0); + printf("\n"); + } +}; + +// This function is the only place that determines our sampling frequency. +inline void WaitOneProfilerTick() { + static const int millisecond = 1000000; + +#if defined __arm__ || defined __aarch64__ + // Reduced sampling frequency on mobile devices helps limit time and memory + // overhead there. + static const int interval = 10 * millisecond; +#else + static const int interval = 1 * millisecond; +#endif + + timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = interval; + nanosleep(&ts, nullptr); +} + +// This is how we track whether we've already started profiling, +// to guard against misuse of the API. +inline bool& IsProfiling() { + static bool b; + return b; +} + +// This is how we tell the profiler thread to finish. +inline bool& ProfilerThreadShouldFinish() { + static bool b; + return b; +} + +// The profiler thread. See ProfilerThreadFunc. +inline pthread_t& ProfilerThread() { + static pthread_t t; + return t; +} + +// Records a stack from a running thread. +// The tricky part is that we're not interrupting the thread. 
+// This is OK because we're looking at a pseudo-stack of labels, +// not at the real thread stack, and if the pseudo-stack changes +// while we're recording it, we are OK with getting either the +// old or the new stack. Note that ProfilingStack::Pop +// only decrements the size, and doesn't null the popped label, +// so if we're concurrently recording it, it shouldn't change +// under our feet until another label is pushed, at which point +// we are OK with getting either this new label or the old one. +// In the end, the key atomicity property that we are relying on +// here is that pointers are changed atomically, and the labels +// are pointers (to literal strings). +inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) { + assert(!dst->size); + while (dst->size < thread->stack.size) { + dst->labels[dst->size] = thread->stack.labels[dst->size]; + dst->size++; + MemoryBarrier(); // thread->stack can change at any time + } +} + +// The profiler thread's entry point. +// Note that a separate thread is to be started each time we call +// StartProfiling(), and finishes when we call FinishProfiling(). +// So here we only need to handle the recording and reporting of +// a single profile. +inline void* ProfilerThreadFunc(void*) { + assert(ProfilerThread() == pthread_self()); + + // Since we only handle one profile per profiler thread, the + // profile data (the array of recorded stacks) can be a local variable here. + std::vector<ProfilingStack> stacks; + + while (!ProfilerThreadShouldFinish()) { + WaitOneProfilerTick(); + { + AutoGlobalLock<ProfilerLockId> lock; + for (auto t : ThreadsUnderProfiling()) { + ProfilingStack s; + RecordStack(t, &s); + stacks.push_back(s); + } + } + } + + // Profiling is finished and we now report the results. + ProfileTreeView(stacks).Print(); + + return nullptr; +} + +// Starts recording samples. +inline void StartProfiling() { + AutoGlobalLock<ProfilerLockId> lock; + ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!"); + IsProfiling() = true; + ProfilerThreadShouldFinish() = false; + pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr); +} + +// Stops recording samples, and prints a profile tree-view on stdout. +inline void FinishProfiling() { + { + AutoGlobalLock<ProfilerLockId> lock; + ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!"); + // The ProfilerThreadShouldFinish() mechanism here is really naive and bad, + // as the scary comments below should make clear. + // Should we use a condition variable? + ProfilerThreadShouldFinish() = true; + } // must release the lock here to avoid deadlock with profiler thread. + pthread_join(ProfilerThread(), nullptr); + IsProfiling() = false; // yikes, this should be guarded by the lock! +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_PROFILING_PROFILER_H_ diff --git a/public/gemmlowp.h b/public/gemmlowp.h new file mode 100644 index 0000000..7b0f35e --- /dev/null +++ b/public/gemmlowp.h @@ -0,0 +1,52 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// gemmlowp.h: the main public interface header of gemmlowp. + +#ifndef GEMMLOWP_PUBLIC_GEMMLOWP_H_ +#define GEMMLOWP_PUBLIC_GEMMLOWP_H_ + +#include "public/map.h" +#include "internal/multi_thread_gemm.h" +#include "internal/kernel_default.h" + +namespace gemmlowp { + +class GemmContext : public MultiThreadGemmContext {}; + +// Computes a general matrix product ("GEMM"). +// The meaning of the offsets, result_mult_int and result_shift +// parameters is the same as in the standard EightBitIntGemm interface +// (which is also implemented in the eight_bit_int_gemm directory). +template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder, + MapOrder ResultOrder> +void Gemm(GemmContext* context, const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + if (rhs.cols() > DefaultKernelForGEMM::Format::kCols / 2) { + MultiThreadGemm<DefaultKernelForGEMM::Format>( + context, DefaultKernelForGEMM(), lhs, rhs, result, lhs_offset, + rhs_offset, result_offset, result_mult_int, result_shift); + } else { + MultiThreadGemm<DefaultKernelForGEMV::Format>( + context, DefaultKernelForGEMV(), lhs, rhs, result, lhs_offset, + rhs_offset, result_offset, result_mult_int, result_shift); + } +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_PUBLIC_GEMMLOWP_H_ diff --git a/public/map.h b/public/map.h new file mode 100644 index 0000000..ba92c08 --- /dev/null +++ b/public/map.h @@ -0,0 +1,77 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// map.h: a minimalist view-existing-buffer-as-a-matrix class, +// which is how gemmlowp interfaces with external matrix data. + +#ifndef GEMMLOWP_PUBLIC_MAP_H_ +#define GEMMLOWP_PUBLIC_MAP_H_ + +#include "internal/common.h" + +namespace gemmlowp { + +// The two storage orders allowed to map buffers as matrices: ColMajor +// means column-major, RowMajor means row-major. +enum class MapOrder { ColMajor, RowMajor }; + +// A MatrixMap is a view of an existing buffer as a matrix. It does not own +// the buffer. +template <typename tScalar, MapOrder tOrder> +class MatrixMap { + public: + typedef tScalar Scalar; + static const MapOrder kOrder = tOrder; + + protected: + Scalar* data_; // not owned. + int rows_, cols_, stride_; + + public: + MatrixMap(Scalar* data, int rows, int cols, int stride) + : data_(data), rows_(rows), cols_(cols), stride_(stride) {} + + MatrixMap(const MatrixMap& other) + : data_(other.data_), + rows_(other.rows_), + cols_(other.cols_), + stride_(other.stride_) {} + + int rows() const { return rows_; } + int cols() const { return cols_; } + int stride() const { return stride_; } + int rows_stride() const { return kOrder == MapOrder::ColMajor ? 
1 : stride_; } + int cols_stride() const { return kOrder == MapOrder::RowMajor ? 1 : stride_; } + Scalar* data() const { return data_; } + Scalar* data(int row, int col) const { + return data_ + row * rows_stride() + col * cols_stride(); + } + Scalar operator()(int row, int col) const { return *data(row, col); } + Scalar& operator()(int row, int col) { return *data(row, col); } + + MatrixMap block(int start_row, int start_col, int block_rows, + int block_cols) const { + assert(start_row >= 0); + assert(start_row + block_rows <= rows_); + assert(start_col >= 0); + assert(start_col + block_cols <= cols_); + + return MatrixMap(data(start_row, start_col), block_rows, block_cols, + stride_); + } +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_PUBLIC_MAP_H_ diff --git a/scripts/prepare-device-for-benchmarking.sh b/scripts/prepare-device-for-benchmarking.sh new file mode 100755 index 0000000..16ac2eb --- /dev/null +++ b/scripts/prepare-device-for-benchmarking.sh @@ -0,0 +1,138 @@ +# Copyright 2014 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Puts device in a special state giving optimal benchmark results. +# Not very realistic wrt real-world conditions. We are more interested +# in performance on devices in default state. + +#!/bin/bash + +echo "disabling mpdecision..." +adb root +adb remount +adb shell "mv /system/bin/mpdecision /system/bin/mpdecision-dontfind" > /dev/null + +echo "rebooting device..." +adb reboot +adb wait-for-device + +echo "restarting adbd as root..." +isroot=0 +while [ $isroot -eq 0 ] +do + adb root > /dev/null + if [ $? -eq 0 ] + then + isroot=1 + fi + echo " retrying in 1 s..." + sleep 1 +done + +echo "querying ro.hardware..." +hardware="`adb shell getprop ro.hardware`" +while [ "$hardware" == "" ] +do + echo "retrying in 1 s..." + sleep 1 + hardware="`adb shell getprop ro.hardware`" +done + +echo "got ro.hardware=$hardware" + +shouldstopui=0 +if [[ "$#" =~ .*--stop-ui.* ]] +then + shouldstopui=1 +else + if [[ "$hardware" =~ sprout.* ]] + then + echo "detected Android One (sprout). Will default to leaving the UI on." + else + echo "Will default to stopping the UI." + shouldstopui=1 + fi +fi + +if [ $shouldstopui -ne 0 ] +then + echo "stopping the UI..." + isshellstopped=0 + while [ $isshellstopped -eq 0 ] + do + adb shell stop > /dev/null + if [ $? -eq 0 ] + then + isshellstopped=1 + fi + echo " retrying in 1 s..." + sleep 1 + done +fi + +waitsec=10 +echo "sleeping $waitsec s before changing CPU settings, to work around a race with Android startup..." +sleep $waitsec + +echo "bringing all cores online..." +for cpu in `seq 0 3` +do + file="/sys/devices/system/cpu/cpu$cpu/online" + if [ -e $file ] + then + echo " cpu $cpu" + echo "echo 1 > $file; exit" | adb shell > /dev/null + if [ $? -ne 0 ] + then + echo "WARNING: failed to bring cpu $cpu online ($file)" + fi + fi +done + +echo "setting performance governor..." 
+for cpu in `seq 0 3` +do + file="/sys/devices/system/cpu/cpu$cpu/cpufreq/scaling_governor" + if [ -e $file ] + then + echo " cpu $cpu" + adb shell "echo performance > $file" > /dev/null + if [ $? -ne 0 ] + then + echo "WARNING: failed to set cpufreq governor ($file)" + fi + fi +done + +cpuloadlowsecs=0 +echo "waiting for CPU load to settle down..." +while [ $cpuloadlowsecs -lt 5 ] +do + cpuload="`adb shell top -n 1 -d 1 -s cpu | awk '{sum += $3} END {print sum}'`" + if [ "$cpuload" -lt "2" ] + then + cpuloadlowsecs=$((cpuloadlowsecs+1)) + echo "CPU load has been low for $cpuloadlowsecs s..." + else + cpuloadlowsecs=0 + echo "CPU load isn't low enough ($cpuload %)..." + fi + sleep 1 +done + +if [ $shouldstopui -eq 0 ] +then + echo "OK, the device might be ready now, but the UI is still running," + echo "so take a look at the screen to check if it's not doing something special." +fi diff --git a/scripts/restore-device-normal-state.sh b/scripts/restore-device-normal-state.sh new file mode 100755 index 0000000..583a24d --- /dev/null +++ b/scripts/restore-device-normal-state.sh @@ -0,0 +1,42 @@ +# Copyright 2014 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Un-does the effect of prepare-device-for-benchmarking.sh + +#!/bin/bash + +echo "restoring mpdecision..." +adb root +adb remount +adb shell "mv /system/bin/mpdecision-dontfind /system/bin/mpdecision" > /dev/null + +echo "rebooting device..." +adb reboot +adb wait-for-device + +cpuloadlowsecs=0 +echo "waiting for CPU load to settle down..." +while [ $cpuloadlowsecs -lt 5 ] +do + cpuload="`adb shell top -n 1 -d 1 -s cpu | awk '{sum += $3} END {print sum}'`" + if [ "$cpuload" -lt "2" ] + then + cpuloadlowsecs=$((cpuloadlowsecs+1)) + echo "CPU load has been low for $cpuloadlowsecs s..." + else + cpuloadlowsecs=0 + echo "CPU load isn't low enough ($cpuload %)..." + fi + sleep 1 +done diff --git a/scripts/test-android.sh b/scripts/test-android.sh new file mode 100755 index 0000000..b5d83d7 --- /dev/null +++ b/scripts/test-android.sh @@ -0,0 +1,83 @@ +# Copyright 2014 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash + +if [ -z "$CXX" ] +then + echo "please set the CXX environment variable to point to your native Android toolchain C++ compiler" + exit 1 +fi + +default_cflags="-O3" + +if [ "$#" -eq 0 ] +then + echo "Usage: $0 files... 
[cflags...]" + echo "All command-line parameters are passed along to the C++ compiler, so they can \ +be either source files, or compiler flags." + echo "Default cflags: $default_cflags" + echo "Relies on the CXX environment variable to point to an Android C++ toolchain compiler." + exit 1 +fi + +EXE=gemmlowp-android-binary + +$CXX \ + --std=c++11 \ + -Wall -Wextra -pedantic \ + -fPIE -pie -mfpu=neon -mfloat-abi=softfp \ + -lstdc++ -latomic \ + -I . -I .. \ + -o $EXE \ + -Wno-unused-variable -Wno-unused-parameter \ + $default_cflags \ + $* + +if [ $? != 0 ]; then + echo "build failed" + exit 1 +fi + +adb root + +if [ $? != 0 ]; then + echo "$0: adb root failed" + exit 1 +fi + +adb shell mkdir -p /data/local/tmp + +if [ $? != 0 ]; then + echo "$0: adb shell failed to mkdir /data/local/tmp" + exit 1 +fi + +adb push $EXE /data/local/tmp + +if [ $? != 0 ]; then + echo "$0: adb push failed to write to /data/local/tmp" + exit 1 +fi + +adb shell "echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor" + +adb shell "/data/local/tmp/$EXE" | tee "log-$EXE" + +if [ $? != 0 ]; then + echo "$0: adb shell failed to run binary on device" + exit 1 +fi + +adb shell "echo ondemand > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor" diff --git a/test/benchmark.cc b/test/benchmark.cc new file mode 100644 index 0000000..d59164a --- /dev/null +++ b/test/benchmark.cc @@ -0,0 +1,210 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <unistd.h> +#ifdef __APPLE__ +#include <sys/time.h> +#endif + +#include <iostream> +#include <ctime> +#include <cstdint> +#include <vector> +#include <map> +#include <cstdlib> + +#include "test/test.h" +#include "public/gemmlowp.h" + +namespace gemmlowp { + +double time() { +#ifdef __APPLE__ + timeval t; + gettimeofday(&t, nullptr); + return t.tv_sec + 1e-6 * t.tv_usec; +#else + timespec t; + clock_gettime(CLOCK_REALTIME, &t); + return t.tv_sec + 1e-9 * t.tv_nsec; +#endif +} + +const double min_accurate_duration = 1e-1; +const std::size_t min_working_set_size = 16 * 1024 * 1024; + +template <typename Kernel, typename LhsType, typename RhsType, + typename ResultType> +double gflops_for_gemm_size(GemmContext* context, int rows, int depth, + int cols) { + typedef std::uint8_t Scalar; + + // set up the matrix pool + + const std::size_t combined_three_matrices_sizes = + sizeof(Scalar) * (rows * depth + depth * cols + rows * cols); + + const std::size_t matrix_pool_size = + 1 + min_working_set_size / combined_three_matrices_sizes; + + std::vector<LhsType> lhs(matrix_pool_size); + std::vector<RhsType> rhs(matrix_pool_size); + std::vector<ResultType> result(matrix_pool_size); + + lhs[0].Resize(rows, depth); + MakeConstant(&lhs[0], 128); + rhs[0].Resize(depth, cols); + MakeConstant(&rhs[0], 128); + result[0].Resize(rows, cols); + MakeZero(&result[0]); + + for (std::size_t i = 1; i < matrix_pool_size; i++) { + lhs[i] = lhs[0]; + rhs[i] = rhs[0]; + result[i] = result[0]; + } + + const int depth_shift = static_cast<int>( + std::ceil(0.5 * std::log(static_cast<float>(depth)) / std::log(2.0f))); + + // main benchmark loop + + int iters_at_a_time = 1; + float time_per_iter = 0.0f; + std::size_t matrix_index = 0; + + while (true) { + double starttime = time(); + for (int i = 0; i < iters_at_a_time; i++) { + Gemm(context, lhs[matrix_index].const_map(), + rhs[matrix_index].const_map(), &result[matrix_index].map(), -75, -91, + 74980, 123, 18 + depth_shift); + + matrix_index++; + if (matrix_index == matrix_pool_size) { + matrix_index = 0; + } + } + double endtime = time(); + + const float timing = static_cast<float>(endtime - starttime); + + if (timing >= min_accurate_duration) { + time_per_iter = timing / iters_at_a_time; + break; + } + + iters_at_a_time *= 2; + } + + return 2e-9 * rows * depth * cols / time_per_iter; +} + +void benchmark(GemmContext* context) { +#ifdef GEMMLOWP_TEST_KERNEL + typedef gemmlowp::GEMMLOWP_TEST_KERNEL KernelForGEMM; + typedef gemmlowp::GEMMLOWP_TEST_KERNEL KernelForGEMV; +#else + typedef gemmlowp::DefaultKernelForGEMM KernelForGEMM; + typedef gemmlowp::DefaultKernelForGEMV KernelForGEMV; +#endif + + std::map<std::tuple<int, int, int>, std::vector<double>> benchmark_results; + + std::vector<std::tuple<int, int, int>> benchmark_sizes; + benchmark_sizes.emplace_back(10, 10, 10); + benchmark_sizes.emplace_back(20, 20, 20); + benchmark_sizes.emplace_back(30, 30, 30); + benchmark_sizes.emplace_back(40, 40, 40); + benchmark_sizes.emplace_back(50, 50, 50); + benchmark_sizes.emplace_back(60, 60, 60); + benchmark_sizes.emplace_back(64, 256, 147); + benchmark_sizes.emplace_back(100, 100, 1); + benchmark_sizes.emplace_back(100, 100, 100); + benchmark_sizes.emplace_back(100, 1000, 100); + benchmark_sizes.emplace_back(1000, 1000, 1); + benchmark_sizes.emplace_back(1000, 1000, 10); + benchmark_sizes.emplace_back(1000, 1000, 100); + benchmark_sizes.emplace_back(1000, 1000, 1000); + + const int repeat = 2; + + typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; + typedef 
Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; + typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType; + +#ifdef GEMMLOWP_TEST_PROFILE + gemmlowp::RegisterCurrentThreadForProfiling(); + gemmlowp::StartProfiling(); +#endif + + // We don't record the first repetition, it's just warm-up. + for (int r = 0; r < repeat + 1; r++) { + std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r" + << std::flush; + for (auto s : benchmark_sizes) { + double gflops = 0; + int rows = std::get<0>(s); + int depth = std::get<1>(s); + int cols = std::get<2>(s); + if (cols > KernelForGEMM::Format::kCols / 2) { + gflops = + gflops_for_gemm_size<KernelForGEMM, LhsType, RhsType, ResultType>( + context, rows, depth, cols); + } else { + gflops = + gflops_for_gemm_size<KernelForGEMV, LhsType, RhsType, ResultType>( + context, rows, depth, cols); + } + if (r > 0) { + benchmark_results[s].emplace_back(gflops); + } + } + } + + std::cout << " \r" + << std::flush; + +#ifdef GEMMLOWP_TEST_PROFILE + gemmlowp::FinishProfiling(); +#endif + + std::cout.precision(4); + + for (auto b : benchmark_results) { + sort(b.second.begin(), b.second.end()); + std::cout << std::get<0>(b.first) << "x" << std::get<1>(b.first) << "x" + << std::get<2>(b.first) << " : " << b.second.back() << " GFlops/s" + << std::endl; + } + std::cout << std::endl; +} + +} // end namespace gemmlowp + +int main() { + { + gemmlowp::GemmContext context; + std::cout << "Benchmarking default mode (typically multi-threaded)..." + << std::endl; + gemmlowp::benchmark(&context); + } + + { + gemmlowp::GemmContext context; + context.set_max_num_threads(1); + std::cout << "Benchmarking single-threaded mode..." << std::endl; + gemmlowp::benchmark(&context); + } +} diff --git a/test/test.cc b/test/test.cc new file mode 100644 index 0000000..5574640 --- /dev/null +++ b/test/test.cc @@ -0,0 +1,526 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test/test.h" + +#include <unistd.h> +#include <iostream> +#include <ctime> +#include <cstdint> +#include <vector> +#include <cstdlib> +#include <string> + +#include "public/gemmlowp.h" +#include "internal/kernel_reference.h" +#include "eight_bit_int_gemm/eight_bit_int_gemm.h" + +namespace gemmlowp { + +struct ReferenceEightBitIntGemmContext { + ReferenceEightBitIntGemmContext() + : saturated_0_values(0), saturated_255_values(0) {} + + int saturated_0_values, saturated_255_values; +}; + +void ReferenceEightBitIntGemm(ReferenceEightBitIntGemmContext* context, int m, + int n, int k, const uint8_t* a, int32_t a_offset, + int lda, const uint8_t* b, int32_t b_offset, + int ldb, uint8_t* c, int32_t c_offset, + int32_t c_mult_int, int32_t c_shift, int ldc) { + context->saturated_0_values = 0; + context->saturated_255_values = 0; + int i, j, l; + + for (j = 0; j < n; j++) { + for (i = 0; i < m; i++) { + int32_t total = 0; + for (l = 0; l < k; l++) { + const int a_index = ((i * lda) + l); + const uint8_t a_as_byte = a[a_index]; + const int32_t a_as_int = ((static_cast<std::int32_t>(a_as_byte)) + a_offset); + const int b_index = ((j * ldb) + l); + const uint8_t b_as_byte = b[b_index]; + const int32_t b_as_int = ((static_cast<std::int32_t>(b_as_byte)) + b_offset); + const int32_t mult_as_int = (a_as_int * b_as_int); + total += mult_as_int; + } + const int c_index = ((ldc * i) + j); + int32_t output = + ((((total + c_offset) * c_mult_int) + (1 << (c_shift - 1))) >> + c_shift); + if (output >= 255) { + output = 255; + context->saturated_255_values++; + } + if (output <= 0) { + output = 0; + context->saturated_0_values++; + } + c[c_index] = static_cast<std::uint8_t>(output); + } + } +} + +// *GemmWrapper's allow to wrap various Gemm functions in a uniform +// interface, so we can use the same testing code to test all of them + +template <typename Kernel, typename Scalar, MapOrder LhsOrder, + MapOrder RhsOrder, MapOrder ResultOrder> +struct SingleThreadGemmWrapper { + static const int kLhsBitDepth = Kernel::kLhsBitDepth; + static const int kRhsBitDepth = Kernel::kRhsBitDepth; + + static const char* Name() { + static char buf[256]; + snprintf(buf, sizeof(buf), + "SingleThreadGemm, Kernel: %s", Kernel().Name()); + return buf; + } + + typedef SingleThreadGemmContext Context; + + static void Gemm(Context* context, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + SingleThreadGemm<typename Kernel::Format, Scalar, LhsOrder, RhsOrder, + ResultOrder>(context, Kernel(), lhs, rhs, result, + lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); + } +}; + +template <typename Kernel, typename Scalar, MapOrder LhsOrder, + MapOrder RhsOrder, MapOrder ResultOrder> +struct MultiThreadGemmWrapper { + static const int kLhsBitDepth = Kernel::kLhsBitDepth; + static const int kRhsBitDepth = Kernel::kRhsBitDepth; + + static const char* Name() { + static char buf[256]; + snprintf(buf, sizeof(buf), + "MultiThreadGemm, Kernel: %s", Kernel().Name()); + return buf; + } + + typedef MultiThreadGemmContext Context; + + static void Gemm(Context* context, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + MultiThreadGemm<typename 
Kernel::Format, Scalar, LhsOrder, RhsOrder, + ResultOrder>(context, Kernel(), lhs, rhs, result, + lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); + } +}; + +template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder, + MapOrder ResultOrder> +struct PublicGemmWrapper { + static const int kLhsBitDepth = 8; + static const int kRhsBitDepth = 8; + + static const char* Name() { return "public Gemm"; } + + typedef GemmContext Context; + + static void Gemm(Context* context, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + gemmlowp::Gemm<uint8_t, LhsOrder, RhsOrder, ResultOrder>( + context, lhs, rhs, result, lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); + } +}; + +template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder, + MapOrder ResultOrder> +struct EightBitIntGemmWrapper { + static const int kLhsBitDepth = 8; + static const int kRhsBitDepth = 8; + + static const char* Name() { return "EightBitIntGemm"; } + + typedef void Context; + + static void Gemm(Context*, const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + eight_bit_int_gemm::EightBitIntGemm( + rhs.cols(), lhs.rows(), lhs.cols(), rhs.data(), rhs_offset, + rhs.stride(), lhs.data(), lhs_offset, lhs.stride(), result->data(), + result_offset, result_mult_int, result_shift, result->stride()); + } +}; + +template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder, + MapOrder ResultOrder> +struct ReferenceEightBitIntGemmWrapper { + static const int kLhsBitDepth = 8; + static const int kRhsBitDepth = 8; + + static const char* Name() { return "ReferenceEightBitIntGemm"; } + + typedef ReferenceEightBitIntGemmContext Context; + + static void Gemm(Context* context, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + ReferenceEightBitIntGemm( + context, rhs.cols(), lhs.rows(), lhs.cols(), rhs.data(), rhs_offset, + rhs.stride(), lhs.data(), lhs_offset, lhs.stride(), result->data(), + result_offset, result_mult_int, result_shift, result->stride()); + } +}; + +// Our approach to choosing result_shift values for testing, is bisection. +// This function takes an interval, [result_shift_min .. result_shift_max]. +// If too much saturation occurred in either direction, it bisects accordingly, +// recursing until the interval contains only one value. +// The primary reason why we prefer this over computing optimal shift values, +// is that we actually want to exercise some saturation, as there is nontrivial +// code handling that in gemmlowp. +// Secondarily, this is faster than computing optimal shifts, since in 90% of +// cases the first-tried shift value 16 turns out to be good enough. 
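+// A worked illustration of that bisection (the numbers are only an example,
+// not from a real run): test_gemm() below starts the recursion with the
+// interval [0 .. 32], so the first shift tried is (0 + 32) / 2 = 16. If the
+// reference result then saturates to 255 too often (more than 1/16 of the
+// result entries), the outputs were too large and a bigger shift is needed,
+// so the interval narrows to [16 .. 32] and the next try is 24. If instead
+// too many outputs saturate to 0, the interval narrows to [0 .. 16] and the
+// next try is 8. The recursion stops once saturation is within bounds or the
+// interval has collapsed to a single value.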
+template <typename GemmWrapper, typename LhsType, typename RhsType, + typename ResultType> +void test_gemm_impl(typename GemmWrapper::Context* context, const LhsType& lhs, + const RhsType& rhs, ResultType* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift_min, int result_shift_max) { + const int rows = lhs.rows(); + const int cols = rhs.cols(); + Check(lhs.cols() == rhs.rows()); + const int depth = lhs.cols(); + + const int result_shift = (result_shift_min + result_shift_max) / 2; + + GemmWrapper::Gemm(context, lhs.const_map(), rhs.const_map(), &result->map(), + lhs_offset, rhs_offset, result_offset, result_mult_int, + result_shift); + + typedef typename ResultType::Scalar Scalar; + static const MapOrder kLhsOrder = LhsType::kOrder; + static const MapOrder kRhsOrder = RhsType::kOrder; + static const MapOrder kResultOrder = ResultType::kOrder; + ResultType ref_result(rows, cols); + ReferenceEightBitIntGemmContext reference_context; + ReferenceEightBitIntGemmWrapper<Scalar, kLhsOrder, kRhsOrder, + kResultOrder>::Gemm(&reference_context, + lhs.const_map(), + rhs.const_map(), + &ref_result.map(), + lhs_offset, rhs_offset, + result_offset, + result_mult_int, + result_shift); + + const bool good = *result == ref_result; + printf("%s: %dx%dx%d, %s, offsets %d/%d/%d, mult %d, shift %d\n", + good ? "PASS" : "FAIL", rows, depth, cols, GemmWrapper::Name(), + lhs_offset, rhs_offset, result_offset, result_mult_int, result_shift); + + if (!good) { + int maxdiff = 0; + int countdiff = 0; + for (int c = 0; c < result->cols(); c++) { + for (int r = 0; r < result->rows(); r++) { + int a = (*result)(r, c); + int b = ref_result(r, c); + if (a != b) { + countdiff++; + maxdiff = std::max(maxdiff, std::abs(a - b)); + } + } + } + printf("max difference: %d\n", maxdiff); + printf("number of different places: %d\n", countdiff); + int bad_coeffs_printed = 0; + for (int c = 0; c < result->cols() && bad_coeffs_printed < 20; c++) { + for (int r = 0; r < result->rows() && bad_coeffs_printed < 20; r++) { + if (ref_result(r, c) != (*result)(r, c)) { + printf("bad coeff: at (%d, %d), expected %d, got %d\n", r, c, + ref_result(r, c), (*result)(r, c)); + bad_coeffs_printed++; + } + } + } + } + + Check(good); + + if (result_shift_min != result_shift_max) { + const int max_allowed_saturated_values = result->size() / 16; + + int new_result_shift_min = result_shift_min; + int new_result_shift_max = result_shift_max; + bool retry = false; + + if (reference_context.saturated_0_values > max_allowed_saturated_values) { + new_result_shift_max = (result_shift_min + result_shift_max) / 2; + retry = true; + } + + if (reference_context.saturated_255_values > max_allowed_saturated_values) { + new_result_shift_min = (result_shift_min + result_shift_max) / 2; + retry = true; + } + + if (retry) { + test_gemm_impl<GemmWrapper>(context, lhs, rhs, result, lhs_offset, + rhs_offset, result_offset, result_mult_int, + new_result_shift_min, new_result_shift_max); + } + } +} + +template <typename GemmWrapper, typename LhsType, typename RhsType, + typename ResultType> +void test_gemm(typename GemmWrapper::Context* context, const LhsType& lhs, + const RhsType& rhs, ResultType* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int) { + test_gemm_impl<GemmWrapper>(context, lhs, rhs, result, lhs_offset, rhs_offset, + result_offset, result_mult_int, 0, 32); +} +enum class WhatParamsToTest { + AllCombos, + OnlyGenericCase, +}; + +template <typename GemmWrapper> +void 
test_gemm(typename GemmWrapper::Context* context, int rows, int depth, + int cols, + WhatParamsToTest what_to_test = WhatParamsToTest::AllCombos) { + typedef std::uint8_t Scalar; + typedef Matrix<Scalar, MapOrder::RowMajor> LhsType; + LhsType lhs(rows, depth); + MakeRandom(&lhs, GemmWrapper::kLhsBitDepth); + typedef Matrix<Scalar, MapOrder::ColMajor> RhsType; + RhsType rhs(depth, cols); + MakeRandom(&rhs, GemmWrapper::kRhsBitDepth); + typedef Matrix<Scalar, MapOrder::ColMajor> ResultType; + ResultType result(rows, cols); + MakeZero(&result); + + if (what_to_test == WhatParamsToTest::AllCombos) { + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 0, 0, 1); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 10, 0, 0, 1); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 10, 0, 1); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 0, 10, 1); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 0, 0, 10); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 10, 10, 10, 10); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 256, 1, 17, 4); + } + test_gemm<GemmWrapper>(context, lhs, rhs, &result, -75, -91, 74980, 123); +} + +template <typename Kernel> +void test_gemm_kernel(MultiThreadGemmContext* context) { + typedef MultiThreadGemmWrapper<Kernel, uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, + MapOrder::ColMajor> GemmWrapper; + test_gemm<GemmWrapper>(context, 1, 1, 1, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 2, 2, 2, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 3, 3, 3, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 4, 4, 4, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 5, 5, 5, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 9, 11, 13, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 50, 50, 50, WhatParamsToTest::AllCombos); + test_gemm<GemmWrapper>(context, 500, 500, 500, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 100, 5000, 100, + WhatParamsToTest::OnlyGenericCase); +} + +template <typename GemmWrapper> +void test_gemm(typename GemmWrapper::Context* context) { + test_gemm<GemmWrapper>(context, 1, 1, 1); + test_gemm<GemmWrapper>(context, 2, 1, 1); + test_gemm<GemmWrapper>(context, 1, 2, 1); + test_gemm<GemmWrapper>(context, 1, 1, 2); + test_gemm<GemmWrapper>(context, 2, 2, 2); + test_gemm<GemmWrapper>(context, 3, 3, 3); + test_gemm<GemmWrapper>(context, 4, 4, 4); + test_gemm<GemmWrapper>(context, 5, 5, 5); + test_gemm<GemmWrapper>(context, 6, 6, 6); + test_gemm<GemmWrapper>(context, 3, 5, 7); + test_gemm<GemmWrapper>(context, 7, 3, 5); + test_gemm<GemmWrapper>(context, 5, 7, 3); + test_gemm<GemmWrapper>(context, 8, 8, 8); + + test_gemm<GemmWrapper>(context, 16, 16, 16); + test_gemm<GemmWrapper>(context, 32, 32, 32); + test_gemm<GemmWrapper>(context, 64, 64, 64); + test_gemm<GemmWrapper>(context, 128, 128, 128); + + test_gemm<GemmWrapper>(context, 17, 24, 31); + test_gemm<GemmWrapper>(context, 37, 55, 73); + test_gemm<GemmWrapper>(context, 57, 87, 117); + test_gemm<GemmWrapper>(context, 93, 83, 73); + test_gemm<GemmWrapper>(context, 109, 89, 99); + test_gemm<GemmWrapper>(context, 78, 101, 82); + + test_gemm<GemmWrapper>(context, 512, 512, 512, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1024, 1024, 1024, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 567, 2345, 123, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 100, 
5000, 100, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1, 1, 1000, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1000, 1, 1, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1, 1000, 1, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1, 1000, 1000, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1000, 1, 1000, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1000, 1000, 1, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 777, 3456, 1, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 4567, 555, 1, + WhatParamsToTest::OnlyGenericCase); +} + +template <typename GemmWrapper> +void test_gemv(typename GemmWrapper::Context* context) { + test_gemm<GemmWrapper>(context, 2, 2, 1); + test_gemm<GemmWrapper>(context, 3, 3, 1); + test_gemm<GemmWrapper>(context, 4, 4, 1); + test_gemm<GemmWrapper>(context, 5, 5, 1); + test_gemm<GemmWrapper>(context, 6, 6, 1); + test_gemm<GemmWrapper>(context, 3, 5, 1); + test_gemm<GemmWrapper>(context, 7, 3, 1); + test_gemm<GemmWrapper>(context, 5, 7, 1); + test_gemm<GemmWrapper>(context, 8, 8, 1); + test_gemm<GemmWrapper>(context, 32, 32, 1); + test_gemm<GemmWrapper>(context, 128, 128, 1); + test_gemm<GemmWrapper>(context, 321, 123, 1); +} + +void test() { +#ifdef GEMMLOWP_TEST_PROFILE + RegisterCurrentThreadForProfiling(); + StartProfiling(); +#endif + + GemmContext context; + + // Test the internal GEMM interfaces + test_gemm< + SingleThreadGemmWrapper<DefaultKernelForGEMM, uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>( + &context); + + test_gemm< + MultiThreadGemmWrapper<DefaultKernelForGEMM, uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>(&context); + + // Test the public GEMM interfaces + test_gemm<PublicGemmWrapper<uint8_t, MapOrder::RowMajor, MapOrder::ColMajor, + MapOrder::ColMajor>>(&context); + + test_gemm<EightBitIntGemmWrapper<uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>( + &context); + + // Test GEMV cases (internal interfaces) + test_gemv< + SingleThreadGemmWrapper<DefaultKernelForGEMV, uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>( + &context); + + test_gemv< + MultiThreadGemmWrapper<DefaultKernelForGEMV, uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>(&context); + + // Test GEMV cases (public interfaces) + test_gemv<PublicGemmWrapper<uint8_t, MapOrder::RowMajor, MapOrder::ColMajor, + MapOrder::ColMajor>>(&context); + + test_gemv<EightBitIntGemmWrapper<uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>( + &context); + + // Test specific kernels with various different formats, + // to exercises corner cases especially in the packing code. 
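+// As a reading aid for the instantiations below (an informal gloss; the
+// kernel format headers under internal/ are authoritative):
+// CellFormat<W, D> can be read as a cell W wide and D deep, and
+// KernelSideFormat<Cell, N> as a kernel side made of N such cells. So, for
+// instance,
+//   KernelFormat<KernelSideFormat<CellFormat<3, 4>, 2>,
+//                KernelSideFormat<CellFormat<5, 4>, 3>>
+// pairs a 2 * 3 = 6-wide LHS side with a 3 * 5 = 15-wide RHS side, both
+// walking the depth dimension 4 levels at a time. Note that the two sides of
+// every format below share the same cell depth.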
+ test_gemm_kernel< + ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<1, 1>, 1>, + KernelSideFormat<CellFormat<1, 1>, 1>>>>( + &context); + + test_gemm_kernel< + ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<3, 4>, 2>, + KernelSideFormat<CellFormat<5, 4>, 3>>>>( + &context); + + test_gemm_kernel< + ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<5, 3>, 3>, + KernelSideFormat<CellFormat<4, 3>, 2>>>>( + &context); + + test_gemm_kernel< + ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 3>, 3>, + KernelSideFormat<CellFormat<4, 3>, 1>>>>( + &context); + + test_gemm_kernel< + ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 3>, 3>, + KernelSideFormat<CellFormat<2, 3>, 2>>>>( + &context); + +// Test all our optimized kernels, even if they are not used +// at the moment, as they might be handy later and so it's +// useful to keep them functional for now. +#ifdef GEMMLOWP_NEON + test_gemm_kernel<gemmlowp::NEONKernel12x4Depth2>(&context); + test_gemm_kernel<gemmlowp::NEONKernel20x1Depth4>(&context); + test_gemm_kernel<gemmlowp::NEONKernel8x1Depth4>(&context); +#endif + +#ifdef GEMMLOWP_TEST_PROFILE + FinishProfiling(); +#endif + + std::cerr << "All tests passed." << std::endl; + + // We have been testing the eight_bit_int_gemm, so we should free its + // persistent + // resources now to avoid having leak-checking tools report leaks. + eight_bit_int_gemm::FreePersistentResources(); +} + +} // end namespace gemmlowp + +int main() { gemmlowp::test(); } diff --git a/test/test.h b/test/test.h new file mode 100644 index 0000000..f630674 --- /dev/null +++ b/test/test.h @@ -0,0 +1,118 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// test.h: shared testing helpers. + +#ifndef GEMMLOWP_TEST_TEST_H_ +#define GEMMLOWP_TEST_TEST_H_ + +#ifdef GEMMLOWP_TEST_PROFILE +#define GEMMLOWP_PROFILING +#include "profiling/profiler.h" +#endif + +#include <iostream> +#include <cstdlib> +#include <vector> + +#include "public/map.h" + +namespace gemmlowp { + +inline int Random() { + // Use ugly old rand() since this doesn't need to be high quality. + return rand(); +} + +inline void Check(bool b) { ReleaseBuildAssertion(b, "test failed"); } + +// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class, +// since it only maps existing data. In tests though, we need to +// create our own matrices. 
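+// Typical usage of this class, as exercised by test.cc (shown purely as an
+// illustration of the pattern; rows/depth/cols stand for whatever sizes a
+// test picks):
+//   Matrix<std::uint8_t, MapOrder::RowMajor> lhs(rows, depth);
+//   MakeRandom(&lhs, 8);    // fill with random 8-bit values
+//   Matrix<std::uint8_t, MapOrder::ColMajor> result(rows, cols);
+//   MakeZero(&result);
+//   ... Gemm(context, lhs.const_map(), rhs.const_map(), &result.map(), ...);
+// i.e. Matrix owns its storage (a std::vector) while map()/const_map()
+// expose the MatrixMap views that the gemmlowp entry points take.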
+template <typename tScalar, MapOrder tOrder> +class Matrix : public MatrixMap<tScalar, tOrder> { + public: + typedef MatrixMap<tScalar, tOrder> Map; + typedef MatrixMap<const tScalar, tOrder> ConstMap; + typedef typename Map::Scalar Scalar; + using Map::kOrder; + using Map::rows_; + using Map::cols_; + using Map::stride_; + using Map::data_; + + public: + Matrix() : Map(nullptr, 0, 0, 0) {} + + Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); } + + Matrix(const Matrix& other) { *this = other; } + + Matrix& operator=(const Matrix& other) { + Resize(other.rows_, other.cols_); + memcpy(data_, other.data_, size() * sizeof(Scalar)); + return *this; + } + + friend bool operator==(const Matrix& a, const Matrix& b) { + return a.rows_ == b.rows_ && a.cols_ == b.cols_ && + !memcmp(a.data_, b.data_, a.size()); + } + + void Resize(int rows, int cols) { + rows_ = rows; + cols_ = cols; + stride_ = kOrder == MapOrder::ColMajor ? rows : cols; + storage.resize(size()); + data_ = storage.data(); + } + + int size() const { return rows_ * cols_; } + + Map& map() { return *static_cast<Map*>(this); } + + ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); } + + protected: + std::vector<Scalar> storage; +}; + +template <typename MatrixType> +void MakeRandom(MatrixType* m, int bits) { + typedef typename MatrixType::Scalar Scalar; + const Scalar mask = (1 << bits) - 1; + for (int c = 0; c < m->cols(); c++) { + for (int r = 0; r < m->rows(); r++) { + (*m)(r, c) = Random() & mask; + } + } +} + +template <typename MatrixType> +void MakeConstant(MatrixType* m, typename MatrixType::Scalar val) { + for (int c = 0; c < m->cols(); c++) { + for (int r = 0; r < m->rows(); r++) { + (*m)(r, c) = val; + } + } +} + +template <typename MatrixType> +void MakeZero(MatrixType* m) { + MakeConstant(m, 0); +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_TEST_TEST_H_ diff --git a/test/test_allocator.cc b/test/test_allocator.cc new file mode 100644 index 0000000..9e76b79 --- /dev/null +++ b/test/test_allocator.cc @@ -0,0 +1,54 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test/test.h" +#include "internal/allocator.h" + +namespace gemmlowp { + +void test_allocator(Allocator* a, int max_array_size) { + const std::size_t int32_array_size = Random() % max_array_size; + auto handle_to_int32_array = a->Reserve<std::int32_t>(int32_array_size); + const std::size_t int8_array_size = Random() % max_array_size; + auto handle_to_int8_array = a->Reserve<std::int8_t>(int8_array_size); + a->Commit(); + std::int32_t* int32_array = + a->GetPointer<std::int32_t>(handle_to_int32_array); + std::int8_t* int8_array = a->GetPointer<std::int8_t>(handle_to_int8_array); + Check(int32_array == a->GetPointer<std::int32_t>(handle_to_int32_array)); + Check(int8_array == a->GetPointer<std::int8_t>(handle_to_int8_array)); + Check( + !(reinterpret_cast<std::uintptr_t>(int32_array) % Allocator::kAlignment)); + Check( + !(reinterpret_cast<std::uintptr_t>(int8_array) % Allocator::kAlignment)); + Check(reinterpret_cast<std::uintptr_t>(int8_array) >= + reinterpret_cast<std::uintptr_t>(int32_array + int32_array_size)); + memset(int32_array, 0, sizeof(*int32_array) * int32_array_size); + memset(int8_array, 0, sizeof(*int8_array) * int8_array_size); + a->Decommit(); +} + +void test_allocator() { + Allocator allocator; + + // Test allocating increasingly large sizes on the same allocator, + // starting with size 0. + for (int i = 1; i < 1000; i += 10) { + test_allocator(&allocator, i); + } +} + +} // namespace gemmlowp + +int main() { gemmlowp::test_allocator(); } diff --git a/test/test_blocking_counter.cc b/test/test_blocking_counter.cc new file mode 100644 index 0000000..1fa3c9d --- /dev/null +++ b/test/test_blocking_counter.cc @@ -0,0 +1,102 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test/test.h" + +#include <pthread.h> +#include <vector> + +#include "internal/multi_thread_gemm.h" + +namespace gemmlowp { + +class Thread { + public: + Thread(BlockingCounter* blocking_counter, int number_of_times_to_decrement) + : blocking_counter_(blocking_counter), + number_of_times_to_decrement_(number_of_times_to_decrement), + made_the_last_decrement_(false) { + pthread_create(&thread_, nullptr, ThreadFunc, this); + } + + ~Thread() { Join(); } + + bool Join() const { + pthread_join(thread_, nullptr); + return made_the_last_decrement_; + } + + private: + Thread(const Thread& other) = delete; + + void ThreadFunc() { + for (int i = 0; i < number_of_times_to_decrement_; i++) { + Check(!made_the_last_decrement_); + made_the_last_decrement_ = blocking_counter_->DecrementCount(); + } + } + + static void* ThreadFunc(void* ptr) { + static_cast<Thread*>(ptr)->ThreadFunc(); + return nullptr; + } + + BlockingCounter* const blocking_counter_; + const int number_of_times_to_decrement_; + pthread_t thread_; + bool made_the_last_decrement_; +}; + +void test_blocking_counter(BlockingCounter* blocking_counter, int num_threads, + int num_decrements_per_thread, + int num_decrements_to_wait_for) { + std::vector<Thread*> threads; + blocking_counter->Reset(num_decrements_to_wait_for); + for (int i = 0; i < num_threads; i++) { + threads.push_back(new Thread(blocking_counter, num_decrements_per_thread)); + } + blocking_counter->Wait(); + + int num_threads_that_made_the_last_decrement = 0; + for (int i = 0; i < num_threads; i++) { + if (threads[i]->Join()) { + num_threads_that_made_the_last_decrement++; + } + delete threads[i]; + } + Check(num_threads_that_made_the_last_decrement == 1); +} + +void test_blocking_counter() { + BlockingCounter* blocking_counter = new BlockingCounter; + + // repeating the entire test sequence ensures that we test + // non-monotonic changes. + for (int repeat = 1; repeat <= 2; repeat++) { + for (int num_threads = 1; num_threads <= 16; num_threads++) { + for (int num_decrements_per_thread = 1; + num_decrements_per_thread <= 64 * 1024; + num_decrements_per_thread *= 4) { + test_blocking_counter(blocking_counter, num_threads, + num_decrements_per_thread, + num_threads * num_decrements_per_thread); + } + } + } + delete blocking_counter; +} + +} // end namespace gemmlowp + +int main() { gemmlowp::test_blocking_counter(); } diff --git a/test/test_math_helpers.cc b/test/test_math_helpers.cc new file mode 100644 index 0000000..c1482f7 --- /dev/null +++ b/test/test_math_helpers.cc @@ -0,0 +1,134 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/test.h" + +#include <limits> + +#include "internal/common.h" + +namespace gemmlowp { + +// Our math helpers don't intend to be reliable all the way to the +// limit of representable range, wrt overflow. +// We don't care for 2G sized matrices. +// This test stops at half of the representable range. 
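+// Concretely (an illustrative instance, assuming the usual numeric_limits
+// values): for the std::int8_t instantiations in test_math_helpers() below,
+// std::numeric_limits<std::int8_t>::max() is 127, so ValueRangeCutoff()
+// returns 63. Keeping the random inputs below that bound keeps intermediate
+// expressions in the checks, such as x + y in the CeilQuotient checks or
+// 2 * x in the power-of-two checks, representable, so the tests exercise the
+// helpers rather than overflow.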
+template <typename Integer>
+Integer ValueRangeCutoff() {
+  return std::numeric_limits<Integer>::max() / 2;
+}
+
+int RandomNonnegativeFarAwayFromOverflow() { return Random() % (1 << 24); }
+
+template <int Modulus>
+void test_round_up_down(int x) {
+  Check(x >= RoundDown<Modulus>(x));
+  Check(x < RoundDown<Modulus>(x) + Modulus);
+  Check(RoundDown<Modulus>(x) % Modulus == 0);
+
+  Check(x <= RoundUp<Modulus>(x));
+  Check(x > RoundUp<Modulus>(x) - Modulus);
+  Check(RoundUp<Modulus>(x) % Modulus == 0);
+}
+
+template <int Modulus>
+void test_round_up_down() {
+  for (int i = 0; i < 100; i++) {
+    test_round_up_down<Modulus>(i);
+    const int N = ValueRangeCutoff<int>();
+    test_round_up_down<Modulus>(Random() % N);
+  }
+}
+
+template <typename Integer>
+void test_ceil_quotient(Integer x, Integer y) {
+  Check(CeilQuotient(x, y) * y >= x);
+  Check(CeilQuotient(x, y) * y < x + y);
+}
+
+template <typename Integer>
+void test_ceil_quotient() {
+  const Integer N = ValueRangeCutoff<Integer>();
+  const Integer K = std::min(N, Integer(100));
+  for (Integer x = 0; x < K; x++) {
+    for (Integer y = 1; y < K; y++) {
+      test_ceil_quotient(x, y);
+      test_ceil_quotient(x, Integer(1 + (Random() % (N - 1))));
+      test_ceil_quotient(Integer(Random() % N), y);
+      test_ceil_quotient(Integer(Random() % N),
+                         Integer(1 + (Random() % (N - 1))));
+    }
+  }
+}
+
+template <typename Integer>
+void test_round_up_to_next_power_of_two(Integer x) {
+  Check(RoundUpToPowerOfTwo(RoundUpToPowerOfTwo(x)) == RoundUpToPowerOfTwo(x));
+  Check(RoundUpToPowerOfTwo(x) >= x);
+  Check(x == 0 || RoundUpToPowerOfTwo(x) < 2 * x);
+  Check((RoundUpToPowerOfTwo(x) & (RoundUpToPowerOfTwo(x) - 1)) == 0);
+}
+
+template <typename Integer>
+void test_round_up_to_next_power_of_two() {
+  const Integer N = ValueRangeCutoff<Integer>();
+  const Integer K = std::min(N, Integer(100));
+  for (Integer x = 0; x < K; x++) {
+    test_round_up_to_next_power_of_two(x);
+    test_round_up_to_next_power_of_two(Random() % N);
+  }
+}
+
+void test_math_helpers() {
+  test_round_up_down<1>();
+  test_round_up_down<2>();
+  test_round_up_down<3>();
+  test_round_up_down<4>();
+  test_round_up_down<5>();
+  test_round_up_down<6>();
+  test_round_up_down<7>();
+  test_round_up_down<8>();
+  test_round_up_down<9>();
+  test_round_up_down<10>();
+  test_round_up_down<11>();
+  test_round_up_down<12>();
+  test_round_up_down<13>();
+  test_round_up_down<14>();
+  test_round_up_down<15>();
+  test_round_up_down<16>();
+
+  test_round_up_down<50>();
+  test_round_up_down<51>();
+
+  test_round_up_down<500>();
+  test_round_up_down<501>();
+
+  test_ceil_quotient<std::int8_t>();
+  test_ceil_quotient<std::uint8_t>();
+  test_ceil_quotient<std::int16_t>();
+  test_ceil_quotient<std::uint16_t>();
+  test_ceil_quotient<std::int32_t>();
+  test_ceil_quotient<std::uint32_t>();
+
+  test_round_up_to_next_power_of_two<std::int8_t>();
+  test_round_up_to_next_power_of_two<std::uint8_t>();
+  test_round_up_to_next_power_of_two<std::int16_t>();
+  test_round_up_to_next_power_of_two<std::uint16_t>();
+  test_round_up_to_next_power_of_two<std::int32_t>();
+  test_round_up_to_next_power_of_two<std::uint32_t>();
+}
+
+}  // end namespace gemmlowp
+
+int main() { gemmlowp::test_math_helpers(); }
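For orientation, here is a minimal sketch (not part of the imported sources) of how the public interface that test.cc's PublicGemmWrapper exercises might be called directly. The matrix sizes, fill values, and quantization parameters (offsets, multiplier, shift) are arbitrary placeholders.

#include <cstdint>
#include <vector>

#include "public/gemmlowp.h"

int main() {
  // Placeholder problem size (hypothetical values).
  const int rows = 64, depth = 128, cols = 32;

  // The caller owns the 8-bit storage; gemmlowp only maps it.
  std::vector<std::uint8_t> lhs_data(rows * depth, 1);
  std::vector<std::uint8_t> rhs_data(depth * cols, 1);
  std::vector<std::uint8_t> result_data(rows * cols, 0);

  // Row-major LHS: stride = number of columns (depth).
  // Column-major RHS / result: stride = number of rows.
  gemmlowp::MatrixMap<const std::uint8_t, gemmlowp::MapOrder::RowMajor> lhs(
      lhs_data.data(), rows, depth, depth);
  gemmlowp::MatrixMap<const std::uint8_t, gemmlowp::MapOrder::ColMajor> rhs(
      rhs_data.data(), depth, cols, depth);
  gemmlowp::MatrixMap<std::uint8_t, gemmlowp::MapOrder::ColMajor> result(
      result_data.data(), rows, cols, rows);

  gemmlowp::GemmContext context;

  // Placeholder quantization parameters, analogous to the ones the tests pass.
  gemmlowp::Gemm<std::uint8_t, gemmlowp::MapOrder::RowMajor,
                 gemmlowp::MapOrder::ColMajor, gemmlowp::MapOrder::ColMajor>(
      &context, lhs, rhs, &result,
      -128 /* lhs_offset */, -128 /* rhs_offset */, 32768 /* result_offset */,
      1 /* result_mult_int */, 8 /* result_shift */);

  return 0;
}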