author    Benoit Jacob <benoitjacob@google.com>  2015-06-25 15:50:59 -0400
committer Benoit Jacob <benoitjacob@google.com>  2015-06-25 15:53:04 -0400
commit    75c4ec0ba4dd86e4f763a54e01002ff29f1d57ae (patch)
tree      c8e35a06c7d959e6ad0a90b4929305055919e3f8
download  gemmlowp-75c4ec0ba4dd86e4f763a54e01002ff29f1d57ae.tar.gz
initial import
-rw-r--r--  .gitignore                                    7
-rw-r--r--  AUTHORS.txt                                   9
-rw-r--r--  CONTRIBUTING.txt                             33
-rw-r--r--  CONTRIBUTORS.txt                             13
-rw-r--r--  LICENSE.txt                                 202
-rw-r--r--  README.txt                                  126
-rw-r--r--  eight_bit_int_gemm/eight_bit_int_gemm.cc     89
-rw-r--r--  eight_bit_int_gemm/eight_bit_int_gemm.h      66
-rw-r--r--  internal/allocator.h                        211
-rw-r--r--  internal/block_params.h                     166
-rw-r--r--  internal/common.h                           117
-rw-r--r--  internal/compute.h                          103
-rw-r--r--  internal/kernel.h                           217
-rw-r--r--  internal/kernel_default.h                    41
-rw-r--r--  internal/kernel_neon.h                      570
-rw-r--r--  internal/kernel_reference.h                 119
-rw-r--r--  internal/multi_thread_gemm.h                496
-rw-r--r--  internal/pack.h                             350
-rw-r--r--  internal/pack_neon.h                        938
-rw-r--r--  internal/single_thread_gemm.h               103
-rw-r--r--  internal/unpack.h                           100
-rw-r--r--  internal/unpack_neon.h                      170
-rw-r--r--  profiling/instrumentation.h                 217
-rw-r--r--  profiling/profiler.h                        373
-rw-r--r--  public/gemmlowp.h                            52
-rw-r--r--  public/map.h                                 77
-rwxr-xr-x  scripts/prepare-device-for-benchmarking.sh  138
-rwxr-xr-x  scripts/restore-device-normal-state.sh       42
-rwxr-xr-x  scripts/test-android.sh                      83
-rw-r--r--  test/benchmark.cc                           210
-rw-r--r--  test/test.cc                                526
-rw-r--r--  test/test.h                                 118
-rw-r--r--  test/test_allocator.cc                       54
-rw-r--r--  test/test_blocking_counter.cc               102
-rw-r--r--  test/test_math_helpers.cc                   134
35 files changed, 6372 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..4ff62a0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.o
+*.ii
+*.s
+**/.DS_Store
+?
+??
+???
diff --git a/AUTHORS.txt b/AUTHORS.txt
new file mode 100644
index 0000000..13a49e0
--- /dev/null
+++ b/AUTHORS.txt
@@ -0,0 +1,9 @@
+# This is the official list of gemmlowp authors for copyright purposes.
+# This file is distinct from the CONTRIBUTORS.txt file.
+# See the latter for an explanation.
+
+# Names should be added to this file as:
+# Name or Organization <email address>
+# The email address is not required for organizations.
+
+Google Inc.
diff --git a/CONTRIBUTING.txt b/CONTRIBUTING.txt
new file mode 100644
index 0000000..e7244b3
--- /dev/null
+++ b/CONTRIBUTING.txt
@@ -0,0 +1,33 @@
+Want to contribute? Great! First, read this page (including the small print at the end).
+
+Before you contribute
+=====================
+
+Before we can use your code, you must sign the Google Individual Contributor
+License Agreement (CLA),
+
+ https://developers.google.com/open-source/cla/individual?csw=1
+
+which you can do online. The CLA is necessary mainly because you own the
+copyright to your changes, even after your contribution becomes part of our
+codebase, so we need your permission to use and distribute your code. We also
+need to be sure of various other things—for instance that you'll tell us if you
+know that your code infringes on other people's patents. You don't have to sign
+the CLA until after you've submitted your code for review and a member has
+approved it, but you must do it before we can put your code into our codebase.
+Before you start working on a larger contribution, you should get in touch with
+us first through the issue tracker with your idea so that we can help out and
+possibly guide you. Coordinating up front makes it much easier to avoid
+frustration later on.
+
+Code reviews
+============
+
+All submissions, including submissions by project members, require review. We
+use Github pull requests for this purpose.
+
+The small print
+===============
+
+Contributions made by corporations are covered by a different agreement than
+the one above, the Software Grant and Corporate Contributor License Agreement.
diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt
new file mode 100644
index 0000000..e9396d6
--- /dev/null
+++ b/CONTRIBUTORS.txt
@@ -0,0 +1,13 @@
+# People who have agreed to one of the CLAs and can contribute patches.
+# The AUTHORS.txt file lists the copyright holders; this file
+# lists people. For example, Google employees are listed here
+# but not in AUTHORS.txt, because Google holds the copyright.
+#
+# https://developers.google.com/open-source/cla/individual
+# https://developers.google.com/open-source/cla/corporate
+#
+# Names should be added to this file as:
+# Name <email address>
+
+Benoit Jacob <benoitjacob@google.com>
+Pete Warden <petewarden@google.com>
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/README.txt b/README.txt
new file mode 100644
index 0000000..29fcdc2
--- /dev/null
+++ b/README.txt
@@ -0,0 +1,126 @@
+gemmlowp: a small self-contained low-precision GEMM library
+===========================================================
+
+This is not a full linear algebra library, only a GEMM library: it only does
+general matrix multiplication ("GEMM").
+
+
+Disclaimer
+==========
+
+This is not an official Google product (experimental or otherwise), it is just
+code that happens to be owned by Google.
+
+
+Portability, target platforms/architectures
+===========================================
+
+Should be portable to any platform with some C++11 and POSIX support,
+with optional optimized code paths for specific architectures.
+
+Required:
+ C++11 (a small conservative subset of it)
+
+Required for some features:
+ * Some POSIX interfaces:
+ * pthreads (for multi-threaded operation and for profiling).
+ * sysconf (for multi-threaded operation to detect number of cores;
+ may be bypassed).
+
+Optional optimized code paths:
+At the moment, we only have optimized code paths for ARM NEON SIMD.
+Some are written in inline assembly, some are written in C++ using
+intrinsics. Both GCC and Clang are supported.
+
+
+Public interfaces
+=================
+
+1. gemmlowp public interface
+----------------------------
+
+ gemmlowp's main public interface is in the public/ subdirectory. The
+ header to include is
+ public/gemmlowp.h.
+ This is a headers-only library, so there is nothing to link to.
+
+2. EightBitIntGemm standard interface
+-------------------------------------
+
+ Additionally, the eight_bit_int_gemm/ subdirectory provides an
+ implementation of the standard EightBitIntGemm interface. The header
+ to include is
+ eight_bit_int_gemm/eight_bit_int_gemm.h
+  This is *NOT* a headers-only library; users need to link to
+ eight_bit_int_gemm/eight_bit_int_gemm.cc.
+
+
+Testing
+=======
+
+The test/ directory contains unit tests. The primary unit test is
+ test/test.cc
+Since it covers also the EightBitIntGemm interface, it needs to be
+linked against
+ eight_bit_int_gemm/eight_bit_int_gemm.cc
+
+The scripts/ directory contains a script to build and run a program
+on an Android device:
+ scripts/test-android.sh
+
+It expects the CXX environment variable to point to an Android toolchain's
+C++ compiler, and expects source files (and optionally, cflags) as
+command-line parameters. To build and run the above-mentioned main unit test,
+first set CXX e.g.:
+
+$ export CXX=/some/toolchains/arm-linux-androideabi-4.8/bin/arm-linux-androideabi-g++
+
+Then run:
+
+$ ./scripts/test-android.sh test/test.cc eight_bit_int_gemm/eight_bit_int_gemm.cc
+
+
+Profiling
+=========
+
+The profiling/ subdirectory offers a very simple non-interrupting sampling
+profiler that only requires pthreads (no signals).
+
+It relies on source code being instrumented with pseudo-stack labels.
+See profiling/instrumentation.h.
+A full example of using this profiler is given in profiling/profiler.h.
+
+
+Low-precision?
+==============
+
+"Low-precision" means that the input and output matrix entries are integers
+on at most 8 bits. The scalar type is uint8_t.
+
+This isn't the same as just doing plain matrix arithmetic over uint8_t,
+because that would overflow. To avoid overflow, we internally accumulate
+results on more than 8 bits, and at the end we keep only some significant
+8 bits. This relies on the caller providing suitable offset/multiplier/shift
+parameters, which effectively govern how we extract some significant 8 bits
+from our more-than-8-bit temporary accumulators. See the extra function
+parameters taken by Gemm() in public/gemmlowp.h or by EightBitIntGemm() in
+eight_bit_int_gemm/eight_bit_int_gemm.h.
+
+
+Performance goals
+=================
+
+Our performance goals differ from typical GEMM performance goals in the
+following ways:
+
+1. We care not only about speed, but also about minimizing power usage.
+ We specifically care about charge usage in mobile/embedded devices.
+ This implies that we care doubly about minimizing memory bandwidth usage:
+ we care about it, like any GEMM, because of the impact on speed, and we
+ also care about it because it is a key factor of power usage.
+
+2. Most GEMMs are optimized primarily for large dense matrix sizes (>= 1000).
+ We do care about large sizes, but we also care specifically about the
+ typically smaller matrix sizes encountered in various mobile applications.
+ This means that we have to optimize for all sizes, not just for large enough
+ sizes.
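
As a complement to the README above, here is a minimal, hedged sketch of calling the
main public interface it describes. The MatrixMap/GemmContext/Gemm usage is inferred
from how eight_bit_int_gemm.cc (the next file in this import) wraps it; public/gemmlowp.h
and public/map.h are part of this import but not reproduced in this excerpt, and the
matrix sizes and offset/multiplier/shift values below are illustrative placeholders only.

  #include <cstdint>
  #include <vector>

  #include "public/gemmlowp.h"

  void ExamplePublicGemm() {
    const int rows = 64, depth = 128, cols = 32;
    std::vector<std::uint8_t> lhs_storage(rows * depth);
    std::vector<std::uint8_t> rhs_storage(depth * cols);
    std::vector<std::uint8_t> result_storage(rows * cols);

    // Map raw buffers as matrices. The (data, rows, cols, stride) constructor
    // and MapOrder values are assumed from their use in eight_bit_int_gemm.cc.
    gemmlowp::MatrixMap<const std::uint8_t, gemmlowp::MapOrder::ColMajor> lhs(
        lhs_storage.data(), rows, depth, rows);
    gemmlowp::MatrixMap<const std::uint8_t, gemmlowp::MapOrder::ColMajor> rhs(
        rhs_storage.data(), depth, cols, depth);
    gemmlowp::MatrixMap<std::uint8_t, gemmlowp::MapOrder::ColMajor> result(
        result_storage.data(), rows, cols, rows);

    // Placeholder quantization parameters; suitable values depend on the
    // caller's quantization scheme, as explained in the README.
    const int lhs_offset = -128, rhs_offset = -128;
    const int result_offset = 127, result_mult_int = 1, result_shift = 7;

    gemmlowp::GemmContext context;
    gemmlowp::Gemm(&context, lhs, rhs, &result, lhs_offset, rhs_offset,
                   result_offset, result_mult_int, result_shift);
  }
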
diff --git a/eight_bit_int_gemm/eight_bit_int_gemm.cc b/eight_bit_int_gemm/eight_bit_int_gemm.cc
new file mode 100644
index 0000000..4c52dee
--- /dev/null
+++ b/eight_bit_int_gemm/eight_bit_int_gemm.cc
@@ -0,0 +1,89 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "eight_bit_int_gemm/eight_bit_int_gemm.h"
+
+// gemmlowp symbols should have hidden visibility.
+// Currently this is ensured in the build system by
+// passing -fvisibility-inlines-hidden. TODO: it would be
+// safer to hardcode it here with some #pragmas.
+#include "public/gemmlowp.h"
+
+namespace gemmlowp {
+
+namespace eight_bit_int_gemm {
+
+namespace {
+
+// To be used as template parameter for GlobalLock.
+// GlobalLock<EightBitIntGemmLockId> is the global lock
+// on EightBitIntGemm entry points, protecting
+// EightBitIntGemm's global state.
+struct EightBitIntGemmLockId;
+
+// Global state: consists of one global GemmContext instance.
+GemmContext* global_context;
+
+GemmContext* GetOrCreateGlobalContext() {
+ if (!global_context) {
+ global_context = new GemmContext;
+ }
+ return global_context;
+}
+
+void DestroyGlobalContext() {
+ delete global_context;
+ global_context = nullptr;
+}
+
+} // end anonymous namespace
+
+// Public interface entry points
+
+void EightBitIntGemm(int m, int n, int k, const std::uint8_t* a,
+ std::int32_t a_offset, int lda, const std::uint8_t* b,
+ std::int32_t b_offset, int ldb, std::uint8_t* c,
+ std::int32_t c_offset, std::int32_t c_mult_int,
+ std::int32_t c_shift, int ldc) {
+ AutoGlobalLock<EightBitIntGemmLockId> lock;
+ GemmContext* context = GetOrCreateGlobalContext();
+
+ MatrixMap<const std::uint8_t, MapOrder::RowMajor> lhs(b, n, k, ldb);
+ MatrixMap<const std::uint8_t, MapOrder::ColMajor> rhs(a, k, m, lda);
+ MatrixMap<std::uint8_t, MapOrder::ColMajor> result(c, n, m, ldc);
+
+ const int lhs_offset = b_offset;
+ const int rhs_offset = a_offset;
+ const int result_offset = c_offset;
+ const int result_mult_int = c_mult_int;
+ const int result_shift = c_shift;
+
+ Gemm(context, lhs, rhs, &result, lhs_offset, rhs_offset, result_offset,
+ result_mult_int, result_shift);
+}
+
+void SetMaxNumThreads(int n) {
+ AutoGlobalLock<EightBitIntGemmLockId> lock;
+ GemmContext* context = GetOrCreateGlobalContext();
+ context->set_max_num_threads(n);
+}
+
+void FreePersistentResources() {
+ AutoGlobalLock<EightBitIntGemmLockId> lock;
+ DestroyGlobalContext();
+}
+
+} // namespace eight_bit_int_gemm
+
+} // namespace gemmlowp
diff --git a/eight_bit_int_gemm/eight_bit_int_gemm.h b/eight_bit_int_gemm/eight_bit_int_gemm.h
new file mode 100644
index 0000000..1d2b1cb
--- /dev/null
+++ b/eight_bit_int_gemm/eight_bit_int_gemm.h
@@ -0,0 +1,66 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// eight_bit_int_gemm.h: exposes the standard EightBitIntGemm interface.
+
+#ifndef GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_
+#define GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_
+
+#include <cstdint>
+
+namespace gemmlowp {
+
+namespace eight_bit_int_gemm {
+
+// Concurrency / reentrancy notice
+// ===============================
+//
+// This eight_bit_int_gemm has global singleton persistent state.
+// A global lock ensures serialization of calls, so this library
+// is fully reentrant but only one calling thread gets to actually run
+// at a time, while other calling threads wait. So it is safe,
+// albeit potentially slow, to call the functions exposed here on
+// multiple threads concurrently.
+//
+// Users who prefer a stateless, singleton-less interface
+// should use the main gemmlowp interface (public/gemmlowp.h) instead.
+
+// The main entry point to compute a Gemm. This is the standard
+// EightBitIntGemm interface.
+void EightBitIntGemm(int m, int n, int k, const std::uint8_t *a,
+ std::int32_t a_offset, int lda, const std::uint8_t *b,
+ std::int32_t b_offset, int ldb, std::uint8_t *c,
+ std::int32_t c_offset, std::int32_t c_mult_int,
+ std::int32_t c_shift, int ldc);
+
+// Frees any persistent resources
+// (threads, thread pools, allocators, buffers, ...)
+// that gemmlowp might hold. This is called automatically
+// on thread exit, but one may also call it earlier, at any time.
+void FreePersistentResources();
+
+// Allows specifying the number of hardware threads, as a hint as to
+// how many worker threads to use for sufficiently large Gemm's.
+// We will never use more threads than that, but may use fewer,
+// for instance on Gemm's that are too small to benefit from all
+// available threads. The value 0 lets the implementation query
+// the system to determine the number of hardware threads.
+// Default value: 0.
+void SetMaxNumThreads(int n);
+
+} // namespace eight_bit_int_gemm
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_
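
A short lifecycle sketch for the EightBitIntGemm interface declared above, showing only
the thread-count hint and the resource cleanup; the EightBitIntGemm() call itself is
elided here.

  #include "eight_bit_int_gemm/eight_bit_int_gemm.h"

  void ExampleEightBitIntGemmLifecycle() {
    using namespace gemmlowp::eight_bit_int_gemm;

    // 0 (the default) lets the implementation query the number of
    // hardware threads.
    SetMaxNumThreads(0);

    // ... one or more EightBitIntGemm(...) calls go here. Each entry point
    // takes the global lock, so concurrent callers are serialized. ...

    // Optionally release threads/pools/buffers early rather than waiting
    // for the automatic cleanup mentioned above.
    FreePersistentResources();
  }
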
diff --git a/internal/allocator.h b/internal/allocator.h
new file mode 100644
index 0000000..30b2bd8
--- /dev/null
+++ b/internal/allocator.h
@@ -0,0 +1,211 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// allocator.h: a buffer allocator that allows avoiding most of the
+// malloc/free overhead, by:
+// 1. Requiring all N allocations to be reserved in advance, and
+//    then committed at once, turning N allocations into 1.
+// 2. Being persistent, the allocated storage is reused across commits,
+// and only reallocated as needed when the commit size gets larger.
+//
+// This is driven by Android-specific needs:
+// 1. On Android, the default (Bionic) allocator tends to aggressively
+// unmap pages, which means that malloc/free can be surprisingly expensive.
+// 2. On Android, stack allocations with alloca() can't be as large as on
+// desktop platforms.
+//
+// General usage:
+// 1. Reserve blocks by calling Reserve(), which returns a Handle.
+// 2. Call Commit() once.
+// 3. Now it is possible to get pointers to allocated buffers by calling
+// GetPointer().
+// 4. Call Decommit() once.
+// 5. The allocator is now reverted to its original state, except that
+// it retained its allocated storage, so the next Commit() will be faster.
+// The allocated storage is only freed when the Allocator object is
+// destroyed.
+
+#ifndef GEMMLOWP_INTERNAL_ALLOCATOR_H_
+#define GEMMLOWP_INTERNAL_ALLOCATOR_H_
+
+#include "internal/common.h"
+
+#if defined ANDROID || defined __ANDROID__
+#include <android/api-level.h>
+#if __ANDROID_API__ < 16
+#include <malloc.h>
+#define GEMMLOWP_USE_MEMALIGN
+#endif
+#endif
+
+namespace gemmlowp {
+
+enum class TypeId : std::uint8_t { Uint8, Int8, Uint16, Int16, Uint32, Int32 };
+
+template <typename T>
+struct GetTypeIdImpl {};
+
+template <typename T>
+inline TypeId GetTypeId() {
+ return GetTypeIdImpl<T>::Value;
+}
+
+template <typename T>
+struct GetTypeIdImpl<const T> : GetTypeIdImpl<T> {};
+
+#define GEMMLOWP_REGISTER_TYPEID(type_, id) \
+ template <> \
+ struct GetTypeIdImpl<type_> { \
+ static const TypeId Value = TypeId::id; \
+ };
+
+GEMMLOWP_REGISTER_TYPEID(std::uint8_t, Uint8)
+GEMMLOWP_REGISTER_TYPEID(std::int8_t, Int8)
+GEMMLOWP_REGISTER_TYPEID(std::uint16_t, Uint16)
+GEMMLOWP_REGISTER_TYPEID(std::int16_t, Int16)
+GEMMLOWP_REGISTER_TYPEID(std::uint32_t, Uint32)
+GEMMLOWP_REGISTER_TYPEID(std::int32_t, Int32)
+
+class Allocator {
+ public:
+ Allocator()
+ : committed_(false),
+ storage_size_(0),
+ storage_(nullptr),
+ reserved_blocks_(0),
+ reserved_bytes_(0),
+ generation_(0) {}
+
+ ~Allocator() {
+ assert(!committed_);
+ assert(!reserved_blocks_);
+ DeallocateStorage();
+ }
+
+ // Alignment of allocated blocks.
+ static const std::size_t kAlignment = kDefaultCacheLineSize;
+
+ // This is all we need so far, and since the usage pattern is fixed,
+ // there is no point in allowing more until we need to.
+ static const std::size_t kMaxBlocks = 5;
+
+ void Commit() {
+ assert(!committed_);
+
+ if (reserved_bytes_ > storage_size_) {
+ DeallocateStorage();
+ storage_size_ = RoundUpToPowerOfTwo(reserved_bytes_);
+#ifdef GEMMLOWP_USE_MEMALIGN
+ storage_ = memalign(kAlignment, storage_size_);
+#else
+ if (posix_memalign(&storage_, kAlignment, storage_size_)) {
+ storage_ = nullptr;
+ }
+#endif
+ }
+
+ ReleaseBuildAssertion(!storage_size_ || storage_, "allocation failure");
+ committed_ = true;
+ }
+
+ void Decommit() {
+ assert(committed_);
+ committed_ = false;
+ generation_++;
+
+ reserved_blocks_ = 0;
+ reserved_bytes_ = 0;
+ }
+
+ // See generation_
+ typedef std::size_t generation_t;
+
+ // A handle on a reserved block. The user obtains
+ // one by calling Reserve() and, after committing,
+ // passes it to GetPointer().
+ class Handle {
+ std::uint8_t index_;
+ generation_t generation_;
+ TypeId type_;
+
+ friend class Allocator;
+ };
+
+ // Reserves a block sized for n elements of type T, and
+ // returns a handle to it. Must be called before committing.
+ template <typename T>
+ Handle Reserve(std::size_t n) {
+ assert(!committed_ && "can't reserve blocks while committed");
+ assert(reserved_blocks_ < kMaxBlocks &&
+ "didn't expect to allocate this many blocks");
+ const std::size_t bytes = RoundUp<kAlignment>(n * sizeof(T));
+ const std::size_t offset = reserved_bytes_;
+ const std::size_t index = reserved_blocks_;
+
+ reserved_blocks_offsets_[index] = offset;
+ Handle h;
+ h.index_ = index;
+ h.generation_ = generation_;
+ h.type_ = GetTypeId<T>();
+
+ reserved_blocks_++;
+ reserved_bytes_ += bytes;
+
+ return h;
+ }
+
+ // Returns the pointer to the allocated buffer for the given handle.
+ // Must be called after committing.
+ template <typename T>
+ T* GetPointer(const Handle& h) const {
+ assert(committed_ && "can't get block pointers unless committed");
+ assert(h.index_ < reserved_blocks_ &&
+           "bad handle, points to nonexistent block");
+ assert(h.generation_ == generation_ &&
+ "handle from earlier generation, have decommitted since");
+ assert(h.type_ == GetTypeId<T>() && "type mismatch");
+ std::size_t offset = reserved_blocks_offsets_[h.index_];
+ std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(storage_) + offset;
+ return reinterpret_cast<T*>(addr);
+ }
+
+ private:
+ void DeallocateStorage() {
+ assert(!committed_);
+ free(storage_);
+ storage_size_ = 0;
+ }
+
+ // Set to true by Commit() and to false by Decommit(). Initially false.
+ bool committed_;
+
+ // The actually allocated storage size and buffer pointer.
+ std::size_t storage_size_;
+ mutable void* storage_;
+
+ // The number of blocks that have been reserved by Reserve().
+ std::size_t reserved_blocks_;
+ // The number of bytes that have been reserved by Reserve().
+ std::size_t reserved_bytes_;
+ // The offsets of reserved blocks into the storage buffer.
+ std::size_t reserved_blocks_offsets_[kMaxBlocks];
+
+ // The 'generation' is incremented on Decommit() and allows catching
+ // bad GetPointer() calls still referring to a previous commit.
+ generation_t generation_;
+};
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_ALLOCATOR_H_
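
A minimal usage sketch of the Reserve/Commit/GetPointer/Decommit cycle documented at the
top of allocator.h; the block sizes are placeholders.

  #include <cstdint>

  #include "internal/allocator.h"

  void ExampleAllocatorUsage() {
    gemmlowp::Allocator allocator;

    // 1. Reserve all blocks up front (well under kMaxBlocks).
    gemmlowp::Allocator::Handle packed_lhs =
        allocator.Reserve<std::uint8_t>(1024);
    gemmlowp::Allocator::Handle accumulators =
        allocator.Reserve<std::int32_t>(256);

    // 2. Commit once: a single allocation backs both reserved blocks.
    allocator.Commit();

    // 3. Pointers are only valid between Commit() and Decommit().
    std::uint8_t* lhs_buf = allocator.GetPointer<std::uint8_t>(packed_lhs);
    std::int32_t* acc_buf = allocator.GetPointer<std::int32_t>(accumulators);
    (void)lhs_buf;
    (void)acc_buf;

    // 4. Decommit. The storage is retained, so the next Commit() is cheap;
    //    it is only freed when the Allocator is destroyed.
    allocator.Decommit();
  }
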
diff --git a/internal/block_params.h b/internal/block_params.h
new file mode 100644
index 0000000..8545f94
--- /dev/null
+++ b/internal/block_params.h
@@ -0,0 +1,166 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// block_params.h: Logic to choose L1 and L2 block sizes
+// to optimize cache-friendliness.
+
+#ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
+#define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
+
+#include "internal/common.h"
+
+namespace gemmlowp {
+
+// A BlockParams instance contains a full description of all the block size
+// parameters to be used by a Gemm.
+// There are two nested levels of block subdivisions: first a subdivision
+// into large blocks that should fit in last-level cache (what we call L2 here)
+// and then another subdivision into smaller blocks that should fit in
+// L1 cache. There is then actually a third level of subdivision to fit
+// in registers, but we are not concerned with that here.
+struct BlockParams {
+ // L1 block parameters determine the size of small blocks that should
+ // fit in L1 cache.
+ int l1_rows;
+ int l1_cols;
+ int l1_depth;
+
+ // L2 block parameters determine the size of larger blocks that should
+ // fit in L2 cache.
+ int l2_rows;
+ int l2_cols;
+ int l2_depth;
+
+ template <typename KernelFormat>
+ void Init(int rows, int cols, int depth, int num_threads) {
+ FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads, &l2_rows,
+ &l2_cols, &l2_depth);
+ FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth, &l1_rows,
+ &l1_cols, &l1_depth);
+ }
+
+ template <typename KernelFormat>
+ static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads,
+ int* out_l2_rows, int* out_l2_cols,
+ int* out_l2_depth) {
+ int l2_rows = 0;
+ int l2_cols = 0;
+ int l2_depth = 0;
+ // No L2 blocking in the depth dimension at the moment.
+ // Too much loss of accuracy due to storing intermediate results in
+ // low precision.
+ // However, we still want to round l2_depth up to the next multiple
+ // of kernel depth, so as to avoid having to special-case unaligned depths.
+ l2_depth = RoundUp<KernelFormat::kDepth>(depth);
+
+ const int l2_bytes_to_use = kDefaultL2CacheSize;
+ const float l2_rhs_factor = kDefaultL2RhsFactor;
+
+ {
+ int max_cache_friendly_l2_cols = std::max(
+ 1, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth)));
+ int min_l2_cols_blocks = CeilQuotient(cols, max_cache_friendly_l2_cols);
+ l2_cols =
+ RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks));
+ }
+
+ {
+ int max_cache_friendly_l2_rows =
+ std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) /
+ (num_threads * (l2_depth + 4 * l2_cols)));
+ int min_l2_rows_blocks = CeilQuotient(rows, max_cache_friendly_l2_rows);
+ l2_rows =
+ RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l2_rows_blocks));
+ }
+
+ *out_l2_rows = l2_rows;
+ *out_l2_cols = l2_cols;
+ *out_l2_depth = l2_depth;
+ }
+
+ template <typename KernelFormat>
+ static void FindL1BlockSizes(int rows, int cols, int depth, int* out_l1_rows,
+ int* out_l1_cols, int* out_l1_depth) {
+ int l1_rows = 0;
+ int l1_cols = 0;
+ int l1_depth = 0;
+
+ // L2 block sizes should already be multiples of kernel block sizes.
+ assert(rows % KernelFormat::kRows == 0);
+ assert(cols % KernelFormat::kCols == 0);
+ assert(depth % KernelFormat::kDepth == 0);
+
+ // No L1 blocking in the columns dimension at the moment.
+ // Thought not to be needed. Similar to Eigen.
+ l1_cols = cols;
+
+ const int l1_bytes_to_use = kDefaultL1CacheSize;
+
+ {
+ int max_cache_friendly_l1_depth = std::max(
+ 1, (l1_bytes_to_use - 4 * KernelFormat::kRows * KernelFormat::kCols) /
+ (KernelFormat::kRows + KernelFormat::kCols));
+ int min_l1_depth_blocks =
+ CeilQuotient(depth, max_cache_friendly_l1_depth);
+ l1_depth = RoundUp<KernelFormat::kDepth>(
+ CeilQuotient(depth, min_l1_depth_blocks));
+ }
+
+ {
+ int max_cache_friendly_l1_rows =
+ std::max(1, l1_bytes_to_use / (l1_depth + 4 * l1_cols));
+ int min_l1_rows_blocks = CeilQuotient(rows, max_cache_friendly_l1_rows);
+ l1_rows =
+ RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks));
+ }
+
+ *out_l1_rows = l1_rows;
+ *out_l1_cols = l1_cols;
+ *out_l1_depth = l1_depth;
+ }
+};
+
+// A SideBlockParams instance contains only the block params relevant to
+// one side (LHS or RHS), expressed in terms of 'width' instead of
+// rows/columns. See the explanation in kernel.h: in the LHS, 'width' means
+// the number of rows, while in the RHS, 'width' means the number of columns.
+// That allows us to write generic code that applies to either LHS or RHS.
+struct SideBlockParams {
+ // L1 block parameters determine the size of small blocks that should
+ // fit in L1 cache.
+ int l1_width;
+ int l1_depth;
+
+ // L2 block parameters determine the size of larger blocks that should
+ // fit in L2 cache.
+ int l2_width;
+ int l2_depth;
+};
+
+enum class Side { Lhs, Rhs };
+
+inline void GetSideBlockParams(Side side, SideBlockParams* side_block_params,
+ const BlockParams& block_params) {
+ side_block_params->l1_width =
+ side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols;
+ side_block_params->l2_width =
+ side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols;
+
+ side_block_params->l1_depth = block_params.l1_depth;
+ side_block_params->l2_depth = block_params.l2_depth;
+}
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_
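
A hedged sketch of how the blocking logic above might be driven: Init() picks L2 and then
L1 block sizes for a given kernel format (KernelFormat and friends are defined in
internal/kernel.h, further down in this import), and GetSideBlockParams() re-expresses
them per side. The kernel format and matrix dimensions here are illustrative.

  #include "internal/block_params.h"
  #include "internal/kernel.h"

  void ExampleBlockParams() {
    // Same kernel format as the worked example in internal/kernel.h.
    typedef gemmlowp::KernelFormat<
        gemmlowp::KernelSideFormat<gemmlowp::CellFormat<3, 4>, 3>,
        gemmlowp::KernelSideFormat<gemmlowp::CellFormat<5, 4>, 2> > Format;

    gemmlowp::BlockParams block_params;
    block_params.Init<Format>(/*rows=*/200, /*cols=*/300, /*depth=*/400,
                              /*num_threads=*/1);

    // Re-express the row/column blocking as width blocking for the LHS.
    gemmlowp::SideBlockParams lhs_params;
    gemmlowp::GetSideBlockParams(gemmlowp::Side::Lhs, &lhs_params,
                                 block_params);
    // lhs_params.l2_width now equals block_params.l2_rows, and similarly
    // for the L1 sizes and the depths.
  }
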
diff --git a/internal/common.h b/internal/common.h
new file mode 100644
index 0000000..f7e643e
--- /dev/null
+++ b/internal/common.h
@@ -0,0 +1,117 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// common.h: contains stuff that's used throughout gemmlowp
+// and should always be available.
+
+#ifndef GEMMLOWP_INTERNAL_COMMON_H_
+#define GEMMLOWP_INTERNAL_COMMON_H_
+
+#include <pthread.h>
+#include <cstdint>
+#include <cassert>
+#include <cmath>
+#include <cstdlib>
+#include <algorithm>
+
+#include "profiling/instrumentation.h"
+
+#ifdef GEMMLOWP_PROFILING
+#include <set>
+#include <cstdio>
+#include <cstring>
+#endif
+
+// Detect NEON. It's important to check for both tokens.
+#if (defined __ARM_NEON) || (defined __ARM_NEON__)
+#define GEMMLOWP_NEON
+#endif
+
+namespace gemmlowp {
+
+// Standard cache line size. Useful to optimize alignment and
+// prefetches. Ideally we would query this at runtime, however
+// 64 byte cache lines are the vast majority, and even if it's
+// wrong on some device, it will be wrong by no more than a 2x factor,
+// which should be acceptable.
+const int kDefaultCacheLineSize = 64;
+
+// Default L1 and L2 data cache sizes. On x86, we should ideally query this at
+// runtime. On ARM, the instruction to query this is privileged and
+// Android kernels do not expose it to userspace. Fortunately, the majority
+// of ARM devices have roughly comparable values:
+// Nexus 5: L1 16k, L2 1M
+// Android One: L1 32k, L2 512k
+// The following values are equal to or somewhat lower than that, and were
+// found to perform well on both the Nexus 5 and Android One.
+// Of course, they would be too low for typical x86 CPUs where we would want
+// to set the L2 value to (L3 cache size / number of cores) at least.
+const int kDefaultL1CacheSize = 16 * 1024;
+const int kDefaultL2CacheSize = 256 * 1024;
+
+// The proportion of the cache that we intend to use for storing
+// RHS blocks. This should be between 0 and 1, and typically closer to 1,
+// as we typically want to use most of the L2 cache for storing a large
+// RHS block.
+const float kDefaultL2RhsFactor = 0.90f;
+
+// Hints the CPU to prefetch the cache line containing ptr.
+inline void Prefetch(const void* ptr) {
+#ifdef __GNUC__ // Clang and GCC define __GNUC__ and have __builtin_prefetch.
+ __builtin_prefetch(ptr);
+#else
+ (void)ptr;
+#endif
+}
+
+// Returns the runtime argument rounded down to the nearest multiple of
+// the fixed Modulus.
+template <int Modulus>
+int RoundDown(int i) {
+ return i - (i % Modulus);
+}
+
+// Returns the runtime argument rounded up to the nearest multiple of
+// the fixed Modulus.
+template <int Modulus>
+int RoundUp(int i) {
+ return RoundDown<Modulus>(i + Modulus - 1);
+}
+
+// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
+template <typename Integer>
+Integer CeilQuotient(Integer a, Integer b) {
+ return (a + b - 1) / b;
+}
+
+// Returns the argument rounded up to the nearest power of two.
+template <typename Integer>
+Integer RoundUpToPowerOfTwo(Integer n) {
+ Integer i = n - 1;
+ i |= i >> 1;
+ i |= i >> 2;
+ i |= i >> 4;
+ i |= i >> 8;
+ i |= i >> 16;
+ return i + 1;
+}
+
+template <int N>
+struct IsPowerOfTwo {
+ static const bool value = !(N & (N - 1));
+};
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_COMMON_H_
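
A few worked examples of the arithmetic helpers above, as a sketch (test/test_math_helpers.cc,
listed in this import, presumably covers them more thoroughly):

  #include <cassert>

  #include "internal/common.h"

  void ExampleMathHelpers() {
    assert(gemmlowp::RoundDown<8>(13) == 8);          // nearest lower multiple of 8
    assert(gemmlowp::RoundUp<8>(13) == 16);           // nearest higher multiple of 8
    assert(gemmlowp::RoundUp<8>(16) == 16);           // already a multiple: unchanged
    assert(gemmlowp::CeilQuotient(13, 4) == 4);       // ceil(13 / 4)
    assert(gemmlowp::RoundUpToPowerOfTwo(13) == 16);  // next power of two
    static_assert(gemmlowp::IsPowerOfTwo<64>::value, "64 is a power of two");
  }
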
diff --git a/internal/compute.h b/internal/compute.h
new file mode 100644
index 0000000..4b2f56a
--- /dev/null
+++ b/internal/compute.h
@@ -0,0 +1,103 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// compute.h: the central stage of the Gemm computation, operates
+// on already-packed LHS and RHS blocks and calls the Gemm kernel
+// to compute a block of the product.
+
+#ifndef GEMMLOWP_INTERNAL_COMPUTE_H_
+#define GEMMLOWP_INTERNAL_COMPUTE_H_
+
+#include "internal/kernel.h"
+#include "internal/block_params.h"
+#include "internal/pack.h"
+
+namespace gemmlowp {
+
+template <typename KernelFormat, typename PackedResult>
+class ComputeImpl {
+ typedef typename KernelFormat::Lhs KernelLhsFormat;
+ typedef typename KernelFormat::Rhs KernelRhsFormat;
+
+ const KernelBase& kernel_;
+ const BlockParams& block_params_;
+
+ PackedResult* const packed_result_;
+ const PackedSideBlock<KernelLhsFormat>& packed_lhs_;
+ const PackedSideBlock<KernelRhsFormat>& packed_rhs_;
+
+ public:
+ ComputeImpl(const KernelBase& _kernel, const BlockParams& _block_params,
+ PackedResult* _packed_result,
+ const PackedSideBlock<KernelLhsFormat>& _packed_lhs,
+ const PackedSideBlock<KernelRhsFormat>& _packed_rhs)
+ : kernel_(_kernel),
+ block_params_(_block_params),
+ packed_result_(_packed_result),
+ packed_lhs_(_packed_lhs),
+ packed_rhs_(_packed_rhs) {}
+
+ void Compute() {
+ for (int d = 0; d < block_params_.l2_depth; d += block_params_.l1_depth) {
+ int ds = std::min(block_params_.l1_depth, block_params_.l2_depth - d);
+
+ for (int r = 0; r < block_params_.l2_rows; r += block_params_.l1_rows) {
+ int rs = std::min(block_params_.l1_rows, block_params_.l2_rows - r);
+
+ ComputeL1(r, rs, 0, block_params_.l2_cols, d, ds);
+ }
+ }
+ }
+
+ private:
+ void ComputeRun(int start_row, int start_col, int start_depth, int depth) {
+ packed_lhs_.seek_run(start_row, start_depth);
+ packed_rhs_.seek_run(start_col, start_depth);
+ auto packed_result_block = packed_result_->Map().block(
+ start_row, start_col, KernelFormat::kRows, KernelFormat::kCols);
+ kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(),
+ packed_result_block.cols_stride(), packed_lhs_.current_data(),
+ packed_rhs_.current_data(), start_depth, depth);
+ }
+
+ void ComputeL1(int start_row, int rows, int start_col, int cols,
+ int start_depth, int depth) {
+ assert(rows % KernelFormat::kRows == 0);
+ assert(cols % KernelFormat::kCols == 0);
+ assert(depth % KernelFormat::kDepth == 0);
+
+ for (int c = 0; c < cols; c += KernelFormat::kCols) {
+ for (int r = 0; r < rows; r += KernelFormat::kRows) {
+ ComputeRun(start_row + r, start_col + c, start_depth, depth);
+ }
+ }
+ }
+};
+
+template <typename PackedResult, typename KernelLhsFormat,
+ typename KernelRhsFormat>
+void Compute(const KernelBase& kernel, const BlockParams& block_params,
+ PackedResult* packed_result,
+ const PackedSideBlock<KernelLhsFormat>& packed_lhs,
+ const PackedSideBlock<KernelRhsFormat>& packed_rhs) {
+ ScopedProfilingLabel label("compute");
+ ComputeImpl<KernelFormat<KernelLhsFormat, KernelRhsFormat>, PackedResult>
+ impl(kernel, block_params, packed_result, packed_lhs, packed_rhs);
+
+ impl.Compute();
+}
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_COMPUTE_H_
diff --git a/internal/kernel.h b/internal/kernel.h
new file mode 100644
index 0000000..5cb5e69
--- /dev/null
+++ b/internal/kernel.h
@@ -0,0 +1,217 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// kernel.h: general definitions for kernels.
+
+#ifndef GEMMLOWP_INTERNAL_KERNEL_H_
+#define GEMMLOWP_INTERNAL_KERNEL_H_
+
+#include "internal/common.h"
+
+namespace gemmlowp {
+
+// Explanation of general gemmlowp terminology
+// ===========================================
+//
+// We use the following abbreviations:
+// LHS = "left-hand side"
+// RHS = "right-hand side"
+// Sometimes when referring to either LHS or RHS, we just say a "Side".
+//
+// In a matrix product of a MxK matrix times a KxN matrix,
+// we call K the 'depth'. Note that M is the number of rows
+// of the result (and of the LHS), and N is the number of columns
+// of the result (and of the RHS).
+//
+// In each of the LHS and RHS matrices, we call 'width' the
+// other dimension, besides the depth. So in the LHS, 'width'
+// is the number of rows, while in the RHS, 'width' is the number
+// of columns.
+//
+// So in the LHS MxK matrix, the depth is K and the width is M.
+// And in the RHS KxN matrix, the depth is K and the width is N.
+//
+// This is illustrated in this picture:
+//
+// RHS width
+// <----------------->
+// +-----------------+ ^
+// | RHS | | Depth
+// +-----------------+ v
+// ^ +--+ +-----------------+
+// | |L | | |
+// LHS width | |H | | Result |
+// | |S | | |
+// v +--+ +-----------------+
+// <-->
+// Depth
+
+// Explanation of gemmlowp kernel formats and "cells"
+// ==================================================
+//
+// Kernels operate on small LHS and RHS blocks that fit in registers.
+// These blocks are stored contiguously in memory, but not always
+// in a traditional column-major or row-major order; instead,
+// they consist of a number of sub-blocks, which we call "cells",
+// that are stored in column-major or row-major order. However,
+// what really matters to us is not so much rows vs columns, but
+// rather width vs depth. So we refer to "width-major" and "depth-major"
+// storage orders. In the LHS, width-major means row-major,
+// while in the RHS, width-major means column-major.
+// There is also a third possibility, "diagonal order",
+// which is unused at the moment.
+//
+// We aim to treat both sides, LHS and RHS, on an equal footing,
+// so we call them both 'sides'. A KernelFormat thus is just a pair
+// of KernelSideFormat's, one for LHS and one for RHS; each KernelSideFormat
+// contains a CellFormat and a number of cells; cells are only ever
+// stacked in the width dimension, which means stacked vertically in the
+// LHS and stacked horizontally in the RHS.
+//
+// Example
+// =======
+//
+// Let's work out the data layout expected by a kernel having the
+// following format (the struct names here are defined below in this file):
+//
+// KernelFormat<
+// KernelSideFormat<CellFormat<3, 4>, 3>,
+// KernelSideFormat<CellFormat<5, 4>, 2>
+// >
+//
+// The LHS format, KernelSideFormat<CellFormat<3, 4>, 3>, means:
+// 3 cells, each cell having dimensions (width=3, depth=4), laid out in
+// DepthMajor order (the default value, see CellFormat). In the LHS,
+// DepthMajor means column-major, so the LHS cells are of size 3x4 in
+// column-major order, so the LHS layout is:
+//
+// 0 3 6 9
+//  1  4  7 10
+//  2  5  8 11
+// 12 15 18 21
+// 13 16 19 22
+// 14 17 20 23
+// 24 27 30 33
+// 25 28 31 34
+// 26 29 32 35
+//
+// The RHS format, KernelSideFormat<CellFormat<5, 4>, 2>, means:
+// 2 cells each having dimensions (width=5, depth=4), laid out in
+// DepthMajor order (the default value, see CellFormat). In the RHS,
+// DepthMajor means row-major, so the RHS cells are of size 4x5 in
+// row-major order, so the RHS layout is:
+//
+// 0 1 2 3 4 20 21 22 23 24
+// 5 6 7 8 9 25 26 27 28 29
+// 10 11 12 13 14 30 31 32 33 34
+// 15 16 17 18 19 35 36 37 38 39
+
+// CellOrder enumerates the possible storage orders (=layouts) for
+// a cell (see explanation above).
+enum class CellOrder { DepthMajor, WidthMajor, Diagonal };
+
+// CellFormat describes how data is laid
+// out in a cell. That is, a CellOrder together with actual dimensions.
+template <int tWidth, int tDepth, CellOrder tOrder = CellOrder::DepthMajor>
+struct CellFormat {
+ static const int kWidth = tWidth;
+ static const int kDepth = tDepth;
+ static const CellOrder kOrder = tOrder;
+
+ static const int kSize = kWidth * kDepth;
+};
+
+// KernelSideFormat describes how data is laid out in a kernel side
+// (i.e. LHS or RHS). That is, a CellFormat together with a number of
+// cells. These cells are always stacked in the Width dimension.
+// For example, in the LHS case, the Width dimension is the rows dimension,
+// so we're saying that in the LHS, cells are stacked vertically.
+// We never stack cells in the Depth dimension.
+template <typename tCellFormat, int tCells>
+struct KernelSideFormat {
+ typedef tCellFormat Cell;
+ static const int kCells = tCells;
+ static const int kWidth = kCells * Cell::kWidth;
+ static const int kDepth = Cell::kDepth;
+};
+
+// KernelFormat describes fully the input data layout that a kernel expects.
+// It consists of two KernelSideFormat's, one for LHS and one for RHS.
+template <typename tLhs, typename tRhs>
+struct KernelFormat {
+ typedef tLhs Lhs;
+ typedef tRhs Rhs;
+
+ static_assert(Lhs::Cell::kDepth == Rhs::Cell::kDepth, "");
+ static const int kDepth = Lhs::Cell::kDepth;
+ static const int kRows = Lhs::Cell::kWidth * Lhs::kCells;
+ static const int kCols = Rhs::Cell::kWidth * Rhs::kCells;
+};
+
+inline const char* CellOrderName(CellOrder o) {
+ switch (o) {
+ case CellOrder::DepthMajor:
+ return "DepthMajor";
+ case CellOrder::WidthMajor:
+ return "WidthMajor";
+ case CellOrder::Diagonal:
+ return "Diagonal";
+ default:
+ assert(false);
+ return nullptr;
+ }
+}
+
+// Returns the offset into a cell, at which a given coefficient is stored.
+template <typename CellFormat>
+inline int OffsetIntoCell(int w, int d) {
+ switch (CellFormat::kOrder) {
+ case CellOrder::DepthMajor:
+ return w + d * CellFormat::kWidth;
+ case CellOrder::WidthMajor:
+ return d + w * CellFormat::kDepth;
+ case CellOrder::Diagonal:
+ assert(CellFormat::kWidth == CellFormat::kDepth);
+ static const int cell_width = CellFormat::kWidth;
+ return w + ((d + cell_width - w) % cell_width) * cell_width;
+ default:
+ assert(false);
+ return 0;
+ }
+}
+
+// KernelBase is the virtual base class below all kernels.
+// The idea is that we don't need to templatize all our code on the exact
+// kernel type; we only need to templatize on kernel format. Kernels
+// sharing the same format can thus share the same packing/unpacking code.
+struct KernelBase {
+ virtual const char* Name() const = 0;
+
+ // This is the kernel implementation. We use the word 'run' consistently
+ // throughout gemmlowp to mean an inner loop, the implementation of which
+ // is to be provided by a separate optimized function.
+ virtual void Run(std::int32_t* dst_ptr, int dst_row_stride,
+ int dst_col_stride, const std::uint8_t* lhs_ptr,
+ const std::uint8_t* rhs_ptr, int start_depth,
+ int run_depth) const = 0;
+
+ static const int kLhsBitDepth = 8;
+ static const int kRhsBitDepth = 8;
+
+ virtual ~KernelBase() {}
+};
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_KERNEL_H_
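
To make the worked example in the comments above concrete, here is a small sketch
instantiating that same format and checking the derived constants and one cell offset:

  #include <cassert>

  #include "internal/kernel.h"

  void ExampleKernelFormat() {
    typedef gemmlowp::CellFormat<3, 4> LhsCell;  // width=3, depth=4, DepthMajor
    typedef gemmlowp::CellFormat<5, 4> RhsCell;  // width=5, depth=4, DepthMajor
    typedef gemmlowp::KernelFormat<gemmlowp::KernelSideFormat<LhsCell, 3>,
                                   gemmlowp::KernelSideFormat<RhsCell, 2> >
        Format;

    static_assert(Format::kRows == 9, "3 LHS cells of width 3");
    static_assert(Format::kCols == 10, "2 RHS cells of width 5");
    static_assert(Format::kDepth == 4, "shared cell depth");

    // In a DepthMajor LHS cell, entry (w=1, d=2) is stored at offset
    // 1 + 2*3 = 7, matching the LHS layout diagram above.
    assert(gemmlowp::OffsetIntoCell<LhsCell>(1, 2) == 7);
  }
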
diff --git a/internal/kernel_default.h b/internal/kernel_default.h
new file mode 100644
index 0000000..3e7cd5d
--- /dev/null
+++ b/internal/kernel_default.h
@@ -0,0 +1,41 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// kernel_default.h: Chooses default GEMM and GEMV kernels for the
+// host platform.
+
+#ifndef GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_
+#define GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_
+
+#include "internal/common.h"
+
+#ifdef GEMMLOWP_NEON
+#include "internal/kernel_neon.h"
+namespace gemmlowp {
+typedef NEONKernel12x4Depth2 DefaultKernelForGEMM;
+typedef NEONKernel8x1Depth4 DefaultKernelForGEMV;
+}
+#else
+#include "internal/kernel_reference.h"
+namespace gemmlowp {
+typedef ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 4>, 2>,
+ KernelSideFormat<CellFormat<4, 4>, 2> > >
+ DefaultKernelForGEMM;
+typedef ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 4>, 2>,
+ KernelSideFormat<CellFormat<1, 4>, 1> > >
+ DefaultKernelForGEMV;
+}
+#endif
+
+#endif // GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_
diff --git a/internal/kernel_neon.h b/internal/kernel_neon.h
new file mode 100644
index 0000000..2177fc2
--- /dev/null
+++ b/internal/kernel_neon.h
@@ -0,0 +1,570 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// kernel_neon.h: a collection of NEON optimized kernels.
+// See kernel_default.h for which one(s) are actually used by default.
+// Others are mere experiments; they are still covered by tests
+// in case they might be useful some day.
+
+#ifndef GEMMLOWP_INTERNAL_KERNEL_NEON_H_
+#define GEMMLOWP_INTERNAL_KERNEL_NEON_H_
+
+#include "internal/kernel.h"
+
+#include <cassert>
+
+namespace gemmlowp {
+
+// Our main GEMM kernel.
+struct NEONKernel12x4Depth2 : KernelBase {
+ typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>,
+ KernelSideFormat<CellFormat<4, 2>, 1> > Format;
+
+ const char* Name() const override { return "NEON, 12x4, straight, depth 2"; }
+
+ // TODO(benoitjacob): reorder function arguments so dst comes last
+ void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride,
+ const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr,
+ int start_depth, int run_depth) const override {
+ ScopedProfilingLabel label("optimized kernel");
+
+ assert(dst_row_stride == 1);
+
+ asm volatile(
+ // Clear accumulator registers (see layout below)
+ "vmov.s32 q4, #0\n"
+ "vmov.s32 q8, q4\n"
+ "vmov.s32 q12, q4\n"
+ "vmov.s32 q5, q4\n"
+ "vmov.s32 q9, q4\n"
+ "vmov.s32 q13, q4\n"
+ "vmov.s32 q6, q4\n"
+ "vmov.s32 q10, q4\n"
+ "vmov.s32 q14, q4\n"
+ "vmov.s32 q7, q4\n"
+ "vmov.s32 q11, q4\n"
+ "vmov.s32 q15, q4\n"
+
+ /* Main loop */
+
+ "loop_NEONKernel12x4Depth2_%=:\n"
+
+ // Overview of register layout:
+ //
+ // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0).
+ // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7
+ // (q1--q3).
+ // A 12x4 block of accumulators is stored in 32bit in q4--q15.
+ //
+ // +-----+-----+-----+-----+
+ // |d0[0]|d0[1]|d0[2]|d0[3]|
+ // Rhs +-----+-----+-----+-----+
+ // |d1[0]|d1[1]|d1[2]|d1[3]|
+ // +-----+-----+-----+-----+
+ //
+ // | | | | |
+ //
+ // Lhs | | | | |
+ //
+ // +--+--+ - - - - +-----+-----+-----+-----+
+ // |d2|d3| | q4 | q5 | q6 | q7 |
+ // |d2|d3| | q4 | q5 | q6 | q7 |
+ // |d2|d3| | q4 | q5 | q6 | q7 |
+ // |d2|d3| | q4 | q5 | q6 | q7 |
+ // +--+--+ - - - - +-----+-----+-----+-----+
+ // |d4|d5| | q8 | q9 | q10 | q11 |
+ // |d4|d5| | q8 | q9 | q10 | q11 |
+ // |d4|d5| | q8 | q9 | q10 | q11 |
+ // |d4|d5| | q8 | q9 | q10 | q11 |
+ // +--+--+ - - - - +-----+-----+-----+-----+
+ // |d6|d7| | q12 | q13 | q14 | q15 |
+ // |d6|d7| | q12 | q13 | q14 | q15 |
+ // |d6|d7| | q12 | q13 | q14 | q15 |
+ // |d6|d7| | q12 | q13 | q14 | q15 |
+ // +--+--+ - - - - +-----+-----+-----+-----+
+ //
+ // Accumulator
+
+ // Load 1 Rhs cell of size 2x4
+ "vld1.8 {d0}, [%[rhs_ptr]:64]!\n"
+
+ // Load 3 Lhs cells of size 4x2 each
+ "vld1.8 {d2}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d4}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d6}, [%[lhs_ptr]:64]!\n"
+
+ // Expand Lhs/Rhs cells to 16 bit.
+ "vmovl.u8 q0, d0\n"
+ "vmovl.u8 q1, d2\n"
+ "vmovl.u8 q2, d4\n"
+ "vmovl.u8 q3, d6\n"
+
+ // Multiply-accumulate, level of depth 0
+ "vmlal.u16 q4, d2, d0[0]\n"
+ "vmlal.u16 q5, d2, d0[1]\n"
+ "vmlal.u16 q6, d2, d0[2]\n"
+ "vmlal.u16 q7, d2, d0[3]\n"
+ "vmlal.u16 q8, d4, d0[0]\n"
+ "vmlal.u16 q9, d4, d0[1]\n"
+ "vmlal.u16 q10, d4, d0[2]\n"
+ "vmlal.u16 q11, d4, d0[3]\n"
+ "vmlal.u16 q12, d6, d0[0]\n"
+ "vmlal.u16 q13, d6, d0[1]\n"
+ "vmlal.u16 q14, d6, d0[2]\n"
+ "vmlal.u16 q15, d6, d0[3]\n"
+
+ // Multiply-accumulate, level of depth 1
+ "vmlal.u16 q4, d3, d1[0]\n"
+ "vmlal.u16 q5, d3, d1[1]\n"
+ "vmlal.u16 q6, d3, d1[2]\n"
+ "vmlal.u16 q7, d3, d1[3]\n"
+ "vmlal.u16 q8, d5, d1[0]\n"
+ "vmlal.u16 q9, d5, d1[1]\n"
+ "vmlal.u16 q10, d5, d1[2]\n"
+ "vmlal.u16 q11, d5, d1[3]\n"
+ "vmlal.u16 q12, d7, d1[0]\n"
+ "vmlal.u16 q13, d7, d1[1]\n"
+ "vmlal.u16 q14, d7, d1[2]\n"
+ "vmlal.u16 q15, d7, d1[3]\n"
+
+ // Loop. Decrement loop index (depth) by 2, since we just handled 2
+ // levels of depth (Kernel::kDepth=2).
+ "subs %[run_depth], #2\n"
+ "bne loop_NEONKernel12x4Depth2_%=\n"
+
+ /* end of main loop */
+
+ /* Accumulate our local accumulator registers into the destination block
+ */
+
+ // Compute stride between consecutive columns, in bytes
+ "mov r0, #4\n" // multiply by 4 = sizeof(int32)
+ "mul %[dst_col_stride], r0\n"
+
+ // If start_depth == 0, then there is no preexisting accumulator
+ // to accumulate, so we can simply store our result.
+ "cmp %[start_depth], #0\n"
+ "beq store_result_NEONKernel12x4Depth2_%=\n"
+
+ "mov r0, %[dst_ptr]\n"
+
+ // Load a column
+ "mov r1, r0\n"
+ "vld1.32 {d0, d1}, [r1]!\n"
+ "vld1.32 {d2, d3}, [r1]!\n"
+ "vld1.32 {d4, d5}, [r1]!\n"
+ // Accumulate a column
+ "vadd.s32 q4, q4, q0\n"
+ "vadd.s32 q8, q8, q1\n"
+ "vadd.s32 q12, q12, q2\n"
+
+ "add r0, %[dst_col_stride]\n"
+ // Load a column
+ "mov r1, r0\n"
+ "vld1.32 {d0, d1}, [r1]!\n"
+ "vld1.32 {d2, d3}, [r1]!\n"
+ "vld1.32 {d4, d5}, [r1]!\n"
+ // Accumulate a column
+ "vadd.s32 q5, q5, q0\n"
+ "vadd.s32 q9, q9, q1\n"
+ "vadd.s32 q13, q13, q2\n"
+
+ "add r0, %[dst_col_stride]\n"
+ // Load a column
+ "mov r1, r0\n"
+ "vld1.32 {d0, d1}, [r1]!\n"
+ "vld1.32 {d2, d3}, [r1]!\n"
+ "vld1.32 {d4, d5}, [r1]!\n"
+ // Accumulate a column
+ "vadd.s32 q6, q6, q0\n"
+ "vadd.s32 q10, q10, q1\n"
+ "vadd.s32 q14, q14, q2\n"
+
+ "add r0, %[dst_col_stride]\n"
+ // Load a column
+ "mov r1, r0\n"
+ "vld1.32 {d0, d1}, [r1]!\n"
+ "vld1.32 {d2, d3}, [r1]!\n"
+ "vld1.32 {d4, d5}, [r1]!\n"
+ // Accumulate a column
+ "vadd.s32 q7, q7, q0\n"
+ "vadd.s32 q11, q11, q1\n"
+ "vadd.s32 q15, q15, q2\n"
+
+ "store_result_NEONKernel12x4Depth2_%=:\n"
+
+ "mov r0, %[dst_ptr]\n"
+ // Store a column
+ "mov r1, r0\n"
+ "vst1.32 {d8, d9}, [r1]!\n"
+ "vst1.32 {d16, d17}, [r1]!\n"
+ "vst1.32 {d24, d25}, [r1]!\n"
+ // Store a column
+ "add r0, %[dst_col_stride]\n"
+ "mov r1, r0\n"
+ "vst1.32 {d10, d11}, [r1]!\n"
+ "vst1.32 {d18, d19}, [r1]!\n"
+ "vst1.32 {d26, d27}, [r1]!\n"
+ // Store a column
+ "add r0, %[dst_col_stride]\n"
+ "mov r1, r0\n"
+ "vst1.32 {d12, d13}, [r1]!\n"
+ "vst1.32 {d20, d21}, [r1]!\n"
+ "vst1.32 {d28, d29}, [r1]!\n"
+ // Store a column
+ "add r0, %[dst_col_stride]\n"
+ "mov r1, r0\n"
+ "vst1.32 {d14, d15}, [r1]!\n"
+ "vst1.32 {d22, d23}, [r1]!\n"
+ "vst1.32 {d30, d31}, [r1]!\n"
+ : // outputs
+ [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
+ [dst_ptr] "+r"(dst_ptr),
+ [run_depth] "+r"(run_depth)
+ : // inputs
+ [start_depth] "r"(start_depth),
+ [dst_col_stride] "r"(dst_col_stride)
+ : // clobbers
+ "cc", "memory", "r0", "r1",
+        // note: quad registers are reportedly unsupported in the clobber
+        // list, so we list the d registers individually instead.
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
+ "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
+ "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
+ "d31");
+ }
+};
+
+// Our main GEMV kernel.
+struct NEONKernel8x1Depth4 : KernelBase {
+ typedef KernelFormat<KernelSideFormat<CellFormat<8, 4>, 1>,
+ KernelSideFormat<CellFormat<1, 4>, 1> > Format;
+
+ const char* Name() const override { return "NEON, 8x1, straight, depth 4"; }
+
+ void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride,
+ const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr,
+ int start_depth, int run_depth) const override {
+ ScopedProfilingLabel label("optimized kernel");
+
+ assert(dst_row_stride == 1);
+
+ asm volatile(
+ // Clear accumulator registers (see layout below)
+ "vmov.s32 q11, #0\n"
+ "vmov.s32 q12, q11\n"
+
+ /* Main loop */
+
+ "loop_NEONKernel8x1Depth4_%=:\n"
+
+ // Overview of register layout:
+ //
+ // A 4x1 cell of Rhs is stored in 16bit in d0.
+ // A 8x4 cell Lhs is stored in 16bit in d2--d9 (q1--q4).
+ // A block of accumulators of size 8x1 is stored in 32bit in q11--q12
+ //
+ // +-----+
+ // |d0[0]|
+ // +-----+
+ // |d0[1]|
+ // +-----+ Rhs
+ // |d0[2]|
+ // +-----+
+ // |d0[3]|
+ // +-----+
+ // | |
+ //
+ // Lhs | |
+ //
+ // +--+--+--+--+ - - - - +-----+
+ // |d2|d4|d6|d8| | q11 |
+ // |d2|d4|d6|d8| | q11 |
+ // |d2|d4|d6|d8| | q11 |
+ // |d2|d4|d6|d8| | q11 |
+ // +--+--+--+--+ - - - - +-----+ Accumulator
+ // |d3|d5|d7|d9| | q12 |
+ // |d3|d5|d7|d9| | q12 |
+ // |d3|d5|d7|d9| | q12 |
+ // |d3|d5|d7|d9| | q12 |
+ // +--+--+--+--+ - - - - +-----+
+
+ // Load 1 Rhs cell of size 4x1.
+ "vldr.32 d0, [%[rhs_ptr]]\n"
+ "add %[rhs_ptr], #4\n"
+
+ // Load 1 Lhs cell of size 8x4. Each vld1 instruction loads 1 col.
+ "vld1.8 {d2}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d4}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d6}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d8}, [%[lhs_ptr]:64]!\n"
+
+ // Expand Lhs/Rhs cells to 16 bit.
+ "vmovl.u8 q0, d0\n" // d1 is unused.
+ "vmovl.u8 q1, d2\n"
+ "vmovl.u8 q2, d4\n"
+ "vmovl.u8 q3, d6\n"
+ "vmovl.u8 q4, d8\n"
+
+ // Multiply-accumulate, level of depth 0
+ "vmlal.u16 q11, d2, d0[0]\n"
+ "vmlal.u16 q12, d3, d0[0]\n"
+
+ // Multiply-accumulate, level of depth 1
+ "vmlal.u16 q11, d4, d0[1]\n"
+ "vmlal.u16 q12, d5, d0[1]\n"
+
+ // Multiply-accumulate, level of depth 2
+ "vmlal.u16 q11, d6, d0[2]\n"
+ "vmlal.u16 q12, d7, d0[2]\n"
+
+ // Multiply-accumulate, level of depth 3
+ "vmlal.u16 q11, d8, d0[3]\n"
+ "vmlal.u16 q12, d9, d0[3]\n"
+
+        // Loop. Decrement loop index (depth) by 4, since we just handled
+        // 4 levels of depth (Kernel::kDepth=4).
+ "subs %[run_depth], #4\n"
+ "bne loop_NEONKernel8x1Depth4_%=\n"
+
+ /* end of main loop */
+
+ /* Accumulate our local accumulator registers into the destination block
+ */
+
+ "cmp %[start_depth], #0\n"
+ "beq store_result_NEONKernel8x1Depth4_%=\n"
+
+ "mov r0, %[dst_ptr]\n"
+ // Load a column
+ "vld1.32 {d0, d1}, [r0]!\n"
+ "vld1.32 {d2, d3}, [r0]!\n"
+
+ // Accumulate a column
+ "vadd.s32 q11, q11, q0\n"
+ "vadd.s32 q12, q12, q1\n"
+
+ "store_result_NEONKernel8x1Depth4_%=:\n"
+
+ "mov r0, %[dst_ptr]\n"
+ "vst1.32 {d22, d23}, [r0]!\n"
+ "vst1.32 {d24, d25}, [r0]!\n"
+ : // outputs
+ [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
+ [run_depth] "+r"(run_depth)
+ : // inputs
+ [dst_col_stride] "r"(dst_col_stride), [start_depth] "r"(start_depth),
+ [dst_ptr] "r"(dst_ptr)
+ : // clobbers
+ "cc", "memory", "r0",
+        // note: quad registers are reportedly unsupported in the clobber
+        // list, so we list the d registers individually instead.
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
+ "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
+ "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
+ "d31");
+ }
+};
+
+// Another GEMV kernel, which performs best when measured on its own;
+// unfortunately, overall GEMV performance is dominated by LHS packing, and we
+// haven't yet succeeded in writing efficient packing code for this very wide
+// (width 20) format.
+struct NEONKernel20x1Depth4 : KernelBase {
+ typedef KernelFormat<KernelSideFormat<CellFormat<4, 4>, 5>,
+ KernelSideFormat<CellFormat<1, 4>, 1> > Format;
+
+ const char* Name() const override { return "NEON, 20x1, straight, depth 4"; }
+
+ void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride,
+ const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr,
+ int start_depth, int run_depth) const override {
+ ScopedProfilingLabel label("optimized kernel");
+
+ assert(dst_row_stride == 1);
+
+ asm volatile(
+ // Clear accumulator registers (see layout below)
+ "vmov.s32 q11, #0\n"
+ "vmov.s32 q12, q11\n"
+ "vmov.s32 q13, q11\n"
+ "vmov.s32 q14, q11\n"
+ "vmov.s32 q15, q11\n"
+
+ /* Main loop */
+
+ "loop_NEONKernel20x1Depth4_%=:\n"
+
+ // Overview of register layout:
+ //
+ // A 4x1 cell of Rhs is stored in 16bit in d0.
+ // 5 cells, of size 4x4 each, are stored in 16bit in d2--d21.
+ // A block of accumulators of size 20x1 is stored in 32bit
+ // in q11--q15.
+ //
+ // +-----+
+ // |d0[0]|
+ // +-----+
+ // |d0[1]|
+ // +-----+ Rhs
+ // |d0[2]|
+ // +-----+
+ // |d0[3]|
+ // +-----+
+ // | |
+ //
+ // Lhs | |
+ //
+ // +---+---+---+---+ - - - - +-----+
+ // |d2 |d3 |d4 |d5 | | q11 |
+ // |d2 |d3 |d4 |d5 | | q11 |
+ // |d2 |d3 |d4 |d5 | | q11 |
+ // |d2 |d3 |d4 |d5 | | q11 |
+ // +---+---+---+---+ - - - - +-----+
+ // |d6 |d7 |d8 |d9 | | q12 |
+ // |d6 |d7 |d8 |d9 | | q12 |
+ // |d6 |d7 |d8 |d9 | | q12 |
+ // |d6 |d7 |d8 |d9 | | q12 |
+ // +---+---+---+---+ - - - - +-----+
+ // |d10|d11|d12|d13| | q13 |
+ // |d10|d11|d12|d13| | q13 | Accumulator
+ // |d10|d11|d12|d13| | q13 |
+ // |d10|d11|d12|d13| | q13 |
+ // +---+---+---+---+ - - - - +-----+
+ // |d14|d15|d16|d17| | q14 |
+ // |d14|d15|d16|d17| | q14 |
+ // |d14|d15|d16|d17| | q14 |
+ // |d14|d15|d16|d17| | q14 |
+ // +---+---+---+---+ - - - - +-----+
+ // |d18|d19|d20|d21| | q15 |
+ // |d18|d19|d20|d21| | q15 |
+ // |d18|d19|d20|d21| | q15 |
+ // |d18|d19|d20|d21| | q15 |
+ // +---+---+---+---+ - - - - +-----+
+
+ // Load 1 Rhs cell
+ "vldr.32 d0, [%[rhs_ptr]]\n"
+ "add %[rhs_ptr], #4\n"
+
+        // Load 5 Lhs cells of size 4x4 each (10 loads of 8 bytes each)
+ "vld1.8 {d2}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d4}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d6}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d8}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d10}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d12}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d14}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d16}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d18}, [%[lhs_ptr]:64]!\n"
+ "vld1.8 {d20}, [%[lhs_ptr]:64]!\n"
+
+ // Expand Lhs/Rhs cells to 16 bit.
+ "vmovl.u8 q0, d0\n"
+ "vmovl.u8 q1, d2\n"
+ "vmovl.u8 q2, d4\n"
+ "vmovl.u8 q3, d6\n"
+ "vmovl.u8 q4, d8\n"
+ "vmovl.u8 q5, d10\n"
+ "vmovl.u8 q6, d12\n"
+ "vmovl.u8 q7, d14\n"
+ "vmovl.u8 q8, d16\n"
+ "vmovl.u8 q9, d18\n"
+ "vmovl.u8 q10, d20\n"
+
+ // Multiply-accumulate, level of depth 0
+ "vmlal.u16 q11, d2, d0[0]\n"
+ "vmlal.u16 q12, d6, d0[0]\n"
+ "vmlal.u16 q13, d10, d0[0]\n"
+ "vmlal.u16 q14, d14, d0[0]\n"
+ "vmlal.u16 q15, d18, d0[0]\n"
+
+ // Multiply-accumulate, level of depth 1
+ "vmlal.u16 q11, d3, d0[1]\n"
+ "vmlal.u16 q12, d7, d0[1]\n"
+ "vmlal.u16 q13, d11, d0[1]\n"
+ "vmlal.u16 q14, d15, d0[1]\n"
+ "vmlal.u16 q15, d19, d0[1]\n"
+
+ // Multiply-accumulate, level of depth 2
+ "vmlal.u16 q11, d4, d0[2]\n"
+ "vmlal.u16 q12, d8, d0[2]\n"
+ "vmlal.u16 q13, d12, d0[2]\n"
+ "vmlal.u16 q14, d16, d0[2]\n"
+ "vmlal.u16 q15, d20, d0[2]\n"
+
+ // Multiply-accumulate, level of depth 3
+ "vmlal.u16 q11, d5, d0[3]\n"
+ "vmlal.u16 q12, d9, d0[3]\n"
+ "vmlal.u16 q13, d13, d0[3]\n"
+ "vmlal.u16 q14, d17, d0[3]\n"
+ "vmlal.u16 q15, d21, d0[3]\n"
+
+        // Loop. Decrement loop index (depth) by 4, since we just handled
+        // 4 levels of depth (Kernel::kDepth=4).
+ "subs %[run_depth], #4\n"
+ "bne loop_NEONKernel20x1Depth4_%=\n"
+
+ /* end of main loop */
+
+ /* Accumulate our local accumulator registers into the destination block
+ */
+
+ "cmp %[start_depth], #0\n"
+ "beq store_result_NEONKernel20x1Depth4_%=\n"
+
+ "mov r0, %[dst_ptr]\n"
+ // Load a column
+ "vld1.32 {d0, d1}, [r0]!\n"
+ "vld1.32 {d2, d3}, [r0]!\n"
+ "vld1.32 {d4, d5}, [r0]!\n"
+ "vld1.32 {d6, d7}, [r0]!\n"
+ "vld1.32 {d8, d9}, [r0]!\n"
+
+ // Accumulate a column
+ "vadd.s32 q11, q11, q0\n"
+ "vadd.s32 q12, q12, q1\n"
+ "vadd.s32 q13, q13, q2\n"
+ "vadd.s32 q14, q14, q3\n"
+ "vadd.s32 q15, q15, q4\n"
+
+ "store_result_NEONKernel20x1Depth4_%=:\n"
+
+ "mov r0, %[dst_ptr]\n"
+ "vst1.32 {d22, d23}, [r0]!\n"
+ "vst1.32 {d24, d25}, [r0]!\n"
+ "vst1.32 {d26, d27}, [r0]!\n"
+ "vst1.32 {d28, d29}, [r0]!\n"
+ "vst1.32 {d30, d31}, [r0]!\n"
+ : // outputs
+ [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
+ [run_depth] "+r"(run_depth)
+ : // inputs
+ [dst_ptr] "r"(dst_ptr), [dst_col_stride] "r"(dst_col_stride),
+ [start_depth] "r"(start_depth)
+ : // clobbers
+ "cc", "memory", "r0",
+        // note: quad registers are reportedly unsupported in the clobber
+        // list, so we list the d registers individually instead.
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
+ "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
+ "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
+ "d31");
+ }
+};
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_KERNEL_NEON_H_
diff --git a/internal/kernel_reference.h b/internal/kernel_reference.h
new file mode 100644
index 0000000..2c65087
--- /dev/null
+++ b/internal/kernel_reference.h
@@ -0,0 +1,119 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// kernel_reference.h: a reference kernel for CPU architectures where we don't
+// have optimized kernels yet. Also useful for testing, as it's templatized
+// to have any arbitrary format, allowing tests to cover all sorts of corner
+// cases.
+
+#ifndef GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
+#define GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
+
+#include "internal/kernel.h"
+
+#include <cstring>
+#include <cstdio>
+
+namespace gemmlowp {
+
+// This kernel is templatized on an arbitrary Format parameter,
+// allowing it to implement any kernel format.
+template <typename tFormat>
+struct ReferenceKernel : KernelBase {
+ typedef tFormat Format;
+
+ const char* Name() const override {
+ static char buf[256];
+ snprintf(buf, sizeof(buf),
+ "reference(Lhs: %d cells %dx%d %s, Rhs: %d cells %dx%d %s)",
+ Format::Lhs::kCells, Format::Lhs::Cell::kWidth,
+ Format::Lhs::Cell::kDepth,
+ CellOrderName(Format::Lhs::Cell::kOrder), Format::Rhs::kCells,
+ Format::Rhs::Cell::kDepth, Format::Rhs::Cell::kWidth,
+ CellOrderName(Format::Rhs::Cell::kOrder));
+ return buf;
+ }
+
+ void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride,
+ const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr,
+ int start_depth, int run_depth) const override {
+ std::int32_t accumulator[Format::kRows * Format::kCols];
+ memset(accumulator, 0, sizeof(accumulator));
+
+ const int run_depth_cells = run_depth / Format::kDepth;
+
+ // The outer loop is over the depth dimension.
+ for (int dc = 0; dc < run_depth_cells; dc++) {
+ // The next two loops are over cells of the Lhs (stacked vertically),
+ // and over cells of the Rhs (stacked horizontally).
+ for (int rc = 0; rc < Format::Lhs::kCells; rc++) {
+ const std::uint8_t* lhs_cell_ptr = lhs_ptr +
+ (dc * Format::Lhs::kCells + rc) *
+ Format::Lhs::Cell::kWidth *
+ Format::kDepth;
+ for (int cc = 0; cc < Format::Rhs::kCells; cc++) {
+ const std::uint8_t* rhs_cell_ptr = rhs_ptr +
+ (dc * Format::Rhs::kCells + cc) *
+ Format::Rhs::Cell::kWidth *
+ Format::kDepth;
+
+ // Now we are inside one cell of the Lhs and inside one cell
+ // of the Rhs, so the remaining inner loops are just
+ // traditional three loops of matrix multiplication.
+ for (int di = 0; di < Format::kDepth; di++) {
+ for (int ri = 0; ri < Format::Lhs::Cell::kWidth; ri++) {
+ for (int ci = 0; ci < Format::Rhs::Cell::kWidth; ci++) {
+ const std::uint8_t* lhs_coeff_ptr =
+ lhs_cell_ptr +
+ OffsetIntoCell<typename Format::Lhs::Cell>(ri, di);
+ const std::uint8_t* rhs_coeff_ptr =
+ rhs_cell_ptr +
+ OffsetIntoCell<typename Format::Rhs::Cell>(ci, di);
+ std::int32_t* accumulator_coeff_ptr =
+ accumulator + (ri + rc * Format::Lhs::Cell::kWidth) +
+ (ci + cc * Format::Rhs::Cell::kWidth) * Format::kRows;
+ *accumulator_coeff_ptr +=
+ std::int32_t(*lhs_coeff_ptr) * std::int32_t(*rhs_coeff_ptr);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (start_depth == 0) {
+      // start_depth == 0 means nothing has been accumulated into the
+      // destination yet, so we overwrite it (it hasn't been initialized to
+      // zero) rather than add to it.
+ for (int r = 0; r < Format::kRows; r++) {
+ for (int c = 0; c < Format::kCols; c++) {
+ dst_ptr[r * dst_row_stride + c * dst_col_stride] =
+ accumulator[r + c * Format::kRows];
+ }
+ }
+ } else {
+ // We have already accumulated stuff, so we need to continue accumulating
+ // instead of just overwriting.
+ for (int r = 0; r < Format::kRows; r++) {
+ for (int c = 0; c < Format::kCols; c++) {
+ dst_ptr[r * dst_row_stride + c * dst_col_stride] +=
+ accumulator[r + c * Format::kRows];
+ }
+ }
+ }
+ }
+};
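+
+// For illustration only, a sketch of how tests can instantiate this kernel
+// with an arbitrary format (the cell depths of the two sides are assumed to
+// have to match; 'OddKernel' is just a placeholder name):
+//
+//   typedef ReferenceKernel<
+//       KernelFormat<KernelSideFormat<CellFormat<3, 5>, 2>,
+//                    KernelSideFormat<CellFormat<2, 5>, 3> > > OddKernel;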
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
diff --git a/internal/multi_thread_gemm.h b/internal/multi_thread_gemm.h
new file mode 100644
index 0000000..b008034
--- /dev/null
+++ b/internal/multi_thread_gemm.h
@@ -0,0 +1,496 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// multi_thread_gemm.h: Multi-threaded GEMM entry point.
+// Readers note: To understand this file, it is useful to first
+// read and understand the much simpler single_thread_gemm.h.
+
+#ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
+#define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
+
+#include <pthread.h>
+#include <unistd.h>
+#include <vector>
+
+#include "internal/single_thread_gemm.h"
+
+namespace gemmlowp {
+
+// A BlockingCounter lets one thread wait for N events to occur.
+// This is how the master thread waits for all the worker threads
+// to have finished working.
+class BlockingCounter {
+ public:
+ BlockingCounter()
+ : cond_(PTHREAD_COND_INITIALIZER),
+ mutex_(PTHREAD_MUTEX_INITIALIZER),
+ count_(0) {}
+
+ // Sets/resets the counter; initial_count is the number of
+ // decrementing events that the Wait() call will be waiting for.
+ void Reset(int initial_count) {
+ pthread_mutex_lock(&mutex_);
+ assert(count_ == 0);
+ count_ = initial_count;
+ pthread_mutex_unlock(&mutex_);
+ }
+
+ // Decrements the counter; if the counter hits zero, signals
+ // the thread that was waiting for that, and returns true.
+ // Otherwise (if the decremented count is still nonzero),
+ // returns false.
+ bool DecrementCount() {
+ pthread_mutex_lock(&mutex_);
+ assert(count_ > 0);
+ count_--;
+ if (count_ == 0) {
+ pthread_cond_signal(&cond_);
+ }
+ bool retval = count_ == 0;
+ pthread_mutex_unlock(&mutex_);
+ return retval;
+ }
+
+ // Waits for the N other threads (N having been set by Reset())
+ // to hit the BlockingCounter.
+ void Wait() {
+ ScopedProfilingLabel label("BlockingCounter::Wait");
+ pthread_mutex_lock(&mutex_);
+ while (count_) {
+ pthread_cond_wait(&cond_, &mutex_);
+ }
+ pthread_mutex_unlock(&mutex_);
+ }
+
+ private:
+ pthread_cond_t cond_;
+ pthread_mutex_t mutex_;
+ int count_;
+};
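+
+// A usage sketch, for illustration (this mirrors how MultiThreadGemm below
+// uses the counter; 'counter' and 'num_workers' are placeholder names):
+//
+//   counter.Reset(num_workers);   // expect num_workers completion events
+//   // ... hand out work; each worker eventually calls
+//   // counter.DecrementCount() exactly once ...
+//   counter.Wait();               // the master blocks until all have finished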
+
+// A workload for a worker.
+struct Task {
+ Task() : local_allocator(nullptr) {}
+ virtual ~Task() {}
+ virtual void Run() const = 0;
+ Allocator* local_allocator;
+};
+
+// A worker thread.
+class Worker {
+ public:
+ enum class State {
+ ThreadStartup, // The initial state before the thread main loop runs.
+ Ready, // Is not working, has not yet received new work to do.
+ HasWork, // Has work to do.
+ ExitAsSoonAsPossible // Should exit at earliest convenience.
+ };
+
+ explicit Worker(BlockingCounter* counter_to_decrement_when_ready)
+ : task_(nullptr),
+ state_cond_(PTHREAD_COND_INITIALIZER),
+ state_mutex_(PTHREAD_MUTEX_INITIALIZER),
+ state_(State::ThreadStartup),
+ counter_to_decrement_when_ready_(counter_to_decrement_when_ready) {
+ pthread_create(&thread_, nullptr, ThreadFunc, this);
+ }
+
+ ~Worker() {
+ ChangeState(State::ExitAsSoonAsPossible);
+ pthread_join(thread_, nullptr);
+ }
+
+  // Changes State; may be called from either the worker thread
+  // or the master thread. Not all state transitions are legal;
+  // the assertions below enforce which ones are.
+ void ChangeState(State new_state) {
+ ScopedProfilingLabel label("Worker::ChangeState");
+ pthread_mutex_lock(&state_mutex_);
+ assert(new_state != state_);
+ switch (state_) {
+ case State::ThreadStartup:
+ assert(new_state == State::Ready);
+ break;
+ case State::Ready:
+ assert(new_state == State::HasWork ||
+ new_state == State::ExitAsSoonAsPossible);
+ break;
+ case State::HasWork:
+ assert(new_state == State::Ready ||
+ new_state == State::ExitAsSoonAsPossible);
+ break;
+ default:
+ abort();
+ }
+ state_ = new_state;
+ pthread_cond_signal(&state_cond_);
+ if (state_ == State::Ready) {
+ counter_to_decrement_when_ready_->DecrementCount();
+ }
+ pthread_mutex_unlock(&state_mutex_);
+ }
+
+ // Thread entry point.
+ void ThreadFunc() {
+ ScopedProfilingLabel label("Worker::ThreadFunc");
+ RegisterCurrentThreadForProfiling();
+
+ ChangeState(State::Ready);
+
+ // Thread main loop
+ while (true) {
+ // Get a state to act on
+ pthread_mutex_lock(&state_mutex_);
+ switch (state_) {
+ case State::ExitAsSoonAsPossible:
+ case State::HasWork:
+ break;
+ case State::Ready:
+ // In the 'Ready' state, we have nothing to do but to wait until
+ // we switch to another state.
+ while (state_ == State::Ready) {
+ ScopedProfilingLabel label("Worker::ThreadFunc waiting");
+ pthread_cond_wait(&state_cond_, &state_mutex_);
+ }
+ break;
+ default:
+ abort();
+ }
+ State state_to_act_upon = state_;
+ pthread_mutex_unlock(&state_mutex_);
+
+ // We now have a state to act on, so act.
+ switch (state_to_act_upon) {
+ case State::HasWork:
+ // Got work to do! So do it, and then revert to 'Ready' state.
+ assert(task_);
+ task_->Run();
+ delete task_;
+ task_ = nullptr;
+ ChangeState(State::Ready);
+ break;
+ case State::ExitAsSoonAsPossible:
+ return;
+ default:
+ abort();
+ }
+ }
+ }
+
+ static void* ThreadFunc(void* arg) {
+ static_cast<Worker*>(arg)->ThreadFunc();
+ return nullptr;
+ }
+
+  // Called by the master thread to give this worker work to do.
+  // It is only legal to call this if the worker is currently in the
+  // 'Ready' state.
+ void StartWork(Task* task) {
+ assert(!task_);
+ task->local_allocator = &local_allocator_;
+ task_ = task;
+ assert(state_ == State::Ready);
+ ChangeState(State::HasWork);
+ }
+
+ private:
+ // The underlying thread.
+ pthread_t thread_;
+
+ // The task to be worked on.
+ const Task* task_;
+
+ // The condition variable and mutex guarding state changes.
+ pthread_cond_t state_cond_;
+ pthread_mutex_t state_mutex_;
+
+ // The state enum tells if we're currently working, waiting for work, etc.
+ State state_;
+
+  // Each worker thread has its own local allocator, so workers can allocate
+  // temporary buffers without blocking each other.
+ Allocator local_allocator_;
+
+  // Pointer to the master thread's BlockingCounter object, used to notify
+  // the master thread when this worker switches to the 'Ready' state.
+ BlockingCounter* const counter_to_decrement_when_ready_;
+};
+
+// A very simple pool of workers, which only allows the specific
+// parallelization pattern that we use here:
+// a fixed number of workers can be given work, and one then
+// waits for all of them to finish.
+class WorkersPool {
+ public:
+ WorkersPool() {}
+
+ ~WorkersPool() {
+ for (auto w : workers_) {
+ delete w;
+ }
+ }
+
+ BlockingCounter& counter_to_decrement_when_ready() {
+ return counter_to_decrement_when_ready_;
+ }
+
+ // Give work to a specific worker.
+ void StartWorker(int index, Task* task_) {
+ assert(static_cast<std::size_t>(index) < workers_.size());
+ workers_[index]->StartWork(task_);
+ }
+
+ // Ensures that the pool has at least the given count of workers.
+  // If any new workers have to be created, this function waits for them to
+  // be ready.
+ void CreateWorkers(std::size_t workers_count) {
+ if (workers_.size() >= workers_count) {
+ return;
+ }
+ counter_to_decrement_when_ready_.Reset(workers_count - workers_.size());
+ while (workers_.size() < workers_count) {
+ workers_.push_back(new Worker(&counter_to_decrement_when_ready_));
+ }
+ counter_to_decrement_when_ready_.Wait();
+ }
+
+ private:
+ // copy construction disallowed
+ WorkersPool(const WorkersPool&) = delete;
+
+ // The workers in this pool. They are owned by the pool:
+ // the pool creates workers and destroys them in its destructor.
+ std::vector<Worker*> workers_;
+
+ // The BlockingCounter used to wait for the workers.
+ BlockingCounter counter_to_decrement_when_ready_;
+};
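+
+// A usage sketch, for illustration (this is the pattern MultiThreadGemm below
+// follows; 'pool', 'n' and 'SomeTask' are placeholder names):
+//
+//   pool.CreateWorkers(n);
+//   pool.counter_to_decrement_when_ready().Reset(n);
+//   for (int w = 0; w < n; w++) pool.StartWorker(w, new SomeTask(...));
+//   pool.counter_to_decrement_when_ready().Wait();
+//
+// Each Task is deleted by its Worker once Run() has returned.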
+
+// The task we use to implement a multi-threaded Gemm: a block of the
+// RHS has been packed by the master thread; each worker thread
+// then has to pack a block of the LHS and accumulate the Gemm of these
+// packed LHS and RHS blocks.
+template <typename KernelFormat, typename Scalar, MapOrder LhsOrder,
+ MapOrder RhsOrder, MapOrder ResultOrder>
+struct GemmWithPackedRhsTask : Task {
+ GemmWithPackedRhsTask(
+ const KernelBase& _kernel, const MatrixMap<const Scalar, LhsOrder>& _lhs,
+ const PackedSideBlock<typename KernelFormat::Rhs>& _packed_rhs,
+ MatrixMap<Scalar, ResultOrder>* _result, int _lhs_offset, int _rhs_offset,
+ int _result_offset, int _result_mult_int, int _result_shift)
+ : kernel(_kernel),
+ lhs(_lhs),
+ packed_rhs(_packed_rhs),
+ result(*_result),
+ lhs_offset(_lhs_offset),
+ rhs_offset(_rhs_offset),
+ result_offset(_result_offset),
+ result_mult_int(_result_mult_int),
+ result_shift(_result_shift) {}
+
+ void Run() const override {
+ ScopedProfilingLabel label("GemmWithPackedRhsTask");
+
+ const int rows = result.rows();
+ const int cols = result.cols();
+ const int depth = lhs.cols();
+
+ BlockParams block_params;
+ block_params.Init<KernelFormat>(rows, cols, depth, 1);
+
+ PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(
+ Side::Lhs, local_allocator, block_params, rhs_offset);
+
+ PackedResultInt32 packed_result(local_allocator, block_params);
+
+ local_allocator->Commit();
+
+ for (int c = 0; c < cols; c += block_params.l2_cols) {
+ int cs = std::min(block_params.l2_cols, cols - c);
+
+ for (int r = 0; r < rows; r += block_params.l2_rows) {
+ int rs = std::min(block_params.l2_rows, rows - r);
+
+ PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));
+
+ Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs);
+
+ auto result_block = result.block(r, c, rs, cs);
+ UnpackResult(&result_block, packed_result, packed_lhs, packed_rhs,
+ depth, result_offset, result_mult_int, result_shift);
+ }
+ }
+
+ local_allocator->Decommit();
+ }
+
+ const KernelBase& kernel;
+ const MatrixMap<const Scalar, LhsOrder> lhs;
+ const PackedSideBlock<typename KernelFormat::Rhs> packed_rhs;
+ MatrixMap<Scalar, ResultOrder> result;
+ int lhs_offset;
+ int rhs_offset;
+ int result_offset;
+ int result_mult_int;
+ int result_shift;
+};
+
+class MultiThreadGemmContext : public SingleThreadGemmContext {
+ public:
+ MultiThreadGemmContext() : max_num_threads_(0) {}
+
+ void set_max_num_threads(int n) { max_num_threads_ = n; }
+
+ int max_num_threads() const { return max_num_threads_; }
+
+ WorkersPool* workers_pool() { return &workers_pool_; }
+
+ protected:
+ // The workers pool used by MultiThreadGemm. Making
+ // this part of the context allows it to be persistent,
+ // avoiding recreating threads on every Gemm.
+ WorkersPool workers_pool_;
+
+ // The maximum number of worker threads to use (in addition
+ // to the master thread).
+ // The default value 0 means the default behavior of
+ // detecting the number of hardware threads. Nonzero values mean
+ // skipping and overriding hardware detection.
+ int max_num_threads_;
+};
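+
+// A caller-side sketch, for illustration ('context' is a placeholder name):
+//
+//   MultiThreadGemmContext context;
+//   context.set_max_num_threads(4);  // 0 (the default) means autodetect
+//   // ... pass &context to MultiThreadGemm(); reusing the same context
+//   // across calls keeps the worker threads and allocations alive.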
+
+// Determines how many worker threads should be used for a given Gemm
+// operation.
+template <int KernelRows>
+inline int HowManyWorkers(MultiThreadGemmContext* context, int rows, int cols,
+ int depth) {
+ // First check if the user set an explicit maximum number of threads.
+ int max_count = context->max_num_threads();
+ if (!max_count) {
+ // No user-set maximum number of threads, so we need to
+ // do some hardware detection.
+    // This is expensive to query, so we only do it once, at the cost of not
+    // reacting to dynamic changes in the core count. Also, we don't use the
+    // C++11 standard getter (std::thread::hardware_concurrency()) because
+    // Google's coding style currently bans #include <thread>.
+ static const int hardware_threads_count =
+ static_cast<int>(sysconf(_SC_NPROCESSORS_CONF));
+
+ max_count = hardware_threads_count;
+ }
+
+ // Basic calculation: take into account max pool size, and
+ // how many rows we have to feed our kernel.
+ int workers_count = std::min(max_count, CeilQuotient(rows, KernelRows));
+
+  // At this point, for small products, we already have workers_count == 1,
+  // so we can avoid doing more work; otherwise, we still want to check that
+  // the cubic size (rows*cols*depth) is big enough to keep that many
+  // workers busy.
+ if (workers_count > 1) {
+ // Empirically determined value.
+ static const int min_cubic_size_per_thread = 32 * 1024;
+
+ // We can only multiply two out of three sizes without risking overflow
+ int cols_times_depth = cols * depth;
+
+ if (cols_times_depth < min_cubic_size_per_thread) {
+ // in that case, we can multiply by rows without risking overflow
+ int cubic_size = rows * cols_times_depth;
+ workers_count = std::min(
+ workers_count, CeilQuotient(cubic_size, min_cubic_size_per_thread));
+ }
+ }
+
+ assert(workers_count > 0 && workers_count <= max_count);
+ return workers_count;
+}
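+
+// A worked example of the heuristic above, with made-up numbers and assuming
+// CeilQuotient(a, b) == ceil(a/b): for max_count = 8, KernelRows = 12 and
+// rows = 60, the row-based bound is min(8, CeilQuotient(60, 12)) = 5 workers.
+// If additionally cols = 2 and depth = 100, then cols*depth = 200 < 32768,
+// the cubic size is 60 * 200 = 12000, and the final count is
+// min(5, CeilQuotient(12000, 32768)) = 1 worker.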
+
+// The main multi-threaded Gemm function.
+// To understand it, first read the code of SingleThreadGemm().
+// The parallelization scheme used here is to have this master function
+// pack a block of RHS and then start worker threads to pack a block of LHS
+// each, and accumulate the corresponding products.
+template <typename KernelFormat, typename Scalar, MapOrder LhsOrder,
+ MapOrder RhsOrder, MapOrder ResultOrder>
+void MultiThreadGemm(MultiThreadGemmContext* context, const KernelBase& kernel,
+ const MatrixMap<const Scalar, LhsOrder>& lhs,
+ const MatrixMap<const Scalar, RhsOrder>& rhs,
+ MatrixMap<Scalar, ResultOrder>* result, int lhs_offset,
+ int rhs_offset, int result_offset, int result_mult_int,
+ int result_shift) {
+ ScopedProfilingLabel label("gemmlowp::MultiThreadGemm");
+
+ assert(lhs.cols() == rhs.rows());
+
+ int rows = result->rows();
+ int cols = result->cols();
+ int depth = lhs.cols();
+
+ const int workers_count =
+ HowManyWorkers<KernelFormat::kRows>(context, rows, cols, depth);
+ if (workers_count == 1) {
+ return SingleThreadGemm<KernelFormat, Scalar, LhsOrder, RhsOrder,
+ ResultOrder>(context, kernel, lhs, rhs, result,
+ lhs_offset, rhs_offset, result_offset,
+ result_mult_int, result_shift);
+ }
+ assert(workers_count > 1);
+
+ Allocator* allocator = context->allocator();
+ WorkersPool* workers_pool = context->workers_pool();
+
+ workers_pool->CreateWorkers(workers_count);
+
+ BlockParams block_params;
+ block_params.Init<KernelFormat>(rows, cols, depth, workers_count);
+
+ PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(
+ Side::Rhs, allocator, block_params, lhs_offset);
+ allocator->Commit();
+
+ // We loop over large blocks of the RHS.
+ for (int c = 0; c < cols; c += block_params.l2_cols) {
+ int cs = std::min(block_params.l2_cols, cols - c);
+
+ // Pack a large block of the RHS.
+ PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
+
+ // Give work to each worker.
+ int next_start_row = 0;
+ workers_pool->counter_to_decrement_when_ready().Reset(workers_count);
+ for (int w = 0; w < workers_count; w++) {
+ int start_row = next_start_row;
+ next_start_row = std::min(
+ rows, RoundUp<KernelFormat::kRows>(rows * (w + 1) / workers_count));
+
+ int block_rows = next_start_row - start_row;
+ auto lhs_block = lhs.block(start_row, 0, block_rows, depth);
+ auto result_block = result->block(start_row, c, block_rows, cs);
+ typedef GemmWithPackedRhsTask<KernelFormat, Scalar, LhsOrder, RhsOrder,
+ ResultOrder> TaskType;
+
+ auto task = new TaskType(kernel, lhs_block, packed_rhs, &result_block,
+ lhs_offset, rhs_offset, result_offset,
+ result_mult_int, result_shift);
+ workers_pool->StartWorker(w, task);
+ }
+ // Wait for the workers.
+ workers_pool->counter_to_decrement_when_ready().Wait();
+ }
+
+ allocator->Decommit();
+}
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_
diff --git a/internal/pack.h b/internal/pack.h
new file mode 100644
index 0000000..b37d505
--- /dev/null
+++ b/internal/pack.h
@@ -0,0 +1,350 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// pack.h: packing blocks of the LHS and RHS into the data layout
+// that is expected by compute.h and eventually by kernels.
+// Because this data layout depends on the kernel format, code here
+// is templated in KernelLhsFormat/KernelRhsFormat.
+//
+// Readers note: an important theme around here is that we try hard
+// to handle both Lhs and Rhs with a single piece of code. We generically
+// refer to either the Lhs or the Rhs as a 'Side'. Instead of addressing
+// matrices by (row, column) indices, we address them by (width, depth), as
+// explained in kernel.h. This allows us to treat the Lhs and Rhs uniformly,
+// with the same code.
+
+#ifndef GEMMLOWP_INTERNAL_PACK_H_
+#define GEMMLOWP_INTERNAL_PACK_H_
+
+#include <cstring>
+
+#include "internal/block_params.h"
+#include "internal/kernel.h"
+#include "internal/common.h"
+#include "internal/allocator.h"
+
+namespace gemmlowp {
+
+// A PackedSideBlock instance is a packed block of either the LHS or RHS
+// (whence the generic 'Side' name).
+//
+// 'Packed' means that it is laid out in the storage order that
+// is expected by the specified kernel format. From a block of the input
+// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
+// or PackRhs().
+template <typename KernelSideFormat>
+class PackedSideBlock {
+ public:
+ PackedSideBlock(Side side, Allocator* allocator,
+ const BlockParams& block_params,
+ int rank_one_update_multiplier)
+ : allocator_(allocator),
+ rank_one_update_multiplier_(rank_one_update_multiplier),
+ pos_(0) {
+ GetSideBlockParams(side, &params_, block_params);
+ data_handle_ =
+ allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
+ rank_one_update_handle_ =
+ allocator_->Reserve<std::int32_t>(params_.l2_width);
+ }
+
+ ~PackedSideBlock() {}
+
+ void seek_run(int start_width, int start_depth) const {
+ int kernel_run_depth =
+ std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
+ pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
+ }
+
+ void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }
+
+ void seek_forward_n_cells(int n) const {
+ pos_ += n * KernelSideFormat::Cell::kSize;
+ }
+
+ const std::uint8_t* current_data() const {
+ return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
+ }
+
+ std::uint8_t* current_data() {
+ return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
+ }
+
+ std::int32_t* rank_one_update() {
+ return allocator_->GetPointer<std::int32_t>(rank_one_update_handle_);
+ }
+
+ const std::int32_t* rank_one_update() const {
+ return allocator_->GetPointer<const std::int32_t>(rank_one_update_handle_);
+ }
+
+ std::int32_t rank_one_update_multiplier() const {
+ return rank_one_update_multiplier_;
+ }
+
+ const SideBlockParams& params() const { return params_; }
+
+ private:
+  // The block size parameters that this PackedSideBlock follows.
+ // The L2 parameters determine its overall size, while the L1 parameters,
+ // together with the kernel format template parameter, determine
+ // the fine details of the storage/traversal order.
+ SideBlockParams params_;
+
+ // Pointer to the allocator provided by the caller. Not owned.
+ // The Allocator is assumed to outlive the PackedSideBlock.
+ Allocator* const allocator_;
+
+ // Handle on the buffer backing this packed block. Owned.
+ Allocator::Handle data_handle_;
+
+ // Handle on the additional buffer backing the rank-one-update vector
+ // associated with this block. Owned.
+ Allocator::Handle rank_one_update_handle_;
+
+ // The constant multiplier of the rank one update vector.
+ std::int32_t rank_one_update_multiplier_;
+
+ // pos_ is the current position in the buffer, which we access
+ // sequentially, like a file.
+ // The idea is that we pack data in the same order as it is
+  // going to be traversed during the computation. That order is chosen for
+  // cache-friendliness and would be complicated to random-access, as the
+  // offset calculations would be intricate. So we give up random-access
+  // addressing, and instead content ourselves with sequential access.
+ //
+ // pos_ is mutable because during the computation we will want to
+ // be able to iterate on the data in a const PackedSideBlock.
+ mutable int pos_;
+};
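+
+// A sketch of the sequential-access contract, for illustration (this is
+// pseudo-code, not the actual compute-stage loop):
+//
+//   packed.seek_run(start_width, start_depth);
+//   for (/* each cell in the run */) {
+//     const std::uint8_t* cell_data = packed.current_data();
+//     // ... feed cell_data to the kernel ...
+//     packed.seek_next_cell();
+//   }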
+
+// WidthMajor and DepthMajor are custom phrases modelled after the
+// standard terminology 'row-major' and 'column-major'. Their meaning
+// should be transparent once one has read the explanation in kernel.h:
+// for example, in the Lhs, the 'width' dimension is the rows dimension,
+// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
+// Another way to put it: WidthMajor means that contiguous storage is used
+// for entries having the same 'width' index.
+enum class SideMapOrder { WidthMajor, DepthMajor };
+
+// Similar to MatrixMap from map.h, but in terms of width/depth instead of
+// rows/columns. Used to address blocks of the input LHS/RHS matrices when
+// packing them.
+template <typename tScalar, SideMapOrder tOrder>
+class SideMap {
+ public:
+ typedef tScalar Scalar;
+ static const SideMapOrder kOrder = tOrder;
+
+ SideMap(Scalar* data, int width, int depth, int stride)
+ : data_(data), width_(width), depth_(depth), stride_(stride) {}
+
+ SideMap(const SideMap& other)
+ : data_(other.data_),
+ width_(other.width_),
+ depth_(other.depth_),
+ stride_(other.stride_) {}
+
+ int width() const { return width_; }
+ int depth() const { return depth_; }
+ int stride() const { return stride_; }
+ int width_stride() const {
+ return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
+ }
+ int depth_stride() const {
+ return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
+ }
+ Scalar* data() const { return data_; }
+ Scalar* data(int w, int d) const {
+ return data_ + w * width_stride() + d * depth_stride();
+ }
+ Scalar operator()(int w, int d) const { return *data(w, d); }
+ Scalar& operator()(int w, int d) { return *data(w, d); }
+
+ SideMap block(int start_width, int start_depth, int block_width,
+ int block_depth) const {
+ assert(start_width >= 0);
+ assert(start_width + block_width <= width_);
+ assert(start_depth >= 0);
+ assert(start_depth + block_depth <= depth_);
+
+ return SideMap(data(start_width, start_depth), block_width, block_depth,
+ stride_);
+ }
+
+ private:
+ Scalar* data_; // not owned.
+ int width_, depth_, stride_;
+};
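+
+// For illustration, how PackLhs() below maps a row-major LHS block onto a
+// SideMap: 'width' is the rows dimension and 'depth' is the cols dimension,
+// so SideMap entry (w, d) is LHS entry (row w, col d):
+//
+//   SideMap<const std::uint8_t, SideMapOrder::WidthMajor>
+//       lhs_side(lhs.data(), lhs.rows(), lhs.cols(), lhs.stride());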
+
+// Generic (slow) packing code.
+template <typename SrcMapType, typename KernelSideFormat>
+class PackSideBlockImplGeneric {
+ public:
+ typedef typename KernelSideFormat::Cell CellFormat;
+ static const int kLhsCells = KernelSideFormat::kCells;
+ static const int kCellWidth = CellFormat::kWidth;
+ static const int kKernelWidth = CellFormat::kWidth * kLhsCells;
+ static const int kCellDepth = CellFormat::kDepth;
+
+ virtual ~PackSideBlockImplGeneric() {}
+
+ PackSideBlockImplGeneric(PackedSideBlock<KernelSideFormat>* packed_side_block,
+ const SrcMapType& src_map)
+ : packed_side_block_(packed_side_block), src_map_(src_map) {}
+
+ // The public entry point to pack a block.
+ void PackL2() {
+ memset(packed_side_block_->rank_one_update(), 0,
+ sizeof(std::int32_t) * packed_side_block_->params().l2_width);
+ for (int d = 0; d < src_map_.depth();
+ d += packed_side_block_->params().l1_depth) {
+ int ds = std::min<int>(packed_side_block_->params().l1_depth,
+ src_map_.depth() - d);
+
+ for (int w = 0; w < src_map_.width();
+ w += packed_side_block_->params().l1_width) {
+ int ws = std::min<int>(packed_side_block_->params().l1_width,
+ src_map_.width() - w);
+
+ PackL1(w, ws, d, ds);
+ }
+ }
+ }
+
+ PackedSideBlock<KernelSideFormat>* packed_side_block() const {
+ return packed_side_block_;
+ }
+
+ const SrcMapType& src_map() const { return src_map_; }
+
+ protected:
+  // PackRun packs only a run, i.e. it is the inner loop in the depth
+  // dimension.
+ // This is what subclasses may override to provide optimized code paths.
+ // Optimized implementations may still fall back to this generic code
+ // to handle unaligned boundaries.
+ virtual void PackRun(int start_width, int width, int start_depth, int depth) {
+ for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
+ for (int w = 0; w < width; w++) {
+ Prefetch(src_map_.data(start_width + w, start_depth + d));
+ }
+ }
+ for (int d = 0; d < depth; d += kCellDepth) {
+ // The next loop's boundary is kKernelWidth, not width,
+ // because we always pack whole kernels so that the
+ // compute stage doesn't need to worry about unaligned kernel sizes.
+ for (int w = 0; w < +kKernelWidth; w += kCellWidth) {
+ PackUnalignedCell(start_width + w, start_depth + d);
+ }
+ }
+ }
+
+ private:
+ // The intermediate-level loops, between PackL2 and PackRun.
+ void PackL1(int start_width, int width, int start_depth, int depth) {
+ for (int w = 0; w < width; w += kKernelWidth) {
+ int ws = std::min(+kKernelWidth, width - w);
+ packed_side_block_->seek_run(start_width + w, start_depth);
+ PackRun(start_width + w, ws, start_depth, depth);
+ }
+ }
+
+ // Reference un-optimized implementation of the packing of a cell;
+ // also serves as a fallback to handle unaligned edges.
+ void PackUnalignedCell(int start_width, int start_depth) {
+ std::uint8_t* dst_ptr = packed_side_block_->current_data();
+ std::int32_t* dst_rank_one_update =
+ packed_side_block_->rank_one_update() + start_width;
+ std::int32_t dst_rank_one_update_multiplier =
+ packed_side_block_->rank_one_update_multiplier();
+
+ memset(dst_ptr, 0, sizeof(std::uint8_t) * CellFormat::kSize);
+
+ if (start_width < src_map_.width() && start_depth < src_map_.depth()) {
+ int width = std::min<int>(+kCellWidth, src_map_.width() - start_width);
+ int depth = std::min<int>(+kCellDepth, src_map_.depth() - start_depth);
+ auto src_block = src_map_.block(start_width, start_depth, width, depth);
+
+ for (int w = 0; w < width; w++) {
+ for (int d = 0; d < depth; d++) {
+ std::uint8_t s = src_block(w, d);
+ dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = s;
+ dst_rank_one_update[w] += s * dst_rank_one_update_multiplier;
+ }
+ }
+ }
+
+ packed_side_block_->seek_next_cell();
+ }
+
+ // The PackedSideBlock being packed, i.e. the 'destination'.
+ PackedSideBlock<KernelSideFormat>* const packed_side_block_;
+
+ // A map on the block of the original matrix block being packed,
+ // i.e. the 'source'.
+ const SrcMapType& src_map_;
+};
+
+// The packing code that we actually use. Defaults to using the above
+// generic code; optimized paths can be inserted by specializing this
+// template. See e.g. pack_neon.h.
+template <typename SrcMapType, typename KernelSideFormat>
+class PackSideBlockImpl
+ : public PackSideBlockImplGeneric<SrcMapType, KernelSideFormat> {
+ public:
+ typedef PackSideBlockImplGeneric<SrcMapType, KernelSideFormat> Base;
+
+ PackSideBlockImpl(PackedSideBlock<KernelSideFormat>* packed_side_block,
+ const SrcMapType& src_map)
+ : Base(packed_side_block, src_map) {}
+};
+
+// Packs a block of the input LHS matrix, into a PackedSideBlock
+template <typename KernelSideFormat, typename MatrixMapType>
+void PackLhs(PackedSideBlock<KernelSideFormat>* dst, const MatrixMapType& src) {
+ ScopedProfilingLabel label("pack LHS");
+ static_assert(MatrixMapType::kOrder == MapOrder::RowMajor,
+ "only row-major LHS is supported at the moment.");
+ static const SideMapOrder kSideMapOrder = SideMapOrder::WidthMajor;
+ typedef typename MatrixMapType::Scalar Scalar;
+ typedef SideMap<Scalar, kSideMapOrder> SideMapType;
+ SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
+ typedef PackSideBlockImpl<SideMapType, KernelSideFormat> ImplType;
+ ImplType impl(dst, src_side_map);
+ impl.PackL2();
+}
+
+// Packs a block of the input RHS matrix, into a PackedSideBlock
+template <typename KernelSideFormat, typename MatrixMapType>
+void PackRhs(PackedSideBlock<KernelSideFormat>* dst, const MatrixMapType& src) {
+ ScopedProfilingLabel label("pack RHS");
+ static_assert(MatrixMapType::kOrder == MapOrder::ColMajor,
+ "only col-major RHS is supported at the moment.");
+ static const SideMapOrder kSideMapOrder = SideMapOrder::WidthMajor;
+ typedef typename MatrixMapType::Scalar Scalar;
+ typedef SideMap<Scalar, kSideMapOrder> SideMapType;
+ SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
+ typedef PackSideBlockImpl<SideMapType, KernelSideFormat> ImplType;
+ ImplType impl(dst, src_side_map);
+ impl.PackL2();
+}
+
+} // namespace gemmlowp
+
+#ifdef GEMMLOWP_NEON
+#include "internal/pack_neon.h"
+#endif
+
+#endif // GEMMLOWP_INTERNAL_PACK_H_
diff --git a/internal/pack_neon.h b/internal/pack_neon.h
new file mode 100644
index 0000000..c437489
--- /dev/null
+++ b/internal/pack_neon.h
@@ -0,0 +1,938 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// pack_neon.h: optimized NEON specializations of the templates in pack.h.
+
+#ifndef GEMMLOWP_INTERNAL_PACK_NEON_H_
+#define GEMMLOWP_INTERNAL_PACK_NEON_H_
+
+#include "internal/pack.h"
+
+#include <arm_neon.h>
+
+namespace gemmlowp {
+
+// Specialization for 3 Cells of width 4, depth 2.
+// This is the LHS format used by NEONKernel12x4Depth2.
+typedef KernelSideFormat<CellFormat<4, 2>, 3> SideFormat3Cells4x2;
+template <typename SrcMapType>
+class PackSideBlockImpl<SrcMapType, SideFormat3Cells4x2>
+ : public PackSideBlockImplGeneric<SrcMapType, SideFormat3Cells4x2> {
+ public:
+ typedef SideFormat3Cells4x2 SideFormat;
+ typedef PackSideBlockImplGeneric<SrcMapType, SideFormat> Base;
+
+ PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block,
+ const SrcMapType& src_map)
+ : Base(packed_side_block, src_map) {}
+
+ protected:
+ static const int KernelRows = SideFormat::kWidth;
+
+ virtual void PackRun(int start_width, int width, int start_depth, int depth) {
+ // Fall back to generic path for packing too narrow runs.
+ if (width < SideFormat::kWidth) {
+ Base::PackRun(start_width, width, start_depth, depth);
+ return;
+ }
+
+ const std::uint8_t* src_ptr =
+ Base::src_map().data(start_width, start_depth);
+ const int stride = Base::src_map().stride();
+ assert(src_ptr + stride ==
+ Base::src_map().data(start_width + 1, start_depth));
+ assert(src_ptr + 1 == Base::src_map().data(start_width, start_depth + 1));
+
+ // Prefetch data.
+ for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
+ for (int i = 0; i < KernelRows; i++) {
+ Prefetch(src_ptr + i * stride + d);
+ }
+ }
+
+ const int AlignedDepth16 = RoundDown<16>(depth);
+ if (AlignedDepth16) {
+ // Fast inner loop for handling multiples of 16 levels of depth
+ ScopedProfilingLabel label("optimized kernel");
+
+ std::int32_t* rank_one_update_ptr =
+ Base::packed_side_block()->rank_one_update() + start_width;
+
+ std::uint8_t* dst_ptr = Base::packed_side_block()->current_data();
+ const std::uint8_t* dst_end_ptr = dst_ptr + KernelRows * AlignedDepth16;
+
+ __attribute__((aligned(32))) std::uint8_t buf[KernelRows * 16];
+ __attribute__((aligned(16))) std::int32_t sumsbuf[12];
+ asm volatile(
+ "mov r4, %[src_ptr]\n"
+ "mov r3, %[dst_ptr]\n"
+
+ // We will accumulate the rank_one_update sums in q12--q14.
+ "vmov.s32 q12, #0\n"
+ "vmov.s32 q13, q12\n"
+ "vmov.s32 q14, q12\n"
+
+ // Main loop.
+ "loop_PackSideBlockImplNEON4x3x2_%=:\n"
+
+ // Load a 12x16 block into the 12 registers q0--q11.
+ // So each of these 12 registers contains 16 entries of
+ // one line, and there are 12 lines being processed.
+ "mov r0, r4\n"
+ "add r4, #16\n"
+ "vld1.8 {d0,d1}, [r0], %[stride]\n"
+ "vld1.8 {d2,d3}, [r0], %[stride]\n"
+ "vld1.8 {d4,d5}, [r0], %[stride]\n"
+ "vld1.8 {d6,d7}, [r0], %[stride]\n"
+ "vld1.8 {d8,d9}, [r0], %[stride]\n"
+ "vld1.8 {d10,d11}, [r0], %[stride]\n"
+ "vld1.8 {d12,d13}, [r0], %[stride]\n"
+ "vld1.8 {d14,d15}, [r0], %[stride]\n"
+ "vld1.8 {d16,d17}, [r0], %[stride]\n"
+ "vld1.8 {d18,d19}, [r0], %[stride]\n"
+ "vld1.8 {d20,d21}, [r0], %[stride]\n"
+ "vld1.8 {d22,d23}, [r0], %[stride]\n"
+
+ // The CellOrder is the opposite of the MapOrder here
+ // so we need to transpose the data in each cell.
+ // We do so using an auxiliary buffer and the vst4 instruction,
+ // which takes 4 registers and stores them interleaved.
+ "mov r1, %[buf]\n"
+ "vst4.8 {d0, d2, d4, d6}, [r1:256]!\n"
+ "vst4.8 {d1, d3, d5, d7}, [r1:256]!\n"
+ "vst4.8 {d8, d10, d12, d14}, [r1:256]!\n"
+ "vst4.8 {d9, d11, d13, d15}, [r1:256]!\n"
+ "vst4.8 {d16, d18, d20, d22}, [r1:256]!\n"
+ "vst4.8 {d17, d19, d21, d23}, [r1:256]!\n"
+
+ // Reload the data from our auxiliary buffer back into
+          // the same registers; it is now transposed so each of the
+ // 24 d-registers from d0 to d23 now contains one cell,
+ // just not yet in the right sequence. We will still have
+ // to permute those cells.
+ "mov r1, %[buf]\n"
+ "vld1.8 {d0,d1,d2,d3}, [r1:256]!\n"
+ "vld1.8 {d4,d5,d6,d7}, [r1:256]!\n"
+ "vld1.8 {d8,d9,d10,d11}, [r1:256]!\n"
+ "vld1.8 {d12,d13,d14,d15}, [r1:256]!\n"
+ "vld1.8 {d16,d17,d18,d19}, [r1:256]!\n"
+ "vld1.8 {d20,d21,d22,d23}, [r1:256]!\n"
+
+ // Store these cells back to memory, now in the right
+ // sequence.
+ "vst1.8 d0, [r3:64]!\n"
+ "vst1.8 d8, [r3:64]!\n"
+ "vst1.8 d16, [r3:64]!\n"
+ "vst1.8 d1, [r3:64]!\n"
+ "vst1.8 d9, [r3:64]!\n"
+ "vst1.8 d17, [r3:64]!\n"
+ "vst1.8 d2, [r3:64]!\n"
+ "vst1.8 d10, [r3:64]!\n"
+ "vst1.8 d18, [r3:64]!\n"
+ "vst1.8 d3, [r3:64]!\n"
+ "vst1.8 d11, [r3:64]!\n"
+ "vst1.8 d19, [r3:64]!\n"
+ "vst1.8 d4, [r3:64]!\n"
+ "vst1.8 d12, [r3:64]!\n"
+ "vst1.8 d20, [r3:64]!\n"
+ "vst1.8 d5, [r3:64]!\n"
+ "vst1.8 d13, [r3:64]!\n"
+ "vst1.8 d21, [r3:64]!\n"
+ "vst1.8 d6, [r3:64]!\n"
+ "vst1.8 d14, [r3:64]!\n"
+ "vst1.8 d22, [r3:64]!\n"
+ "vst1.8 d7, [r3:64]!\n"
+ "vst1.8 d15, [r3:64]!\n"
+ "vst1.8 d23, [r3:64]!\n"
+
+ // Now we are done packing this 12x16 block. We still have
+ // to accumulate the rank-one-update sums for it.
+
+ // Add 8-bit values pair-wise into 16-bit values.
+ "vaddl.u8 q0, d0, d1\n"
+ "vaddl.u8 q1, d2, d3\n"
+ "vaddl.u8 q2, d4, d5\n"
+ "vaddl.u8 q3, d6, d7\n"
+ "vaddl.u8 q4, d8, d9\n"
+ "vaddl.u8 q5, d10, d11\n"
+ "vaddl.u8 q6, d12, d13\n"
+ "vaddl.u8 q7, d14, d15\n"
+ "vaddl.u8 q8, d16, d17\n"
+ "vaddl.u8 q9, d18, d19\n"
+ "vaddl.u8 q10, d20, d21\n"
+ "vaddl.u8 q11, d22, d23\n"
+
+ // Add 16-bit values pair-wise into 32-bit values.
+ "vaddl.u16 q0, d0, d1\n"
+ "vaddl.u16 q1, d2, d3\n"
+ "vaddl.u16 q2, d4, d5\n"
+ "vaddl.u16 q3, d6, d7\n"
+ "vaddl.u16 q4, d8, d9\n"
+ "vaddl.u16 q5, d10, d11\n"
+ "vaddl.u16 q6, d12, d13\n"
+ "vaddl.u16 q7, d14, d15\n"
+ "vaddl.u16 q8, d16, d17\n"
+ "vaddl.u16 q9, d18, d19\n"
+ "vaddl.u16 q10, d20, d21\n"
+ "vaddl.u16 q11, d22, d23\n"
+
+ // Accumulate the 32-bit sums into our accumulators q12--q14.
+ "vadd.s32 q12, q12, q0\n"
+ "vadd.s32 q13, q13, q4\n"
+ "vadd.s32 q14, q14, q8\n"
+ "vadd.s32 q12, q12, q1\n"
+ "vadd.s32 q13, q13, q5\n"
+ "vadd.s32 q14, q14, q9\n"
+ "vadd.s32 q12, q12, q2\n"
+ "vadd.s32 q13, q13, q6\n"
+ "vadd.s32 q14, q14, q10\n"
+ "vadd.s32 q12, q12, q3\n"
+ "vadd.s32 q13, q13, q7\n"
+ "vadd.s32 q14, q14, q11\n"
+
+ // End of main loop.
+ "cmp r3, %[dst_end_ptr]\n"
+ "bne loop_PackSideBlockImplNEON4x3x2_%=\n"
+
+ // Store our rank-one-update accumulator registers to the
+ // sums buffer.
+ "mov r0, %[sumsbuf]\n"
+ "vst1.32 {d24, d25}, [r0:128]!\n"
+ "vst1.32 {d26, d27}, [r0:128]!\n"
+ "vst1.32 {d28, d29}, [r0:128]!\n"
+
+ : // no outputs
+ : // inputs
+ [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr),
+ [dst_end_ptr] "r"(dst_end_ptr), [stride] "r"(stride), [buf] "r"(buf),
+ [sumsbuf] "r"(sumsbuf)
+ : // clobbers
+ "cc", "memory", "r0", "r1", "r3", "r4",
+          // note: quad registers are reportedly unsupported in the clobber
+          // list, so we list the d registers individually instead.
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
+ "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
+ "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
+ "d31");
+
+ // Accumulate the final rank_one_update vector.
+ int32x4x3_t sums;
+ sums.val[0] = vld1q_s32(sumsbuf);
+ sums.val[1] = vld1q_s32(sumsbuf + 4);
+ sums.val[2] = vld1q_s32(sumsbuf + 8);
+ sums.val[0] = vmulq_n_s32(
+ sums.val[0], Base::packed_side_block()->rank_one_update_multiplier());
+ sums.val[1] = vmulq_n_s32(
+ sums.val[1], Base::packed_side_block()->rank_one_update_multiplier());
+ sums.val[2] = vmulq_n_s32(
+ sums.val[2], Base::packed_side_block()->rank_one_update_multiplier());
+
+ int32x4x3_t old_sums;
+ old_sums.val[0] = vld1q_s32(rank_one_update_ptr + 0);
+ old_sums.val[1] = vld1q_s32(rank_one_update_ptr + 4);
+ old_sums.val[2] = vld1q_s32(rank_one_update_ptr + 8);
+ sums.val[0] = vaddq_s32(sums.val[0], old_sums.val[0]);
+ sums.val[1] = vaddq_s32(sums.val[1], old_sums.val[1]);
+ sums.val[2] = vaddq_s32(sums.val[2], old_sums.val[2]);
+ vst1q_s32(rank_one_update_ptr + 0, sums.val[0]);
+ vst1q_s32(rank_one_update_ptr + 4, sums.val[1]);
+ vst1q_s32(rank_one_update_ptr + 8, sums.val[2]);
+ }
+
+ // We are done handling groups of 16 levels of depth; there may be
+ // a leftover for which we use the generic path.
+ Base::packed_side_block()->seek_forward_n_cells(
+ SideFormat::kCells * AlignedDepth16 / SideFormat::kDepth);
+ Base::PackRun(start_width, width, start_depth + AlignedDepth16,
+ depth - AlignedDepth16);
+ }
+};
+
+// Specialization for 5 Cells of width 4, depth 4.
+// This is the LHS format used by NEONKernel20x1Depth4.
+typedef KernelSideFormat<CellFormat<4, 4>, 5> SideFormat5Cells4x4;
+template <typename LhsMapType>
+class PackSideBlockImpl<LhsMapType, SideFormat5Cells4x4>
+ : public PackSideBlockImplGeneric<LhsMapType, SideFormat5Cells4x4> {
+ public:
+ typedef SideFormat5Cells4x4 SideFormat;
+ typedef PackSideBlockImplGeneric<LhsMapType, SideFormat> Base;
+
+ PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block,
+ const LhsMapType& src_map)
+ : Base(packed_side_block, src_map) {}
+
+ protected:
+ virtual void PackRun(int start_width, int width, int start_depth, int depth) {
+    // Fall back to the generic path for runs that are too narrow.
+ if (width < SideFormat::kWidth) {
+ Base::PackRun(start_width, width, start_depth, depth);
+ return;
+ }
+
+ const std::uint8_t* src_ptr =
+ Base::src_map().data(start_width, start_depth);
+ const int stride = Base::src_map().stride();
+ assert(src_ptr + stride ==
+ Base::src_map().data(start_width + 1, start_depth));
+ assert(src_ptr + 1 == Base::src_map().data(start_width, start_depth + 1));
+
+ // Prefetch data.
+ for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
+ for (int i = 0; i < SideFormat::kWidth; i++) {
+ Prefetch(src_ptr + i * stride + d);
+ }
+ }
+
+ const int AlignedDepth8 = RoundDown<8>(depth);
+ if (AlignedDepth8) {
+ // Fast inner loop for handling multiples of 8 levels of depth
+ ScopedProfilingLabel label("optimized kernel");
+
+ std::int32_t* rank_one_update_ptr =
+ Base::packed_side_block()->rank_one_update() + start_width;
+
+ std::uint8_t* dst_ptr = Base::packed_side_block()->current_data();
+ const std::uint8_t* dst_end_ptr =
+ dst_ptr + SideFormat::kWidth * AlignedDepth8;
+
+ __attribute__((aligned(32))) std::uint8_t buf[SideFormat::kWidth * 8];
+ __attribute__((aligned(16))) std::int32_t sumsbuf[20];
+ asm volatile(
+ "mov r4, %[src_ptr]\n"
+ "mov r3, %[dst_ptr]\n"
+
+ // We will accumulate the rank_one_update sums in q10--q14.
+ "vmov.s32 q10, #0\n"
+ "vmov.s32 q11, q10\n"
+ "vmov.s32 q12, q10\n"
+ "vmov.s32 q13, q10\n"
+ "vmov.s32 q14, q10\n"
+
+ // Main loop.
+ "loop_PackSideBlockImplNEON4x5x4_%=:\n"
+
+ // Load a 20x8 block into the 20 registers d0--d19.
+ // So each of these 20 registers contains 8 entries of
+ // one line, and there are 20 lines being processed.
+ "mov r0, r4\n"
+ "add r4, #8\n"
+ "vld1.8 d0, [r0], %[stride]\n"
+ "vld1.8 d1, [r0], %[stride]\n"
+ "vld1.8 d2, [r0], %[stride]\n"
+ "vld1.8 d3, [r0], %[stride]\n"
+ "vld1.8 d4, [r0], %[stride]\n"
+ "vld1.8 d5, [r0], %[stride]\n"
+ "vld1.8 d6, [r0], %[stride]\n"
+ "vld1.8 d7, [r0], %[stride]\n"
+ "vld1.8 d8, [r0], %[stride]\n"
+ "vld1.8 d9, [r0], %[stride]\n"
+ "vld1.8 d10, [r0], %[stride]\n"
+ "vld1.8 d11, [r0], %[stride]\n"
+ "vld1.8 d12, [r0], %[stride]\n"
+ "vld1.8 d13, [r0], %[stride]\n"
+ "vld1.8 d14, [r0], %[stride]\n"
+ "vld1.8 d15, [r0], %[stride]\n"
+ "vld1.8 d16, [r0], %[stride]\n"
+ "vld1.8 d17, [r0], %[stride]\n"
+ "vld1.8 d18, [r0], %[stride]\n"
+ "vld1.8 d19, [r0], %[stride]\n"
+
+ // The CellOrder is the opposite of the MapOrder here
+ // so we need to transpose the data in each cell.
+ // We do so using an auxiliary buffer and the vst4 instruction,
+ // which takes 4 registers and stores them interleaved.
+ "mov r1, %[buf]\n"
+ "vst4.8 {d0, d1, d2, d3}, [r1:256]!\n"
+ "vst4.8 {d4, d5, d6, d7}, [r1:256]!\n"
+ "vst4.8 {d8, d9, d10, d11}, [r1:256]!\n"
+ "vst4.8 {d12, d13, d14, d15}, [r1:256]!\n"
+ "vst4.8 {d16, d17, d18, d19}, [r1:256]!\n"
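+          // At this point buf holds the 20x8 block with each 4-row group
+          // transposed: buf[32*g + 4*d + w] is the entry of row 4*g + w at
+          // depth level d (g in 0..4, d in 0..7, w in 0..3).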
+
+ // Reload the data from our auxiliary buffer back into
+          // the same registers; it is now transposed so each of the
+ // 20 d-registers from d0 to d19 now contains half of one cell,
+ // just not yet in the right sequence. We will still have
+ // to permute those cells.
+ "mov r1, %[buf]\n"
+ "vld1.8 {d0,d1,d2,d3}, [r1:256]!\n"
+ "vld1.8 {d4,d5,d6,d7}, [r1:256]!\n"
+ "vld1.8 {d8,d9,d10,d11}, [r1:256]!\n"
+ "vld1.8 {d12,d13,d14,d15}, [r1:256]!\n"
+ "vld1.8 {d16,d17,d18,d19}, [r1:256]!\n"
+
+ // Store these cells back to memory, now in the right
+ // sequence.
+ "vst1.8 d0, [r3:64]!\n"
+ "vst1.8 d1, [r3:64]!\n"
+ "vst1.8 d4, [r3:64]!\n"
+ "vst1.8 d5, [r3:64]!\n"
+ "vst1.8 d8, [r3:64]!\n"
+ "vst1.8 d9, [r3:64]!\n"
+ "vst1.8 d12, [r3:64]!\n"
+ "vst1.8 d13, [r3:64]!\n"
+ "vst1.8 d16, [r3:64]!\n"
+ "vst1.8 d17, [r3:64]!\n"
+ "vst1.8 d2, [r3:64]!\n"
+ "vst1.8 d3, [r3:64]!\n"
+ "vst1.8 d6, [r3:64]!\n"
+ "vst1.8 d7, [r3:64]!\n"
+ "vst1.8 d10, [r3:64]!\n"
+ "vst1.8 d11, [r3:64]!\n"
+ "vst1.8 d14, [r3:64]!\n"
+ "vst1.8 d15, [r3:64]!\n"
+ "vst1.8 d18, [r3:64]!\n"
+ "vst1.8 d19, [r3:64]!\n"
+
+ // Now we are done packing this 20x8 block. We still have
+ // to accumulate the rank-one-update sums for it.
+
+ // Add 8-bit values pair-wise into 16-bit values.
+ "vaddl.u8 q0, d0, d1\n"
+ "vaddl.u8 q1, d2, d3\n"
+ "vaddl.u8 q2, d4, d5\n"
+ "vaddl.u8 q3, d6, d7\n"
+ "vaddl.u8 q4, d8, d9\n"
+ "vaddl.u8 q5, d10, d11\n"
+ "vaddl.u8 q6, d12, d13\n"
+ "vaddl.u8 q7, d14, d15\n"
+ "vaddl.u8 q8, d16, d17\n"
+ "vaddl.u8 q9, d18, d19\n"
+
+ // Add 16-bit values pair-wise into 32-bit values.
+ "vaddl.u16 q0, d0, d1\n"
+ "vaddl.u16 q1, d2, d3\n"
+ "vaddl.u16 q2, d4, d5\n"
+ "vaddl.u16 q3, d6, d7\n"
+ "vaddl.u16 q4, d8, d9\n"
+ "vaddl.u16 q5, d10, d11\n"
+ "vaddl.u16 q6, d12, d13\n"
+ "vaddl.u16 q7, d14, d15\n"
+ "vaddl.u16 q8, d16, d17\n"
+ "vaddl.u16 q9, d18, d19\n"
+
+ // Accumulate the 32-bit sums into our accumulators q10--q14.
+ "vadd.s32 q10, q10, q0\n"
+ "vadd.s32 q11, q11, q2\n"
+ "vadd.s32 q12, q12, q4\n"
+ "vadd.s32 q13, q13, q6\n"
+ "vadd.s32 q14, q14, q8\n"
+ "vadd.s32 q10, q10, q1\n"
+ "vadd.s32 q11, q11, q3\n"
+ "vadd.s32 q12, q12, q5\n"
+ "vadd.s32 q13, q13, q7\n"
+ "vadd.s32 q14, q14, q9\n"
+
+ // End of main loop.
+ "cmp r3, %[dst_end_ptr]\n"
+ "bne loop_PackSideBlockImplNEON4x5x4_%=\n"
+
+ // Store our rank-one-update accumulator registers to the
+ // sums buffer.
+ "mov r0, %[sumsbuf]\n"
+ "vst1.32 {d20, d21}, [r0:128]!\n"
+ "vst1.32 {d22, d23}, [r0:128]!\n"
+ "vst1.32 {d24, d25}, [r0:128]!\n"
+ "vst1.32 {d26, d27}, [r0:128]!\n"
+ "vst1.32 {d28, d29}, [r0:128]!\n"
+ : // no outputs
+ : // inputs
+ [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr),
+ [dst_end_ptr] "r"(dst_end_ptr), [stride] "r"(stride), [buf] "r"(buf),
+ [sumsbuf] "r"(sumsbuf)
+ : // clobbers
+ "cc", "memory", "r0", "r1", "r3", "r4",
+          // note: quad (q) registers are reportedly not accepted in clobber
+          // lists by some compilers, so we list the d registers instead.
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
+ "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
+ "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
+ "d31");
+
+ // Accumulate the final rank_one_update vector.
+ int32x4_t sums[5];
+ sums[0] = vld1q_s32(sumsbuf);
+ sums[1] = vld1q_s32(sumsbuf + 4);
+ sums[2] = vld1q_s32(sumsbuf + 8);
+ sums[3] = vld1q_s32(sumsbuf + 12);
+ sums[4] = vld1q_s32(sumsbuf + 16);
+
+ sums[0] = vmulq_n_s32(
+ sums[0], Base::packed_side_block()->rank_one_update_multiplier());
+ sums[1] = vmulq_n_s32(
+ sums[1], Base::packed_side_block()->rank_one_update_multiplier());
+ sums[2] = vmulq_n_s32(
+ sums[2], Base::packed_side_block()->rank_one_update_multiplier());
+ sums[3] = vmulq_n_s32(
+ sums[3], Base::packed_side_block()->rank_one_update_multiplier());
+ sums[4] = vmulq_n_s32(
+ sums[4], Base::packed_side_block()->rank_one_update_multiplier());
+
+ int32x4_t old_sums[5];
+ old_sums[0] = vld1q_s32(rank_one_update_ptr + 0);
+ old_sums[1] = vld1q_s32(rank_one_update_ptr + 4);
+ old_sums[2] = vld1q_s32(rank_one_update_ptr + 8);
+ old_sums[3] = vld1q_s32(rank_one_update_ptr + 12);
+ old_sums[4] = vld1q_s32(rank_one_update_ptr + 16);
+
+ sums[0] = vaddq_s32(sums[0], old_sums[0]);
+ sums[1] = vaddq_s32(sums[1], old_sums[1]);
+ sums[2] = vaddq_s32(sums[2], old_sums[2]);
+ sums[3] = vaddq_s32(sums[3], old_sums[3]);
+ sums[4] = vaddq_s32(sums[4], old_sums[4]);
+
+ vst1q_s32(rank_one_update_ptr + 0, sums[0]);
+ vst1q_s32(rank_one_update_ptr + 4, sums[1]);
+ vst1q_s32(rank_one_update_ptr + 8, sums[2]);
+ vst1q_s32(rank_one_update_ptr + 12, sums[3]);
+ vst1q_s32(rank_one_update_ptr + 16, sums[4]);
+ }
+
+ // We are done handling groups of 8 levels of depth; there may be
+ // a leftover for which we use the generic path.
+ Base::packed_side_block()->seek_forward_n_cells(
+ SideFormat::kCells * AlignedDepth8 / SideFormat::kDepth);
+ Base::PackRun(start_width, width, start_depth + AlignedDepth8,
+ depth - AlignedDepth8);
+ }
+};
+
+// Specialization for 1 Cell of width 8, depth 4.
+// This is the LHS format used by NEONKernel8x1Depth4.
+typedef KernelSideFormat<CellFormat<8, 4>, 1> SideFormat1Cell8x4;
+template <typename LhsMapType>
+class PackSideBlockImpl<LhsMapType, SideFormat1Cell8x4>
+ : public PackSideBlockImplGeneric<LhsMapType, SideFormat1Cell8x4> {
+ public:
+ typedef SideFormat1Cell8x4 SideFormat;
+
+ typedef PackSideBlockImplGeneric<LhsMapType, SideFormat> Base;
+
+ PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block,
+ const LhsMapType& src_map)
+ : Base(packed_side_block, src_map) {}
+
+ protected:
+ virtual void PackRun(int start_width, int width, int start_depth, int depth) {
+    // Fall back to the generic path for runs that are too narrow.
+ if (width < SideFormat::kWidth) {
+ Base::PackRun(start_width, width, start_depth, depth);
+ return;
+ }
+
+ const std::uint8_t* src_ptr =
+ Base::src_map().data(start_width, start_depth);
+ const int stride = Base::src_map().stride();
+ assert(src_ptr + stride ==
+ Base::src_map().data(start_width + 1, start_depth));
+ assert(src_ptr + 1 == Base::src_map().data(start_width, start_depth + 1));
+
+ // Prefetch data.
+ for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
+ for (int i = 0; i < SideFormat::kWidth; i++) {
+ Prefetch(src_ptr + i * stride + d);
+ }
+ }
+
+ const int AlignedDepth8 = RoundDown<8>(depth);
+ if (AlignedDepth8) {
+ // Fast inner loop for handling multiples of 8 levels of depth
+ ScopedProfilingLabel label("optimized kernel");
+
+ std::int32_t* rank_one_update_ptr =
+ Base::packed_side_block()->rank_one_update() + start_width;
+
+ std::uint8_t* dst_ptr = Base::packed_side_block()->current_data();
+ const std::uint8_t* dst_end_ptr =
+ dst_ptr + SideFormat::kWidth * AlignedDepth8;
+
+ __attribute__((aligned(16))) std::int32_t sumsbuf[8];
+ asm volatile(
+ "mov r4, %[src_ptr]\n"
+ "mov r3, %[dst_ptr]\n"
+
+ // We will accumulate the rank_one_update sums in q10--q11.
+ "vmov.s32 q10, #0\n"
+ "vmov.s32 q11, q10\n"
+
+ // Main loop.
+ "loop_PackSideBlockImplNEON8x1x4_%=:\n"
+
+            // Load an 8x8 block into the 8 registers d0--d7.
+ // So each of these 8 registers contains 8 entries of
+ // one line, and there are 8 lines being processed.
+ "mov r0, r4\n"
+ "add r4, #8\n"
+ "vld1.8 d0, [r0], %[stride]\n"
+ "vld1.8 d1, [r0], %[stride]\n"
+ "vld1.8 d2, [r0], %[stride]\n"
+ "vld1.8 d3, [r0], %[stride]\n"
+ "vld1.8 d4, [r0], %[stride]\n"
+ "vld1.8 d5, [r0], %[stride]\n"
+ "vld1.8 d6, [r0], %[stride]\n"
+ "vld1.8 d7, [r0], %[stride]\n"
+
+ // The CellOrder is the opposite of the MapOrder here
+ // so we need to transpose the data in each cell.
+ // Fortunately in this case, since what we have to transpose
+          // is an 8x8 block of 8-bit values, we can do so in-place with
+ // vtrn instructions; no need for an auxiliary buffer here.
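+          // The vtrn.8 / vtrn.16 / vtrn.32 sequence below is the classic
+          // three-stage 8x8 byte transpose: it swaps 2x2 blocks of bytes,
+          // then 2x2 blocks of 16-bit pairs, then 2x2 blocks of 32-bit
+          // quadruplets.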
+ "vtrn.8 d0, d1\n"
+ "vtrn.8 d2, d3\n"
+ "vtrn.8 d4, d5\n"
+ "vtrn.8 d6, d7\n"
+ "vtrn.16 q0, q1\n"
+ "vtrn.16 q2, q3\n"
+ "vtrn.32 q0, q2\n"
+ "vtrn.32 q1, q3\n"
+
+ // Store the packed data to memory.
+ "vst1.8 {d0, d1, d2, d3}, [r3:64]!\n"
+ "vst1.8 {d4, d5, d6, d7}, [r3:64]!\n"
+
+            // Now we are done packing this 8x8 block. We still have
+ // to accumulate the rank-one-update sums for it.
+
+ // Add 8-bit values pair-wise into 16-bit values.
+ "vaddl.u8 q0, d0, d1\n"
+ "vaddl.u8 q1, d2, d3\n"
+ "vaddl.u8 q2, d4, d5\n"
+ "vaddl.u8 q3, d6, d7\n"
+
+ // Add these 16-bit values into the 32-bit accumulators q10--q11.
+ "vaddw.u16 q10, q10, d0\n"
+ "vaddw.u16 q11, q11, d1\n"
+ "vaddw.u16 q10, q10, d2\n"
+ "vaddw.u16 q11, q11, d3\n"
+ "vaddw.u16 q10, q10, d4\n"
+ "vaddw.u16 q11, q11, d5\n"
+ "vaddw.u16 q10, q10, d6\n"
+ "vaddw.u16 q11, q11, d7\n"
+
+ // End of main loop.
+ "cmp r3, %[dst_end_ptr]\n"
+ "bne loop_PackSideBlockImplNEON8x1x4_%=\n"
+
+ // Store our rank-one-update accumulator registers to the
+ // sums buffer.
+ "mov r0, %[sumsbuf]\n"
+ "vst1.32 {d20, d21}, [r0:128]!\n"
+ "vst1.32 {d22, d23}, [r0:128]!\n"
+ : // no outputs
+ : // inputs
+ [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr),
+ [dst_end_ptr] "r"(dst_end_ptr), [stride] "r"(stride),
+ [sumsbuf] "r"(sumsbuf)
+ : // clobbers
+ "cc", "memory", "r0", "r3", "r4",
+          // note: quad (q) registers are reportedly not accepted in clobber
+          // lists by some compilers, so we list the d registers instead.
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
+ "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
+ "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
+ "d31");
+
+ // Accumulate the final rank_one_update vector.
+ int32x4_t sums[2];
+ sums[0] = vld1q_s32(sumsbuf);
+ sums[1] = vld1q_s32(sumsbuf + 4);
+
+ sums[0] = vmulq_n_s32(
+ sums[0], Base::packed_side_block()->rank_one_update_multiplier());
+ sums[1] = vmulq_n_s32(
+ sums[1], Base::packed_side_block()->rank_one_update_multiplier());
+
+ int32x4_t old_sums[2];
+ old_sums[0] = vld1q_s32(rank_one_update_ptr + 0);
+ old_sums[1] = vld1q_s32(rank_one_update_ptr + 4);
+
+ sums[0] = vaddq_s32(sums[0], old_sums[0]);
+ sums[1] = vaddq_s32(sums[1], old_sums[1]);
+
+ vst1q_s32(rank_one_update_ptr + 0, sums[0]);
+ vst1q_s32(rank_one_update_ptr + 4, sums[1]);
+ }
+
+ // We are done handling groups of 8 levels of depth; there may be
+ // a leftover for which we use the generic path.
+ Base::packed_side_block()->seek_forward_n_cells(
+ SideFormat::kCells * AlignedDepth8 / SideFormat::kDepth);
+ Base::PackRun(start_width, width, start_depth + AlignedDepth8,
+ depth - AlignedDepth8);
+ }
+};
+
+// Partial specialization for 1 Cell of width 4, and any Depth.
+// This is the RHS format used by NEONKernel12x4Depth2.
+template <int Depth>
+using SideFormat1Cell4xD = KernelSideFormat<CellFormat<4, Depth>, 1>;
+template <typename RhsMapType, int Depth>
+class PackSideBlockImpl<RhsMapType, SideFormat1Cell4xD<Depth>>
+ : public PackSideBlockImplGeneric<RhsMapType, SideFormat1Cell4xD<Depth>> {
+ public:
+ typedef SideFormat1Cell4xD<Depth> SideFormat;
+
+ typedef PackSideBlockImplGeneric<RhsMapType, SideFormat> Base;
+
+ PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block,
+ const RhsMapType& src_map)
+ : Base(packed_side_block, src_map) {}
+
+ protected:
+ virtual void PackRun(int start_width, int width, int start_depth, int depth) {
+ ScopedProfilingLabel label("optimized kernel");
+
+ std::int32_t* rank_one_update_ptr =
+ Base::packed_side_block()->rank_one_update() + start_width;
+ const int AlignedDepth16 = RoundDown<16>(depth);
+ const std::uint8_t* src_line0ptr =
+ width < 1 ? nullptr : Base::src_map().data(start_width, start_depth);
+ const std::uint8_t* src_line1ptr =
+ width < 2 ? nullptr
+ : Base::src_map().data(start_width + 1, start_depth);
+ const std::uint8_t* src_line2ptr =
+ width < 3 ? nullptr
+ : Base::src_map().data(start_width + 2, start_depth);
+ const std::uint8_t* src_line3ptr =
+ width < 4 ? nullptr
+ : Base::src_map().data(start_width + 3, start_depth);
+ std::uint8_t* dst_ptr = Base::packed_side_block()->current_data();
+ const std::uint8_t* dst_end_ptr =
+ dst_ptr + SideFormat::kWidth * AlignedDepth16;
+ int32x4x4_t local_col_sums;
+ local_col_sums.val[0] = vdupq_n_s32(0);
+ local_col_sums.val[1] = vdupq_n_s32(0);
+ local_col_sums.val[2] = vdupq_n_s32(0);
+ local_col_sums.val[3] = vdupq_n_s32(0);
+
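+    // Each loop iteration below loads 16 depth levels from each of the
+    // (up to) 4 source lines and interleaves them with vst4q_u8, so that
+    // dst_ptr[4 * d + w] receives the entry of line w at depth level d.
+    // Missing lines (when width < 4) are replaced by zeros, and column sums
+    // for the rank-one update are accumulated along the way.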
+ switch (width) {
+ case 4:
+ while (dst_ptr != dst_end_ptr) {
+ uint8x16x4_t src_regs;
+ src_regs.val[0] = vld1q_u8(src_line0ptr);
+ src_regs.val[1] = vld1q_u8(src_line1ptr);
+ src_regs.val[2] = vld1q_u8(src_line2ptr);
+ src_regs.val[3] = vld1q_u8(src_line3ptr);
+ uint16x8x4_t s;
+ s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]),
+ vget_high_u8(src_regs.val[0]));
+ s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]),
+ vget_high_u8(src_regs.val[1]));
+ s.val[2] = vaddl_u8(vget_low_u8(src_regs.val[2]),
+ vget_high_u8(src_regs.val[2]));
+ s.val[3] = vaddl_u8(vget_low_u8(src_regs.val[3]),
+ vget_high_u8(src_regs.val[3]));
+ local_col_sums.val[0] =
+ vaddq_s32(local_col_sums.val[0],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[0]), vget_high_u16(s.val[0]))));
+ local_col_sums.val[1] =
+ vaddq_s32(local_col_sums.val[1],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[1]), vget_high_u16(s.val[1]))));
+ local_col_sums.val[2] =
+ vaddq_s32(local_col_sums.val[2],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[2]), vget_high_u16(s.val[2]))));
+ local_col_sums.val[3] =
+ vaddq_s32(local_col_sums.val[3],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[3]), vget_high_u16(s.val[3]))));
+ vst4q_u8(dst_ptr, src_regs);
+ src_line0ptr += 16;
+ src_line1ptr += 16;
+ src_line2ptr += 16;
+ src_line3ptr += 16;
+ dst_ptr += SideFormat::kWidth * 16;
+ }
+ break;
+ case 3:
+ while (dst_ptr != dst_end_ptr) {
+ uint8x16x4_t src_regs;
+ src_regs.val[0] = vld1q_u8(src_line0ptr);
+ src_regs.val[1] = vld1q_u8(src_line1ptr);
+ src_regs.val[2] = vld1q_u8(src_line2ptr);
+ src_regs.val[3] = vdupq_n_u8(0);
+ uint16x8x3_t s;
+ s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]),
+ vget_high_u8(src_regs.val[0]));
+ s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]),
+ vget_high_u8(src_regs.val[1]));
+ s.val[2] = vaddl_u8(vget_low_u8(src_regs.val[2]),
+ vget_high_u8(src_regs.val[2]));
+ local_col_sums.val[0] =
+ vaddq_s32(local_col_sums.val[0],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[0]), vget_high_u16(s.val[0]))));
+ local_col_sums.val[1] =
+ vaddq_s32(local_col_sums.val[1],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[1]), vget_high_u16(s.val[1]))));
+ local_col_sums.val[2] =
+ vaddq_s32(local_col_sums.val[2],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[2]), vget_high_u16(s.val[2]))));
+ vst4q_u8(dst_ptr, src_regs);
+ src_line0ptr += 16;
+ src_line1ptr += 16;
+ src_line2ptr += 16;
+ dst_ptr += SideFormat::kWidth * 16;
+ }
+ break;
+ case 2:
+ while (dst_ptr != dst_end_ptr) {
+ uint8x16x4_t src_regs;
+ src_regs.val[0] = vld1q_u8(src_line0ptr);
+ src_regs.val[1] = vld1q_u8(src_line1ptr);
+ src_regs.val[2] = vdupq_n_u8(0);
+ src_regs.val[3] = vdupq_n_u8(0);
+ uint16x8x2_t s;
+ s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]),
+ vget_high_u8(src_regs.val[0]));
+ s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]),
+ vget_high_u8(src_regs.val[1]));
+ local_col_sums.val[0] =
+ vaddq_s32(local_col_sums.val[0],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[0]), vget_high_u16(s.val[0]))));
+ local_col_sums.val[1] =
+ vaddq_s32(local_col_sums.val[1],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[1]), vget_high_u16(s.val[1]))));
+ vst4q_u8(dst_ptr, src_regs);
+ src_line0ptr += 16;
+ src_line1ptr += 16;
+ dst_ptr += SideFormat::kWidth * 16;
+ }
+ break;
+ case 1:
+ while (dst_ptr != dst_end_ptr) {
+ uint8x16x4_t src_regs;
+ src_regs.val[0] = vld1q_u8(src_line0ptr);
+ src_regs.val[1] = vdupq_n_u8(0);
+ src_regs.val[2] = vdupq_n_u8(0);
+ src_regs.val[3] = vdupq_n_u8(0);
+ uint16x8x2_t s;
+ s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]),
+ vget_high_u8(src_regs.val[0]));
+ local_col_sums.val[0] =
+ vaddq_s32(local_col_sums.val[0],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[0]), vget_high_u16(s.val[0]))));
+ vst4q_u8(dst_ptr, src_regs);
+ src_line0ptr += 16;
+ dst_ptr += SideFormat::kWidth * 16;
+ }
+ break;
+ default:
+ abort();
+ }
+
+ int extra = depth - AlignedDepth16;
+
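+    // Handle the leftover depth (fewer than 16 levels): copy the remaining
+    // entries into zero-padded staging buffers, pack a full 16-level block
+    // into dstextra, and then copy only the 4 * extra valid bytes to the
+    // destination. The zero padding contributes nothing to the column sums.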
+ if (extra) {
+ __attribute__((aligned(16))) std::uint8_t line0extra[16];
+ __attribute__((aligned(16))) std::uint8_t line1extra[16];
+ __attribute__((aligned(16))) std::uint8_t line2extra[16];
+ __attribute__((aligned(16))) std::uint8_t line3extra[16];
+ __attribute__((aligned(16))) std::uint8_t dstextra[64];
+
+ for (int i = 0; i < extra; i++) {
+ line0extra[i] = width >= 1 ? src_line0ptr[i] : 0;
+ line1extra[i] = width >= 2 ? src_line1ptr[i] : 0;
+ line2extra[i] = width >= 3 ? src_line2ptr[i] : 0;
+ line3extra[i] = width >= 4 ? src_line3ptr[i] : 0;
+ }
+ for (int i = extra; i < 16; i++) {
+ line0extra[i] = 0;
+ line1extra[i] = 0;
+ line2extra[i] = 0;
+ line3extra[i] = 0;
+ }
+ src_line0ptr = line0extra;
+ src_line1ptr = line1extra;
+ src_line2ptr = line2extra;
+ src_line3ptr = line3extra;
+
+ {
+ uint8x16x4_t src_regs;
+ src_regs.val[0] = vld1q_u8(src_line0ptr);
+ src_regs.val[1] = vld1q_u8(src_line1ptr);
+ src_regs.val[2] = vld1q_u8(src_line2ptr);
+ src_regs.val[3] = vld1q_u8(src_line3ptr);
+ uint16x8x4_t s;
+ s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]),
+ vget_high_u8(src_regs.val[0]));
+ s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]),
+ vget_high_u8(src_regs.val[1]));
+ s.val[2] = vaddl_u8(vget_low_u8(src_regs.val[2]),
+ vget_high_u8(src_regs.val[2]));
+ s.val[3] = vaddl_u8(vget_low_u8(src_regs.val[3]),
+ vget_high_u8(src_regs.val[3]));
+ local_col_sums.val[0] =
+ vaddq_s32(local_col_sums.val[0],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[0]), vget_high_u16(s.val[0]))));
+ local_col_sums.val[1] =
+ vaddq_s32(local_col_sums.val[1],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[1]), vget_high_u16(s.val[1]))));
+ local_col_sums.val[2] =
+ vaddq_s32(local_col_sums.val[2],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[2]), vget_high_u16(s.val[2]))));
+ local_col_sums.val[3] =
+ vaddq_s32(local_col_sums.val[3],
+ vreinterpretq_s32_u32(vaddl_u16(
+ vget_low_u16(s.val[3]), vget_high_u16(s.val[3]))));
+ vst4q_u8(dstextra, src_regs);
+ }
+
+ for (int i = 0; i < 4 * extra; i++) {
+ dst_ptr[i] = dstextra[i];
+ }
+ }
+
+ // Accumulate the final rank_one_update vector.
+ std::int32_t r[4];
+ if (width >= 1) {
+ vst1q_s32(r, local_col_sums.val[0]);
+ rank_one_update_ptr[0] +=
+ Base::packed_side_block()->rank_one_update_multiplier() *
+ (r[0] + r[1] + r[2] + r[3]);
+ }
+ if (width >= 2) {
+ vst1q_s32(r, local_col_sums.val[1]);
+ rank_one_update_ptr[1] +=
+ Base::packed_side_block()->rank_one_update_multiplier() *
+ (r[0] + r[1] + r[2] + r[3]);
+ }
+ if (width >= 3) {
+ vst1q_s32(r, local_col_sums.val[2]);
+ rank_one_update_ptr[2] +=
+ Base::packed_side_block()->rank_one_update_multiplier() *
+ (r[0] + r[1] + r[2] + r[3]);
+ }
+ if (width >= 4) {
+ vst1q_s32(r, local_col_sums.val[3]);
+ rank_one_update_ptr[3] +=
+ Base::packed_side_block()->rank_one_update_multiplier() *
+ (r[0] + r[1] + r[2] + r[3]);
+ }
+ }
+};
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_PACK_NEON_H_
diff --git a/internal/single_thread_gemm.h b/internal/single_thread_gemm.h
new file mode 100644
index 0000000..0ec4201
--- /dev/null
+++ b/internal/single_thread_gemm.h
@@ -0,0 +1,103 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// single_thread_gemm.h: Single-threaded GEMM implementation.
+// This is a good place to start reading code, as it shows the overall
+// structure of a GEMM and is much simpler than multi_thread_gemm.h.
+
+#ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
+#define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
+
+#include <cassert>
+
+#include "public/map.h"
+#include "internal/allocator.h"
+#include "internal/pack.h"
+#include "internal/unpack.h"
+#include "internal/compute.h"
+#include "internal/kernel.h"
+
+namespace gemmlowp {
+
+class SingleThreadGemmContext {
+ public:
+ Allocator* allocator() { return &allocator_; }
+
+ protected:
+ Allocator allocator_;
+};
+
+template <typename KernelFormat, typename Scalar, MapOrder LhsOrder,
+ MapOrder RhsOrder, MapOrder ResultOrder>
+void SingleThreadGemm(SingleThreadGemmContext* context,
+ const KernelBase& kernel,
+ const MatrixMap<const Scalar, LhsOrder>& lhs,
+ const MatrixMap<const Scalar, RhsOrder>& rhs,
+ MatrixMap<Scalar, ResultOrder>* result, int lhs_offset,
+ int rhs_offset, int result_offset, int result_mult_int,
+ int result_shift) {
+ ScopedProfilingLabel label("gemmlowp::SingleThreadGemm");
+
+ assert(lhs.cols() == rhs.rows());
+
+ int rows = result->rows();
+ int cols = result->cols();
+ int depth = lhs.cols();
+
+ Allocator* allocator = context->allocator();
+
+ BlockParams block_params;
+ block_params.Init<KernelFormat>(rows, cols, depth, 1);
+
+ PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(
+ Side::Lhs, allocator, block_params, rhs_offset);
+ PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(
+ Side::Rhs, allocator, block_params, lhs_offset);
+
+ PackedResultInt32 packed_result(allocator, block_params);
+
+ allocator->Commit();
+
+ const bool pack_rhs_once = block_params.l2_cols == cols;
+
+ if (pack_rhs_once) {
+ PackRhs(&packed_rhs, rhs);
+ }
+
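+  // Traverse the result in blocks of l2_rows x l2_cols. The LHS block is
+  // packed once per row-block; the RHS is packed either once up front (when
+  // all columns fit in a single block) or once per (row-block, column-block)
+  // pair.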
+ for (int r = 0; r < rows; r += block_params.l2_rows) {
+ int rs = std::min(block_params.l2_rows, rows - r);
+
+ PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));
+
+ for (int c = 0; c < cols; c += block_params.l2_cols) {
+ int cs = std::min(block_params.l2_cols, cols - c);
+
+ if (!pack_rhs_once) {
+ PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
+ }
+
+ Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs);
+
+ auto result_block = result->block(r, c, rs, cs);
+ UnpackResult(&result_block, packed_result, packed_lhs, packed_rhs, depth,
+ result_offset, result_mult_int, result_shift);
+ }
+ }
+
+ allocator->Decommit();
+}
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
diff --git a/internal/unpack.h b/internal/unpack.h
new file mode 100644
index 0000000..55ca204
--- /dev/null
+++ b/internal/unpack.h
@@ -0,0 +1,100 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// unpack.h: unpacking the result blocks computed by compute.h,
+// storing them into the destination matrix.
+
+#ifndef GEMMLOWP_INTERNAL_UNPACK_H_
+#define GEMMLOWP_INTERNAL_UNPACK_H_
+
+#include "internal/allocator.h"
+#include "internal/block_params.h"
+#include "internal/pack.h"
+
+#ifdef GEMMLOWP_NEON
+#include "internal/unpack_neon.h"
+#endif
+
+namespace gemmlowp {
+
+class PackedResultInt32 {
+ Allocator* allocator_;
+ Allocator::Handle matrix_handle_;
+ const BlockParams& block_params_;
+
+ public:
+ PackedResultInt32(Allocator* _allocator, const BlockParams& _block_params)
+ : allocator_(_allocator), block_params_(_block_params) {
+ matrix_handle_ = allocator_->Reserve<std::int32_t>(block_params_.l2_rows *
+ block_params_.l2_cols);
+ }
+
+ ~PackedResultInt32() {}
+
+ MatrixMap<std::int32_t, MapOrder::ColMajor> Map() {
+ return MatrixMap<std::int32_t, MapOrder::ColMajor>(
+ allocator_->GetPointer<std::int32_t>(matrix_handle_),
+ block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows);
+ }
+
+ MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const {
+ return MatrixMap<const std::int32_t, MapOrder::ColMajor>(
+ allocator_->GetPointer<const std::int32_t>(matrix_handle_),
+ block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows);
+ }
+};
+
+template <typename ResultBlockType, typename PackedResult,
+ typename KernelLhsFormat, typename KernelRhsFormat>
+void UnpackResultImpl(ResultBlockType* dst, const PackedResult& src,
+ const PackedSideBlock<KernelLhsFormat>& packed_lhs,
+ const PackedSideBlock<KernelRhsFormat>& packed_rhs,
+ int depth, std::int32_t result_offset,
+ std::int32_t result_mult_int, std::int32_t result_shift) {
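+  // The packed side blocks carry "rank one update" vectors: for each LHS row
+  // r (resp. RHS column c), the sum of its entries times that block's
+  // rank_one_update_multiplier. As set up by SingleThreadGemm, the LHS
+  // multiplier is rhs_offset and the RHS multiplier is lhs_offset, so these
+  // vectors, together with rank0update == lhs_offset * rhs_offset * depth,
+  // account for the offset terms in
+  //   sum_k (lhs(r, k) + lhs_offset) * (rhs(k, c) + rhs_offset).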
+ std::int32_t rank0update = packed_lhs.rank_one_update_multiplier() *
+ packed_rhs.rank_one_update_multiplier() * depth;
+  // There is no top-level blocking in the depth dimension at the moment;
+  // doing so would cause too much loss of precision.
+ for (int c = 0; c < dst->cols(); c++) {
+ for (int r = 0; r < dst->rows(); r++) {
+ std::int32_t q = *src.data(r, c);
+ q += packed_lhs.rank_one_update()[r] + packed_rhs.rank_one_update()[c] +
+ rank0update;
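+      // Requantize: add the result offset, scale by result_mult_int, divide
+      // by 2^result_shift rounding to nearest (hence the added
+      // (1 << (result_shift - 1))), then clamp to [0, 255] below.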
+ q = ((q + result_offset) * result_mult_int + (1 << (result_shift - 1))) >>
+ result_shift;
+ (*dst)(r, c) = q > 255 ? 255 : q < 0 ? 0 : q;
+ }
+ }
+}
+
+template <typename ResultBlockType, typename PackedResult,
+ typename KernelLhsFormat, typename KernelRhsFormat>
+void UnpackResult(ResultBlockType* dst, const PackedResult& src,
+ const PackedSideBlock<KernelLhsFormat>& packed_lhs,
+ const PackedSideBlock<KernelRhsFormat>& packed_rhs, int depth,
+ std::int32_t result_offset, std::int32_t result_mult_int,
+ std::int32_t result_shift) {
+ ScopedProfilingLabel label("unpack");
+#ifdef GEMMLOWP_NEON
+ UnpackResultImplNEON(dst, src.Map(), packed_lhs, packed_rhs, depth,
+ result_offset, result_mult_int, result_shift);
+#else
+ UnpackResultImpl(dst, src.Map(), packed_lhs, packed_rhs, depth, result_offset,
+ result_mult_int, result_shift);
+#endif
+}
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_UNPACK_H_
diff --git a/internal/unpack_neon.h b/internal/unpack_neon.h
new file mode 100644
index 0000000..17bfe78
--- /dev/null
+++ b/internal/unpack_neon.h
@@ -0,0 +1,170 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// unpack_neon.h: optimized NEON specializations of the templates in unpack.h.
+
+#ifndef GEMMLOWP_INTERNAL_UNPACK_NEON_H_
+#define GEMMLOWP_INTERNAL_UNPACK_NEON_H_
+
+#include "internal/unpack.h"
+
+#include <arm_neon.h>
+
+namespace gemmlowp {
+
+template <typename ResultBlockType, typename PackedResult,
+ typename KernelLhsFormat, typename KernelRhsFormat>
+void UnpackResultImplNEON(ResultBlockType* dst, const PackedResult& src,
+ const PackedSideBlock<KernelLhsFormat>& packed_lhs,
+ const PackedSideBlock<KernelRhsFormat>& packed_rhs,
+ int depth, std::int32_t result_offset,
+ std::int32_t result_mult_int,
+ std::int32_t result_shift) {
+ ScopedProfilingLabel label("optimized kernel");
+
+ std::int32_t rank0update = packed_lhs.rank_one_update_multiplier() *
+ packed_rhs.rank_one_update_multiplier() * depth;
+ std::int32_t preshift_offset = 1 << (result_shift - 1);
+ int32x4_t shift_reg = vdupq_n_s32(-result_shift);
+ for (int c = 0; c < dst->cols(); c++) {
+ std::uint8_t* dst_ptr = dst->data(0, c);
+ const std::int32_t* src_ptr = src.data(0, c);
+ const std::int32_t* rank_one_update_ptr = packed_lhs.rank_one_update();
+ std::int32_t rank1update = packed_rhs.rank_one_update()[c];
+ std::int32_t constant_offset = rank1update + rank0update + result_offset;
+
+ int dst_rows_aligned4 = RoundDown<4>(dst->rows());
+ int dst_rows_aligned16 = RoundDown<16>(dst->rows());
+
+ if (dst_rows_aligned16) {
+ std::uint8_t* dst_end_ptr = dst_ptr + dst_rows_aligned16;
+
+ asm volatile(
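+          // Broadcast the scalar parameters into vector registers:
+          // q12 = constant offset, q13 = pre-shift rounding offset,
+          // q14 = -result_shift (vshl by a negative amount shifts right),
+          // q15 = result multiplier.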
+ "vdup.32 q12, %[constant_offset]\n"
+ "vdup.32 q13, %[preshift_offset]\n"
+ "neg r3, %[result_shift]\n"
+ "vdup.32 q14, r3\n"
+ "vdup.32 q15, %[result_mult_int]\n"
+
+ "loop_UnpackResultImplNEON_%=:\n"
+
+ // Load a 16x1 block of the packed result matrix
+ // (so 16 contiguous entries in one column).
+ "vld1.32 {d0, d1, d2, d3}, [%[src_ptr]]!\n"
+ "vld1.32 {d4, d5, d6, d7}, [%[src_ptr]]!\n"
+
+            // Load entries of the LHS rank one update vector.
+ "vld1.32 {d8, d9, d10, d11}, "
+ "[%[rank_one_update_ptr]:256]!\n"
+ "vld1.32 {d12, d13, d14, d15}, "
+ "[%[rank_one_update_ptr]:256]!\n"
+
+ // Apply the LHS rank one update.
+ "vadd.s32 q0, q0, q4\n"
+ "vadd.s32 q1, q1, q5\n"
+ "vadd.s32 q2, q2, q6\n"
+ "vadd.s32 q3, q3, q7\n"
+
+ // Add the constant offset
+ // (which includes the RHS rank one update, see above).
+ "vadd.s32 q0, q0, q12\n"
+ "vadd.s32 q1, q1, q12\n"
+ "vadd.s32 q2, q2, q12\n"
+ "vadd.s32 q3, q3, q12\n"
+
+ // Multiply by the result multiplier
+ "vmul.s32 q0, q0, q15\n"
+ "vmul.s32 q1, q1, q15\n"
+ "vmul.s32 q2, q2, q15\n"
+ "vmul.s32 q3, q3, q15\n"
+
+ // Add the pre-shift offset (so that the shift is rounding)
+ "vadd.s32 q0, q0, q13\n"
+ "vadd.s32 q1, q1, q13\n"
+ "vadd.s32 q2, q2, q13\n"
+ "vadd.s32 q3, q3, q13\n"
+
+ // Shift right (shift left by negative offset).
+ "vshl.s32 q0, q0, q14\n"
+ "vshl.s32 q1, q1, q14\n"
+ "vshl.s32 q2, q2, q14\n"
+ "vshl.s32 q3, q3, q14\n"
+
+          // So far we had signed 32-bit values; now we narrow them down
+          // to unsigned 8-bit, saturating.
+ "vqmovn.s32 d8, q0\n"
+ "vqmovn.s32 d9, q1\n"
+ "vqmovn.s32 d10, q2\n"
+ "vqmovn.s32 d11, q3\n"
+ "vqmovun.s16 d0, q4\n"
+ "vqmovun.s16 d1, q5\n"
+
+ // Store result into the destination matrix.
+ "vst1.8 {d0, d1}, [%[dst_ptr]]!\n"
+
+ // End of the loop.
+ "cmp %[dst_ptr], %[dst_end_ptr]\n"
+ "bne loop_UnpackResultImplNEON_%=\n"
+
+ : // outputs
+ [dst_ptr] "+r"(dst_ptr), [src_ptr] "+r"(src_ptr),
+ [rank_one_update_ptr] "+r"(rank_one_update_ptr)
+ : // inputs
+ [dst_end_ptr] "r"(dst_end_ptr),
+ [constant_offset] "r"(constant_offset),
+ [result_mult_int] "r"(result_mult_int),
+ [preshift_offset] "r"(preshift_offset),
+ [result_shift] "r"(result_shift)
+ : // clobbers
+ "cc", "memory", "r3",
+          // note: quad (q) registers are reportedly not accepted in clobber
+          // lists by some compilers, so we list the d registers instead.
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
+ "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20",
+ "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30",
+ "d31");
+ }
+
+ // We have finished handling groups of 16 entries at once; now
+ // try to handle 4 entries at once.
+ for (int r = dst_rows_aligned16; r < dst_rows_aligned4; r += 4) {
+ int32x4_t q = vld1q_s32(src_ptr);
+ q = vaddq_s32(q, vld1q_s32(rank_one_update_ptr));
+ q = vaddq_s32(q, vdupq_n_s32(constant_offset));
+ q = vmulq_n_s32(q, result_mult_int);
+ q = vaddq_s32(q, vdupq_n_s32(preshift_offset));
+ q = vshlq_s32(q, shift_reg);
+ int16x4_t q_16 = vqmovn_s32(q);
+ uint8x8_t q_8 = vqmovun_s16(vcombine_s16(q_16, q_16));
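+      // The 4 result bytes are in the low half of q_8; store them in one go
+      // as a single 32-bit lane.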
+ vst1_lane_u32(reinterpret_cast<std::uint32_t*>(dst_ptr),
+ vreinterpret_u32_u8(q_8), 0);
+ dst_ptr += 4;
+ src_ptr += 4;
+ rank_one_update_ptr += 4;
+ }
+ // We have finished handling 4 entries at once; now handle
+ // remaining entries one by one.
+ for (int r = dst_rows_aligned4; r < dst->rows(); r++) {
+ std::int32_t q = src(r, c);
+ q += packed_lhs.rank_one_update()[r] + rank1update + rank0update;
+ q = ((q + result_offset) * result_mult_int + (1 << (result_shift - 1))) >>
+ result_shift;
+ (*dst)(r, c) = q > 255 ? 255 : q < 0 ? 0 : q;
+ }
+ }
+}
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_INTERNAL_UNPACK_NEON_H_
diff --git a/profiling/instrumentation.h b/profiling/instrumentation.h
new file mode 100644
index 0000000..b1592c8
--- /dev/null
+++ b/profiling/instrumentation.h
@@ -0,0 +1,217 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// instrumentation.h: contains the definitions needed to
+// instrument code for profiling:
+// ScopedProfilingLabel, RegisterCurrentThreadForProfiling.
+//
+// profiler.h is only needed to drive the profiler:
+// StartProfiling, FinishProfiling.
+//
+// See the usage example in profiler.h.
+
+#ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_
+#define GEMMLOWP_PROFILING_INSTRUMENTATION_H_
+
+#include <pthread.h>
+#include <cstdint>
+#include <cassert>
+#include <cstdlib>
+#include <algorithm>
+
+#ifdef GEMMLOWP_PROFILING
+#include <set>
+#include <cstdio>
+#include <cstring>
+#endif
+
+// We should always use C++11 thread_local; unfortunately that
+// isn't fully supported on Apple yet.
+#ifdef __APPLE__
+#define GEMMLOWP_THREAD_LOCAL static __thread
+#else
+#define GEMMLOWP_THREAD_LOCAL thread_local
+#endif
+
+namespace gemmlowp {
+
+inline void ReleaseBuildAssertion(bool condition, const char* msg) {
+ if (!condition) {
+ fprintf(stderr, "gemmlowp error: %s\n", msg);
+ abort();
+ }
+}
+
+// To be used as template parameter for GlobalLock.
+// GlobalLock<ProfilerLockId> is the profiler global lock:
+// registering threads, starting profiling, finishing profiling, and
+// the profiler itself as it samples threads, all need to lock it.
+struct ProfilerLockId;
+
+// A very plain global lock. Templated in LockId so we can have multiple
+// locks, one for each LockId type.
+template <typename LockId>
+class GlobalLock {
+ static pthread_mutex_t* Mutex() {
+ static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
+ return &m;
+ }
+
+ public:
+ static void Lock() { pthread_mutex_lock(Mutex()); }
+ static void Unlock() { pthread_mutex_unlock(Mutex()); }
+};
+
+// A very simple RAII helper to lock and unlock a GlobalLock
+template <typename LockId>
+struct AutoGlobalLock {
+ AutoGlobalLock() { GlobalLock<LockId>::Lock(); }
+ ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); }
+};
+
+// MemoryBarrier is purely a compile-time thing; it tells the compiler
+// two things:
+// 1) It prevents reordering code across it
+// (thanks to the 'volatile' after 'asm')
+// 2) It requires the compiler to assume that any value previously
+// read from memory, may have changed. Thus it offers an alternative
+// to using 'volatile' variables.
+inline void MemoryBarrier() { asm volatile("" ::: "memory"); }
+
+// Profiling definitions. Two paths: when profiling is enabled,
+// and when profiling is disabled.
+#ifdef GEMMLOWP_PROFILING
+// This code path is when profiling is enabled.
+
+// A pseudo-call-stack. Unlike a real call-stack, this only
+// contains pointers to literal strings that were manually entered
+// in the instrumented code (see ScopedProfilingLabel).
+struct ProfilingStack {
+ static const std::size_t kMaxSize = 15;
+ typedef const char* LabelsArrayType[kMaxSize];
+ LabelsArrayType labels;
+ std::size_t size;
+
+ ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); }
+
+ void Push(const char* label) {
+ MemoryBarrier();
+ ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow");
+ labels[size] = label;
+ MemoryBarrier();
+ size++;
+ MemoryBarrier();
+ }
+
+ void Pop() {
+ MemoryBarrier();
+ ReleaseBuildAssertion(size > 0, "ProfilingStack underflow");
+ size--;
+ MemoryBarrier();
+ }
+
+ void UpdateTop(const char* new_label) {
+ MemoryBarrier();
+ assert(size);
+ labels[size - 1] = new_label;
+ MemoryBarrier();
+ }
+
+ ProfilingStack& operator=(const ProfilingStack& other) {
+ memcpy(this, &other, sizeof(ProfilingStack));
+ return *this;
+ }
+
+ bool operator==(const ProfilingStack& other) const {
+ return !memcmp(this, &other, sizeof(ProfilingStack));
+ }
+};
+
+static_assert(
+ !(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)),
+ "ProfilingStack should have power-of-two size to fit in cache lines");
+
+struct ThreadInfo;
+
+// The global set of threads being profiled.
+inline std::set<ThreadInfo*>& ThreadsUnderProfiling() {
+ static std::set<ThreadInfo*> v;
+ return v;
+}
+
+struct ThreadInfo {
+ pthread_key_t key; // used only to get a callback at thread exit.
+ ProfilingStack stack;
+
+ ThreadInfo() {
+ pthread_key_create(&key, ThreadExitCallback);
+ pthread_setspecific(key, this);
+ }
+
+ static void ThreadExitCallback(void* ptr) {
+ AutoGlobalLock<ProfilerLockId> lock;
+ ThreadInfo* self = static_cast<ThreadInfo*>(ptr);
+ ThreadsUnderProfiling().erase(self);
+ pthread_key_delete(self->key);
+ }
+};
+
+inline ThreadInfo& ThreadLocalThreadInfo() {
+ GEMMLOWP_THREAD_LOCAL ThreadInfo i;
+ return i;
+}
+
+// ScopedProfilingLabel is how one instruments code for profiling
+// with this profiler. Construct local ScopedProfilingLabel variables,
+// passing a literal string describing the local code. Profile
+// samples will then be annotated with this label, while it is in scope
+// (whence the "Scoped" in the name; this pattern is also known as RAII).
+// See the example in profiler.h.
+class ScopedProfilingLabel {
+ ProfilingStack* profiling_stack_;
+
+ public:
+ explicit ScopedProfilingLabel(const char* label)
+ : profiling_stack_(&ThreadLocalThreadInfo().stack) {
+ profiling_stack_->Push(label);
+ }
+
+ ~ScopedProfilingLabel() { profiling_stack_->Pop(); }
+
+ void Update(const char* new_label) { profiling_stack_->UpdateTop(new_label); }
+};
+
+// To be called once on each thread to be profiled.
+inline void RegisterCurrentThreadForProfiling() {
+ AutoGlobalLock<ProfilerLockId> lock;
+ ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo());
+}
+
+#else // not GEMMLOWP_PROFILING
+// This code path is when profiling is disabled.
+
+// This empty definition of ScopedProfilingLabel ensures that
+// it has zero runtime overhead when profiling is disabled.
+struct ScopedProfilingLabel {
+ explicit ScopedProfilingLabel(const char*) {}
+ void Update(const char*) {}
+};
+
+inline void RegisterCurrentThreadForProfiling() {}
+
+#endif
+
+} // end namespace gemmlowp
+
+#endif // GEMMLOWP_PROFILING_INSTRUMENTATION_H_
diff --git a/profiling/profiler.h b/profiling/profiler.h
new file mode 100644
index 0000000..9ea7a9f
--- /dev/null
+++ b/profiling/profiler.h
@@ -0,0 +1,373 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// profiler.h: a simple sampling profiler that's always just one #include away!
+//
+// Overview
+// ========
+//
+// This profiler only samples a pseudo-stack, not the actual call stack.
+// The code to be profiled needs to be instrumented with
+// pseudo-stack "labels", see ScopedProfilingLabel.
+// Using pseudo-stacks allows this profiler to be very simple, low-overhead,
+// portable, and independent of compilation details such as function inlining
+// and frame pointers. The granularity of instrumentation can be freely chosen,
+// and it is possible to get some annotate-like detail, i.e. detail within one
+// function without splitting it into multiple functions.
+//
+// This profiler should remain small and simple; its key feature is to fit in
+// a single header file so that there should never be a reason to refrain
+// from profiling. More complex and feature-rich alternatives are
+// readily available; for instance, https://github.com/bgirard/GeckoProfiler
+// offers a strict superset of this profiler's functionality, including
+// intertwining pseudo-stacks with real call stacks, more annotation options,
+// and advanced visualization.
+//
+// Usage
+// =====
+//
+// 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is
+//    not enabled, the profiling instrumentation from instrumentation.h
+//    (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
+//    is still defined but does nothing. However, when profiling is
+//    not enabled, it is an error to #include the
+// present file.
+//
+// 1. Each thread can opt in to profiling by calling
+//    RegisterCurrentThreadForProfiling() defined in instrumentation.h.
+// This can be done at any time, before or during profiling.
+// No sample will be collected from a thread until
+// it has called RegisterCurrentThreadForProfiling().
+//
+// 2. Instrument your code to be profiled with ScopedProfilingLabel,
+//    which is a RAII helper defined in instrumentation.h. The identifier
+// names (some_label, etc) do not matter; what will show up
+// in the profile is the string passed to the constructor, which
+// must be a literal string. See the full example below.
+//
+// Note: the overhead of ScopedProfilingLabel is zero when not
+// enabling profiling (when not defining GEMMLOWP_PROFILING).
+//
+// 3. Use the profiler.h interface to control profiling. There are two
+// functions: StartProfiling() and FinishProfiling(). They must be
+// called on the same thread. FinishProfiling() prints the profile
+// on stdout.
+//
+// Full example
+// ============
+/*
+ #define GEMMLOWP_PROFILING
+ #include "profiling/instrumentation.h"
+ using namespace gemmlowp;
+
+ const int iters = 100000000;
+ volatile int i;
+
+ void Bar() {
+ ScopedProfilingLabel label("Bar");
+ for (i = 0; i < iters; i++) {}
+ }
+
+ void Foo() {
+ ScopedProfilingLabel label("Foo");
+ for (i = 0; i < iters; i++) {}
+ Bar();
+ }
+
+ void Init() {
+ RegisterCurrentThreadForProfiling();
+ }
+
+ #include "profiling/profiler.h"
+
+ int main() {
+ Init();
+ StartProfiling();
+ Foo();
+ FinishProfiling();
+ }
+*
+* Output:
+*
+ gemmlowp profile (1 threads, 304 samples)
+ 100.00% Foo
+ 51.32% other
+ 48.68% Bar
+ 0.00% other (outside of any label)
+*/
+//
+// Interpreting results
+// ====================
+//
+// Each node shows the absolute percentage, among all the samples,
+// of the number of samples that recorded the given pseudo-stack.
+// The percentages are *NOT* relative to the parent node. In addition
+// to your own labels, you will also see 'other' nodes that collect
+// the remainder of samples under the parent node that didn't fall into
+// any of the labelled child nodes. Example:
+//
+// 20% Foo
+// 12% Bar
+// 6% Xyz
+// 2% other
+//
+// This means that 20% of all samples were under Foo, of which 12%/20%==60%
+// were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not
+// under either Bar or Xyz.
+//
+// Typically, one wants to keep adding ScopedProfilingLabel's until
+// the 'other' nodes show low percentages.
+//
+// Interpreting results with multiple threads
+// ==========================================
+//
+// At each sample, each thread registered for profiling gets sampled once.
+// So if there is one "main thread" spending its time in MainFunc() and
+// 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the
+// samples will be in WorkerFunc, so the profile will look like this:
+//
+// 80% WorkerFunc
+// 20% MainFunc
+
+#ifndef GEMMLOWP_PROFILING_PROFILER_H_
+#define GEMMLOWP_PROFILING_PROFILER_H_
+
+#ifndef GEMMLOWP_PROFILING
+#error Profiling is not enabled!
+#endif
+
+#include <vector>
+
+#include "profiling/instrumentation.h"
+
+namespace gemmlowp {
+
+// A tree view of a profile.
+class ProfileTreeView {
+ struct Node {
+ std::vector<Node*> children;
+ const char* label;
+ std::size_t weight;
+ Node() : label(nullptr), weight(0) {}
+ ~Node() {
+ for (auto child : children) {
+ delete child;
+ }
+ }
+ };
+
+ static bool CompareNodes(Node* n1, Node* n2) {
+ return n1->weight > n2->weight;
+ }
+
+ Node root_;
+
+ void PrintNode(const Node* node, int level) const {
+ if (level) {
+ for (int i = 1; i < level; i++) {
+ printf(" ");
+ }
+ printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label);
+ }
+ for (auto child : node->children) {
+ PrintNode(child, level + 1);
+ }
+ }
+
+ static void AddStackToNode(const ProfilingStack& stack, Node* node,
+ std::size_t level) {
+ node->weight++;
+ if (stack.size == level) {
+ return;
+ }
+ Node* child_to_add_to = nullptr;
+ for (auto child : node->children) {
+ if (child->label == stack.labels[level]) {
+ child_to_add_to = child;
+ break;
+ }
+ }
+ if (!child_to_add_to) {
+ child_to_add_to = new Node;
+ child_to_add_to->label = stack.labels[level];
+ node->children.push_back(child_to_add_to);
+ }
+ AddStackToNode(stack, child_to_add_to, level + 1);
+ return;
+ }
+
+ void AddStack(const ProfilingStack& stack) {
+ AddStackToNode(stack, &root_, 0);
+ }
+
+ void AddOtherChildrenToNode(Node* node) {
+ std::size_t top_level_children_weight = 0;
+ for (auto c : node->children) {
+ AddOtherChildrenToNode(c);
+ top_level_children_weight += c->weight;
+ }
+ if (top_level_children_weight) {
+ Node* other_child = new Node;
+ other_child->label =
+ node == &root_ ? "other (outside of any label)" : "other";
+ other_child->weight = node->weight - top_level_children_weight;
+ node->children.push_back(other_child);
+ }
+ }
+
+ void AddOtherNodes() { AddOtherChildrenToNode(&root_); }
+
+ void SortNode(Node* node) {
+ std::sort(node->children.begin(), node->children.end(), CompareNodes);
+ for (auto child : node->children) {
+ SortNode(child);
+ }
+ }
+
+ void Sort() { SortNode(&root_); }
+
+ public:
+ explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) {
+ for (auto stack : stacks) {
+ AddStack(stack);
+ }
+ AddOtherNodes();
+ Sort();
+ }
+
+ void Print() const {
+ printf("\n");
+ printf("gemmlowp profile (%d threads, %d samples)\n",
+ static_cast<int>(ThreadsUnderProfiling().size()),
+ static_cast<int>(root_.weight));
+ PrintNode(&root_, 0);
+ printf("\n");
+ }
+};
+
+// This function is the only place that determines our sampling frequency.
+inline void WaitOneProfilerTick() {
+ static const int millisecond = 1000000;
+
+#if defined __arm__ || defined __aarch64__
+ // Reduced sampling frequency on mobile devices helps limit time and memory
+ // overhead there.
+ static const int interval = 10 * millisecond;
+#else
+ static const int interval = 1 * millisecond;
+#endif
+
+ timespec ts;
+ ts.tv_sec = 0;
+ ts.tv_nsec = interval;
+ nanosleep(&ts, nullptr);
+}
+
+// This is how we track whether we've already started profiling,
+// to guard against misuse of the API.
+inline bool& IsProfiling() {
+ static bool b;
+ return b;
+}
+
+// This is how we tell the profiler thread to finish.
+inline bool& ProfilerThreadShouldFinish() {
+ static bool b;
+ return b;
+}
+
+// The profiler thread. See ProfilerThreadFunc.
+inline pthread_t& ProfilerThread() {
+ static pthread_t t;
+ return t;
+}
+
+// Records a stack from a running thread.
+// The tricky part is that we're not interrupting the thread.
+// This is OK because we're looking at a pseudo-stack of labels,
+// not at the real thread stack, and if the pseudo-stack changes
+// while we're recording it, we are OK with getting either the
+// old or the new stack. Note that ProfilingStack::Pop
+// only decrements the size, and doesn't null the popped label,
+// so if we're concurrently recording it, it shouldn't change
+// under our feet until another label is pushed, at which point
+// we are OK with getting either this new label or the old one.
+// In the end, the key atomicity property that we are relying on
+// here is that pointers are changed atomically, and the labels
+// are pointers (to literal strings).
+inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) {
+ assert(!dst->size);
+ while (dst->size < thread->stack.size) {
+ dst->labels[dst->size] = thread->stack.labels[dst->size];
+ dst->size++;
+ MemoryBarrier(); // thread->stack can change at any time
+ }
+}
+
+// The profiler thread's entry point.
+// Note that a separate thread is to be started each time we call
+// StartProfiling(), and finishes when we call FinishProfiling().
+// So here we only need to handle the recording and reporting of
+// a single profile.
+inline void* ProfilerThreadFunc(void*) {
+ assert(ProfilerThread() == pthread_self());
+
+ // Since we only handle one profile per profiler thread, the
+ // profile data (the array of recorded stacks) can be a local variable here.
+ std::vector<ProfilingStack> stacks;
+
+ while (!ProfilerThreadShouldFinish()) {
+ WaitOneProfilerTick();
+ {
+ AutoGlobalLock<ProfilerLockId> lock;
+ for (auto t : ThreadsUnderProfiling()) {
+ ProfilingStack s;
+ RecordStack(t, &s);
+ stacks.push_back(s);
+ }
+ }
+ }
+
+ // Profiling is finished and we now report the results.
+ ProfileTreeView(stacks).Print();
+
+ return nullptr;
+}
+
+// Starts recording samples.
+inline void StartProfiling() {
+ AutoGlobalLock<ProfilerLockId> lock;
+ ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!");
+ IsProfiling() = true;
+ ProfilerThreadShouldFinish() = false;
+ pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr);
+}
+
+// Stops recording samples, and prints a profile tree-view on stdout.
+inline void FinishProfiling() {
+ {
+ AutoGlobalLock<ProfilerLockId> lock;
+ ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!");
+ // The ProfilerThreadShouldFinish() mechanism here is really naive and bad,
+ // as the scary comments below should make clear.
+ // Should we use a condition variable?
+ ProfilerThreadShouldFinish() = true;
+ } // must release the lock here to avoid deadlock with profiler thread.
+ pthread_join(ProfilerThread(), nullptr);
+ IsProfiling() = false; // yikes, this should be guarded by the lock!
+}
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_PROFILING_PROFILER_H_
diff --git a/public/gemmlowp.h b/public/gemmlowp.h
new file mode 100644
index 0000000..7b0f35e
--- /dev/null
+++ b/public/gemmlowp.h
@@ -0,0 +1,52 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// gemmlowp.h: the main public interface header of gemmlowp.
+
+#ifndef GEMMLOWP_PUBLIC_GEMMLOWP_H_
+#define GEMMLOWP_PUBLIC_GEMMLOWP_H_
+
+#include "public/map.h"
+#include "internal/multi_thread_gemm.h"
+#include "internal/kernel_default.h"
+
+namespace gemmlowp {
+
+class GemmContext : public MultiThreadGemmContext {};
+
+// Computes a general matrix product ("GEMM").
+// The meaning of the offsets, result_mult_int and result_shift
+// parameters is the same as in the standard EightBitIntGemm interface
+// (which is also implemented in the eight_bit_int_gemm directory).
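+//
+// Illustrative usage (a sketch only; the buffers, dimensions and offset
+// values are assumptions, not provided by this header):
+//
+//   GemmContext context;
+//   MatrixMap<const std::uint8_t, MapOrder::ColMajor> lhs(lhs_data, rows,
+//                                                         depth, rows);
+//   MatrixMap<const std::uint8_t, MapOrder::ColMajor> rhs(rhs_data, depth,
+//                                                         cols, depth);
+//   MatrixMap<std::uint8_t, MapOrder::ColMajor> result(result_data, rows,
+//                                                      cols, rows);
+//   Gemm(&context, lhs, rhs, &result, lhs_offset, rhs_offset, result_offset,
+//        result_mult_int, result_shift);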
+template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder,
+ MapOrder ResultOrder>
+void Gemm(GemmContext* context, const MatrixMap<const Scalar, LhsOrder>& lhs,
+ const MatrixMap<const Scalar, RhsOrder>& rhs,
+ MatrixMap<Scalar, ResultOrder>* result, int lhs_offset,
+ int rhs_offset, int result_offset, int result_mult_int,
+ int result_shift) {
+ if (rhs.cols() > DefaultKernelForGEMM::Format::kCols / 2) {
+ MultiThreadGemm<DefaultKernelForGEMM::Format>(
+ context, DefaultKernelForGEMM(), lhs, rhs, result, lhs_offset,
+ rhs_offset, result_offset, result_mult_int, result_shift);
+ } else {
+ MultiThreadGemm<DefaultKernelForGEMV::Format>(
+ context, DefaultKernelForGEMV(), lhs, rhs, result, lhs_offset,
+ rhs_offset, result_offset, result_mult_int, result_shift);
+ }
+}
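+
+// Minimal usage sketch (illustrative only; the sizes, buffer contents and
+// quantization parameters below are made up):
+//
+//   const int rows = 2, depth = 3, cols = 2;
+//   std::uint8_t lhs_data[rows * depth];      // row-major
+//   std::uint8_t rhs_data[depth * cols];      // column-major
+//   std::uint8_t result_data[rows * cols];    // column-major
+//   gemmlowp::GemmContext context;
+//   gemmlowp::MatrixMap<const std::uint8_t, gemmlowp::MapOrder::RowMajor>
+//       lhs(lhs_data, rows, depth, depth);
+//   gemmlowp::MatrixMap<const std::uint8_t, gemmlowp::MapOrder::ColMajor>
+//       rhs(rhs_data, depth, cols, depth);
+//   gemmlowp::MatrixMap<std::uint8_t, gemmlowp::MapOrder::ColMajor>
+//       result(result_data, rows, cols, rows);
+//   gemmlowp::Gemm(&context, lhs, rhs, &result,
+//                  /*lhs_offset=*/-128, /*rhs_offset=*/-128,
+//                  /*result_offset=*/0, /*result_mult_int=*/1,
+//                  /*result_shift=*/8);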
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_PUBLIC_GEMMLOWP_H_
diff --git a/public/map.h b/public/map.h
new file mode 100644
index 0000000..ba92c08
--- /dev/null
+++ b/public/map.h
@@ -0,0 +1,77 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// map.h: a minimalist view-existing-buffer-as-a-matrix class,
+// which is how gemmlowp interfaces with external matrix data.
+
+#ifndef GEMMLOWP_PUBLIC_MAP_H_
+#define GEMMLOWP_PUBLIC_MAP_H_
+
+#include "internal/common.h"
+
+namespace gemmlowp {
+
+// The two storage orders allowed to map buffers as matrices: ColMajor
+// means column-major, RowMajor means row-major.
+enum class MapOrder { ColMajor, RowMajor };
+
+// A MatrixMap is a view of an existing buffer as a matrix. It does not own
+// the buffer.
+template <typename tScalar, MapOrder tOrder>
+class MatrixMap {
+ public:
+ typedef tScalar Scalar;
+ static const MapOrder kOrder = tOrder;
+
+ protected:
+ Scalar* data_; // not owned.
+ int rows_, cols_, stride_;
+
+ public:
+ MatrixMap(Scalar* data, int rows, int cols, int stride)
+ : data_(data), rows_(rows), cols_(cols), stride_(stride) {}
+
+ MatrixMap(const MatrixMap& other)
+ : data_(other.data_),
+ rows_(other.rows_),
+ cols_(other.cols_),
+ stride_(other.stride_) {}
+
+ int rows() const { return rows_; }
+ int cols() const { return cols_; }
+ int stride() const { return stride_; }
+ int rows_stride() const { return kOrder == MapOrder::ColMajor ? 1 : stride_; }
+ int cols_stride() const { return kOrder == MapOrder::RowMajor ? 1 : stride_; }
+ Scalar* data() const { return data_; }
+ Scalar* data(int row, int col) const {
+ return data_ + row * rows_stride() + col * cols_stride();
+ }
+ Scalar operator()(int row, int col) const { return *data(row, col); }
+ Scalar& operator()(int row, int col) { return *data(row, col); }
+
+ MatrixMap block(int start_row, int start_col, int block_rows,
+ int block_cols) const {
+ assert(start_row >= 0);
+ assert(start_row + block_rows <= rows_);
+ assert(start_col >= 0);
+ assert(start_col + block_cols <= cols_);
+
+ return MatrixMap(data(start_row, start_col), block_rows, block_cols,
+ stride_);
+ }
+};
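+
+// Illustrative sketch (not part of the library itself): mapping a plain
+// column-major buffer and taking a sub-block view of it.
+//
+//   std::uint8_t buf[6];                              // 3 rows x 2 cols
+//   MatrixMap<std::uint8_t, MapOrder::ColMajor> m(buf, 3, 2, 3);
+//   MatrixMap<std::uint8_t, MapOrder::ColMajor> top = m.block(0, 0, 2, 2);
+//   top(1, 0) = 42;                                   // writes into buf[1]
+//   // m and top are views sharing storage; neither owns buf.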
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_PUBLIC_MAP_H_
diff --git a/scripts/prepare-device-for-benchmarking.sh b/scripts/prepare-device-for-benchmarking.sh
new file mode 100755
index 0000000..16ac2eb
--- /dev/null
+++ b/scripts/prepare-device-for-benchmarking.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+
+# Copyright 2014 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Puts the device in a special state that gives optimal benchmark results.
+# This is not very realistic wrt real-world conditions; we are more interested
+# in performance on devices in their default state.
+
+echo "disabling mpdecision..."
+adb root
+adb remount
+adb shell "mv /system/bin/mpdecision /system/bin/mpdecision-dontfind" > /dev/null
+
+echo "rebooting device..."
+adb reboot
+adb wait-for-device
+
+echo "restarting adbd as root..."
+isroot=0
+while [ $isroot -eq 0 ]
+do
+  adb root > /dev/null
+  if [ $? -eq 0 ]
+  then
+    isroot=1
+  else
+    echo "  retrying in 1 s..."
+    sleep 1
+  fi
+done
+
+echo "querying ro.hardware..."
+hardware="`adb shell getprop ro.hardware`"
+while [ "$hardware" == "" ]
+do
+ echo "retrying in 1 s..."
+ sleep 1
+ hardware="`adb shell getprop ro.hardware`"
+done
+
+echo "got ro.hardware=$hardware"
+
+shouldstopui=0
+if [[ "$*" =~ .*--stop-ui.* ]]
+then
+ shouldstopui=1
+else
+ if [[ "$hardware" =~ sprout.* ]]
+ then
+ echo "detected Android One (sprout). Will default to leaving the UI on."
+ else
+ echo "Will default to stopping the UI."
+ shouldstopui=1
+ fi
+fi
+
+if [ $shouldstopui -ne 0 ]
+then
+ echo "stopping the UI..."
+ isshellstopped=0
+ while [ $isshellstopped -eq 0 ]
+ do
+    adb shell stop > /dev/null
+    if [ $? -eq 0 ]
+    then
+      isshellstopped=1
+    else
+      echo "  retrying in 1 s..."
+      sleep 1
+    fi
+ done
+fi
+
+waitsec=10
+echo "sleeping $waitsec s before changing CPU settings, to work around a race with Android startup..."
+sleep $waitsec
+
+echo "bringing all cores online..."
+for cpu in `seq 0 3`
+do
+ file="/sys/devices/system/cpu/cpu$cpu/online"
+ if [ -e $file ]
+ then
+ echo " cpu $cpu"
+ echo "echo 1 > $file; exit" | adb shell > /dev/null
+ if [ $? -ne 0 ]
+ then
+ echo "WARNING: failed to bring cpu $cpu online ($file)"
+ fi
+ fi
+done
+
+echo "setting performance governor..."
+for cpu in `seq 0 3`
+do
+ file="/sys/devices/system/cpu/cpu$cpu/cpufreq/scaling_governor"
+ if [ -e $file ]
+ then
+ echo " cpu $cpu"
+ adb shell "echo performance > $file" > /dev/null
+ if [ $? -ne 0 ]
+ then
+ echo "WARNING: failed to set cpufreq governor ($file)"
+ fi
+ fi
+done
+
+cpuloadlowsecs=0
+echo "waiting for CPU load to settle down..."
+while [ $cpuloadlowsecs -lt 5 ]
+do
+ cpuload="`adb shell top -n 1 -d 1 -s cpu | awk '{sum += $3} END {print sum}'`"
+ if [ "$cpuload" -lt "2" ]
+ then
+ cpuloadlowsecs=$((cpuloadlowsecs+1))
+ echo "CPU load has been low for $cpuloadlowsecs s..."
+ else
+ cpuloadlowsecs=0
+ echo "CPU load isn't low enough ($cpuload %)..."
+ fi
+ sleep 1
+done
+
+if [ $shouldstopui -eq 0 ]
+then
+  echo "OK, the device might be ready now, but the UI is still running,"
+  echo "so take a look at the screen to check that it isn't doing anything unusual."
+fi
diff --git a/scripts/restore-device-normal-state.sh b/scripts/restore-device-normal-state.sh
new file mode 100755
index 0000000..583a24d
--- /dev/null
+++ b/scripts/restore-device-normal-state.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright 2014 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Undoes the effect of prepare-device-for-benchmarking.sh.
+
+echo "restoring mpdecision..."
+adb root
+adb remount
+adb shell "mv /system/bin/mpdecision-dontfind /system/bin/mpdecision" > /dev/null
+
+echo "rebooting device..."
+adb reboot
+adb wait-for-device
+
+cpuloadlowsecs=0
+echo "waiting for CPU load to settle down..."
+while [ $cpuloadlowsecs -lt 5 ]
+do
+ cpuload="`adb shell top -n 1 -d 1 -s cpu | awk '{sum += $3} END {print sum}'`"
+ if [ "$cpuload" -lt "2" ]
+ then
+ cpuloadlowsecs=$((cpuloadlowsecs+1))
+ echo "CPU load has been low for $cpuloadlowsecs s..."
+ else
+ cpuloadlowsecs=0
+ echo "CPU load isn't low enough ($cpuload %)..."
+ fi
+ sleep 1
+done
diff --git a/scripts/test-android.sh b/scripts/test-android.sh
new file mode 100755
index 0000000..b5d83d7
--- /dev/null
+++ b/scripts/test-android.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+# Copyright 2014 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if [ -z "$CXX" ]
+then
+ echo "please set the CXX environment variable to point to your native Android toolchain C++ compiler"
+ exit 1
+fi
+
+default_cflags="-O3"
+
+if [ "$#" -eq 0 ]
+then
+ echo "Usage: $0 files... [cflags...]"
+ echo "All command-line parameters are passed along to the C++ compiler, so they can \
+be either source files, or compiler flags."
+ echo "Default cflags: $default_cflags"
+ echo "Relies on the CXX environment variable to point to an Android C++ toolchain compiler."
+ exit 1
+fi
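+
+# Example invocation (illustrative; the CXX path is a placeholder for an
+# Android standalone-toolchain compiler):
+#   CXX=/path/to/android-toolchain/bin/clang++ \
+#     ./scripts/test-android.sh test/test.cc eight_bit_int_gemm/eight_bit_int_gemm.cc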
+
+EXE=gemmlowp-android-binary
+
+$CXX \
+ --std=c++11 \
+ -Wall -Wextra -pedantic \
+ -fPIE -pie -mfpu=neon -mfloat-abi=softfp \
+ -lstdc++ -latomic \
+ -I . -I .. \
+ -o $EXE \
+ -Wno-unused-variable -Wno-unused-parameter \
+ $default_cflags \
+  "$@"
+
+if [ $? != 0 ]; then
+ echo "build failed"
+ exit 1
+fi
+
+adb root
+
+if [ $? != 0 ]; then
+ echo "$0: adb root failed"
+ exit 1
+fi
+
+adb shell mkdir -p /data/local/tmp
+
+if [ $? != 0 ]; then
+ echo "$0: adb shell failed to mkdir /data/local/tmp"
+ exit 1
+fi
+
+adb push $EXE /data/local/tmp
+
+if [ $? != 0 ]; then
+ echo "$0: adb push failed to write to /data/local/tmp"
+ exit 1
+fi
+
+adb shell "echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor"
+
+adb shell "/data/local/tmp/$EXE" | tee "log-$EXE"
+
+if [ $? != 0 ]; then
+ echo "$0: adb shell failed to run binary on device"
+ exit 1
+fi
+
+adb shell "echo ondemand > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor"
diff --git a/test/benchmark.cc b/test/benchmark.cc
new file mode 100644
index 0000000..d59164a
--- /dev/null
+++ b/test/benchmark.cc
@@ -0,0 +1,210 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+#ifdef __APPLE__
+#include <sys/time.h>
+#endif
+
+#include <iostream>
+#include <ctime>
+#include <cstdint>
+#include <vector>
+#include <map>
+#include <cstdlib>
+#include <cmath>
+#include <tuple>
+#include <algorithm>
+
+#include "test/test.h"
+#include "public/gemmlowp.h"
+
+namespace gemmlowp {
+
+double time() {
+#ifdef __APPLE__
+ timeval t;
+ gettimeofday(&t, nullptr);
+ return t.tv_sec + 1e-6 * t.tv_usec;
+#else
+ timespec t;
+ clock_gettime(CLOCK_REALTIME, &t);
+ return t.tv_sec + 1e-9 * t.tv_nsec;
+#endif
+}
+
+const double min_accurate_duration = 1e-1;
+const std::size_t min_working_set_size = 16 * 1024 * 1024;
+
+template <typename Kernel, typename LhsType, typename RhsType,
+ typename ResultType>
+double gflops_for_gemm_size(GemmContext* context, int rows, int depth,
+ int cols) {
+ typedef std::uint8_t Scalar;
+
+ // set up the matrix pool
+
+ const std::size_t combined_three_matrices_sizes =
+ sizeof(Scalar) * (rows * depth + depth * cols + rows * cols);
+
+ const std::size_t matrix_pool_size =
+ 1 + min_working_set_size / combined_three_matrices_sizes;
+
+ std::vector<LhsType> lhs(matrix_pool_size);
+ std::vector<RhsType> rhs(matrix_pool_size);
+ std::vector<ResultType> result(matrix_pool_size);
+
+ lhs[0].Resize(rows, depth);
+ MakeConstant(&lhs[0], 128);
+ rhs[0].Resize(depth, cols);
+ MakeConstant(&rhs[0], 128);
+ result[0].Resize(rows, cols);
+ MakeZero(&result[0]);
+
+ for (std::size_t i = 1; i < matrix_pool_size; i++) {
+ lhs[i] = lhs[0];
+ rhs[i] = rhs[0];
+ result[i] = result[0];
+ }
+
+ const int depth_shift = static_cast<int>(
+ std::ceil(0.5 * std::log(static_cast<float>(depth)) / std::log(2.0f)));
+
+ // main benchmark loop
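+  // Strategy: run batches of Gemm calls, doubling the batch size until one
+  // batch takes at least min_accurate_duration, so that timer resolution does
+  // not dominate the measurement. Rotating matrix_index through the matrix
+  // pool keeps the combined working set above min_working_set_size, so that
+  // the inputs are not artificially hot in cache.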
+
+ int iters_at_a_time = 1;
+ float time_per_iter = 0.0f;
+ std::size_t matrix_index = 0;
+
+ while (true) {
+ double starttime = time();
+ for (int i = 0; i < iters_at_a_time; i++) {
+ Gemm(context, lhs[matrix_index].const_map(),
+ rhs[matrix_index].const_map(), &result[matrix_index].map(), -75, -91,
+ 74980, 123, 18 + depth_shift);
+
+ matrix_index++;
+ if (matrix_index == matrix_pool_size) {
+ matrix_index = 0;
+ }
+ }
+ double endtime = time();
+
+ const float timing = static_cast<float>(endtime - starttime);
+
+ if (timing >= min_accurate_duration) {
+ time_per_iter = timing / iters_at_a_time;
+ break;
+ }
+
+ iters_at_a_time *= 2;
+ }
+
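+  // Each of the rows*cols result entries takes depth multiply-accumulates,
+  // i.e. 2*depth arithmetic ops, hence 2*rows*depth*cols ops per GEMM;
+  // the 1e-9 factor converts ops/s into Gop/s.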
+ return 2e-9 * rows * depth * cols / time_per_iter;
+}
+
+void benchmark(GemmContext* context) {
+#ifdef GEMMLOWP_TEST_KERNEL
+ typedef gemmlowp::GEMMLOWP_TEST_KERNEL KernelForGEMM;
+ typedef gemmlowp::GEMMLOWP_TEST_KERNEL KernelForGEMV;
+#else
+ typedef gemmlowp::DefaultKernelForGEMM KernelForGEMM;
+ typedef gemmlowp::DefaultKernelForGEMV KernelForGEMV;
+#endif
+
+ std::map<std::tuple<int, int, int>, std::vector<double>> benchmark_results;
+
+ std::vector<std::tuple<int, int, int>> benchmark_sizes;
+ benchmark_sizes.emplace_back(10, 10, 10);
+ benchmark_sizes.emplace_back(20, 20, 20);
+ benchmark_sizes.emplace_back(30, 30, 30);
+ benchmark_sizes.emplace_back(40, 40, 40);
+ benchmark_sizes.emplace_back(50, 50, 50);
+ benchmark_sizes.emplace_back(60, 60, 60);
+ benchmark_sizes.emplace_back(64, 256, 147);
+ benchmark_sizes.emplace_back(100, 100, 1);
+ benchmark_sizes.emplace_back(100, 100, 100);
+ benchmark_sizes.emplace_back(100, 1000, 100);
+ benchmark_sizes.emplace_back(1000, 1000, 1);
+ benchmark_sizes.emplace_back(1000, 1000, 10);
+ benchmark_sizes.emplace_back(1000, 1000, 100);
+ benchmark_sizes.emplace_back(1000, 1000, 1000);
+
+ const int repeat = 2;
+
+ typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType;
+ typedef Matrix<std::uint8_t, MapOrder::ColMajor> RhsType;
+ typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType;
+
+#ifdef GEMMLOWP_TEST_PROFILE
+ gemmlowp::RegisterCurrentThreadForProfiling();
+ gemmlowp::StartProfiling();
+#endif
+
+  // We don't record the first repetition; it's just warm-up.
+ for (int r = 0; r < repeat + 1; r++) {
+ std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r"
+ << std::flush;
+ for (auto s : benchmark_sizes) {
+ double gflops = 0;
+ int rows = std::get<0>(s);
+ int depth = std::get<1>(s);
+ int cols = std::get<2>(s);
+ if (cols > KernelForGEMM::Format::kCols / 2) {
+ gflops =
+ gflops_for_gemm_size<KernelForGEMM, LhsType, RhsType, ResultType>(
+ context, rows, depth, cols);
+ } else {
+ gflops =
+ gflops_for_gemm_size<KernelForGEMV, LhsType, RhsType, ResultType>(
+ context, rows, depth, cols);
+ }
+ if (r > 0) {
+ benchmark_results[s].emplace_back(gflops);
+ }
+ }
+ }
+
+ std::cout << " \r"
+ << std::flush;
+
+#ifdef GEMMLOWP_TEST_PROFILE
+ gemmlowp::FinishProfiling();
+#endif
+
+ std::cout.precision(4);
+
+ for (auto b : benchmark_results) {
+    std::sort(b.second.begin(), b.second.end());
+ std::cout << std::get<0>(b.first) << "x" << std::get<1>(b.first) << "x"
+ << std::get<2>(b.first) << " : " << b.second.back() << " GFlops/s"
+ << std::endl;
+ }
+ std::cout << std::endl;
+}
+
+} // end namespace gemmlowp
+
+int main() {
+ {
+ gemmlowp::GemmContext context;
+ std::cout << "Benchmarking default mode (typically multi-threaded)..."
+ << std::endl;
+ gemmlowp::benchmark(&context);
+ }
+
+ {
+ gemmlowp::GemmContext context;
+ context.set_max_num_threads(1);
+ std::cout << "Benchmarking single-threaded mode..." << std::endl;
+ gemmlowp::benchmark(&context);
+ }
+}
diff --git a/test/test.cc b/test/test.cc
new file mode 100644
index 0000000..5574640
--- /dev/null
+++ b/test/test.cc
@@ -0,0 +1,526 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/test.h"
+
+#include <unistd.h>
+#include <iostream>
+#include <ctime>
+#include <cstdint>
+#include <vector>
+#include <cstdlib>
+#include <string>
+#include <cstdio>
+#include <algorithm>
+
+#include "public/gemmlowp.h"
+#include "internal/kernel_reference.h"
+#include "eight_bit_int_gemm/eight_bit_int_gemm.h"
+
+namespace gemmlowp {
+
+struct ReferenceEightBitIntGemmContext {
+ ReferenceEightBitIntGemmContext()
+ : saturated_0_values(0), saturated_255_values(0) {}
+
+ int saturated_0_values, saturated_255_values;
+};
+
+void ReferenceEightBitIntGemm(ReferenceEightBitIntGemmContext* context, int m,
+ int n, int k, const uint8_t* a, int32_t a_offset,
+ int lda, const uint8_t* b, int32_t b_offset,
+ int ldb, uint8_t* c, int32_t c_offset,
+ int32_t c_mult_int, int32_t c_shift, int ldc) {
+ context->saturated_0_values = 0;
+ context->saturated_255_values = 0;
+ int i, j, l;
+
+ for (j = 0; j < n; j++) {
+ for (i = 0; i < m; i++) {
+ int32_t total = 0;
+ for (l = 0; l < k; l++) {
+ const int a_index = ((i * lda) + l);
+ const uint8_t a_as_byte = a[a_index];
+ const int32_t a_as_int = ((static_cast<std::int32_t>(a_as_byte)) + a_offset);
+ const int b_index = ((j * ldb) + l);
+ const uint8_t b_as_byte = b[b_index];
+ const int32_t b_as_int = ((static_cast<std::int32_t>(b_as_byte)) + b_offset);
+ const int32_t mult_as_int = (a_as_int * b_as_int);
+ total += mult_as_int;
+ }
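+      // Requantize: add the result offset, scale by the integer multiplier,
+      // and round to nearest by adding half the divisor before the right
+      // shift; e.g. total=1000, c_offset=24, c_mult_int=3, c_shift=5 gives
+      // (1024 * 3 + 16) >> 5 = 96. Results are then clamped to [0, 255].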
+ const int c_index = ((ldc * i) + j);
+ int32_t output =
+ ((((total + c_offset) * c_mult_int) + (1 << (c_shift - 1))) >>
+ c_shift);
+ if (output >= 255) {
+ output = 255;
+ context->saturated_255_values++;
+ }
+ if (output <= 0) {
+ output = 0;
+ context->saturated_0_values++;
+ }
+ c[c_index] = static_cast<std::uint8_t>(output);
+ }
+ }
+}
+
+// The *GemmWrapper classes wrap the various Gemm functions in a uniform
+// interface, so that we can use the same testing code to test all of them.
+
+template <typename Kernel, typename Scalar, MapOrder LhsOrder,
+ MapOrder RhsOrder, MapOrder ResultOrder>
+struct SingleThreadGemmWrapper {
+ static const int kLhsBitDepth = Kernel::kLhsBitDepth;
+ static const int kRhsBitDepth = Kernel::kRhsBitDepth;
+
+ static const char* Name() {
+ static char buf[256];
+ snprintf(buf, sizeof(buf),
+ "SingleThreadGemm, Kernel: %s", Kernel().Name());
+ return buf;
+ }
+
+ typedef SingleThreadGemmContext Context;
+
+ static void Gemm(Context* context,
+ const MatrixMap<const Scalar, LhsOrder>& lhs,
+ const MatrixMap<const Scalar, RhsOrder>& rhs,
+ MatrixMap<Scalar, ResultOrder>* result, int lhs_offset,
+ int rhs_offset, int result_offset, int result_mult_int,
+ int result_shift) {
+ SingleThreadGemm<typename Kernel::Format, Scalar, LhsOrder, RhsOrder,
+ ResultOrder>(context, Kernel(), lhs, rhs, result,
+ lhs_offset, rhs_offset, result_offset,
+ result_mult_int, result_shift);
+ }
+};
+
+template <typename Kernel, typename Scalar, MapOrder LhsOrder,
+ MapOrder RhsOrder, MapOrder ResultOrder>
+struct MultiThreadGemmWrapper {
+ static const int kLhsBitDepth = Kernel::kLhsBitDepth;
+ static const int kRhsBitDepth = Kernel::kRhsBitDepth;
+
+ static const char* Name() {
+ static char buf[256];
+ snprintf(buf, sizeof(buf),
+ "MultiThreadGemm, Kernel: %s", Kernel().Name());
+ return buf;
+ }
+
+ typedef MultiThreadGemmContext Context;
+
+ static void Gemm(Context* context,
+ const MatrixMap<const Scalar, LhsOrder>& lhs,
+ const MatrixMap<const Scalar, RhsOrder>& rhs,
+ MatrixMap<Scalar, ResultOrder>* result, int lhs_offset,
+ int rhs_offset, int result_offset, int result_mult_int,
+ int result_shift) {
+ MultiThreadGemm<typename Kernel::Format, Scalar, LhsOrder, RhsOrder,
+ ResultOrder>(context, Kernel(), lhs, rhs, result,
+ lhs_offset, rhs_offset, result_offset,
+ result_mult_int, result_shift);
+ }
+};
+
+template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder,
+ MapOrder ResultOrder>
+struct PublicGemmWrapper {
+ static const int kLhsBitDepth = 8;
+ static const int kRhsBitDepth = 8;
+
+ static const char* Name() { return "public Gemm"; }
+
+ typedef GemmContext Context;
+
+ static void Gemm(Context* context,
+ const MatrixMap<const Scalar, LhsOrder>& lhs,
+ const MatrixMap<const Scalar, RhsOrder>& rhs,
+ MatrixMap<Scalar, ResultOrder>* result, int lhs_offset,
+ int rhs_offset, int result_offset, int result_mult_int,
+ int result_shift) {
+ gemmlowp::Gemm<uint8_t, LhsOrder, RhsOrder, ResultOrder>(
+ context, lhs, rhs, result, lhs_offset, rhs_offset, result_offset,
+ result_mult_int, result_shift);
+ }
+};
+
+template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder,
+ MapOrder ResultOrder>
+struct EightBitIntGemmWrapper {
+ static const int kLhsBitDepth = 8;
+ static const int kRhsBitDepth = 8;
+
+ static const char* Name() { return "EightBitIntGemm"; }
+
+ typedef void Context;
+
+ static void Gemm(Context*, const MatrixMap<const Scalar, LhsOrder>& lhs,
+ const MatrixMap<const Scalar, RhsOrder>& rhs,
+ MatrixMap<Scalar, ResultOrder>* result, int lhs_offset,
+ int rhs_offset, int result_offset, int result_mult_int,
+ int result_shift) {
+ eight_bit_int_gemm::EightBitIntGemm(
+ rhs.cols(), lhs.rows(), lhs.cols(), rhs.data(), rhs_offset,
+ rhs.stride(), lhs.data(), lhs_offset, lhs.stride(), result->data(),
+ result_offset, result_mult_int, result_shift, result->stride());
+ }
+};
+
+template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder,
+ MapOrder ResultOrder>
+struct ReferenceEightBitIntGemmWrapper {
+ static const int kLhsBitDepth = 8;
+ static const int kRhsBitDepth = 8;
+
+ static const char* Name() { return "ReferenceEightBitIntGemm"; }
+
+ typedef ReferenceEightBitIntGemmContext Context;
+
+ static void Gemm(Context* context,
+ const MatrixMap<const Scalar, LhsOrder>& lhs,
+ const MatrixMap<const Scalar, RhsOrder>& rhs,
+ MatrixMap<Scalar, ResultOrder>* result, int lhs_offset,
+ int rhs_offset, int result_offset, int result_mult_int,
+ int result_shift) {
+ ReferenceEightBitIntGemm(
+ context, rhs.cols(), lhs.rows(), lhs.cols(), rhs.data(), rhs_offset,
+ rhs.stride(), lhs.data(), lhs_offset, lhs.stride(), result->data(),
+ result_offset, result_mult_int, result_shift, result->stride());
+ }
+};
+
+// Our approach to choosing result_shift values for testing is bisection.
+// This function takes an interval, [result_shift_min .. result_shift_max].
+// If too much saturation occurred in either direction, it bisects accordingly,
+// recursing until the interval contains only one value.
+// The primary reason why we prefer this over computing optimal shift values
+// is that we actually want to exercise some saturation, as there is nontrivial
+// code handling that in gemmlowp.
+// Secondarily, this is faster than computing optimal shifts, since in 90% of
+// cases the first-tried shift value of 16 turns out to be good enough.
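+// For example, starting from [0 .. 32], the first tried shift is 16; if too
+// many results saturate to 255 (shift too small), we recurse on [16 .. 32];
+// if too many saturate to 0 (shift too large), we recurse on [0 .. 16].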
+template <typename GemmWrapper, typename LhsType, typename RhsType,
+ typename ResultType>
+void test_gemm_impl(typename GemmWrapper::Context* context, const LhsType& lhs,
+ const RhsType& rhs, ResultType* result, int lhs_offset,
+ int rhs_offset, int result_offset, int result_mult_int,
+ int result_shift_min, int result_shift_max) {
+ const int rows = lhs.rows();
+ const int cols = rhs.cols();
+ Check(lhs.cols() == rhs.rows());
+ const int depth = lhs.cols();
+
+ const int result_shift = (result_shift_min + result_shift_max) / 2;
+
+ GemmWrapper::Gemm(context, lhs.const_map(), rhs.const_map(), &result->map(),
+ lhs_offset, rhs_offset, result_offset, result_mult_int,
+ result_shift);
+
+ typedef typename ResultType::Scalar Scalar;
+ static const MapOrder kLhsOrder = LhsType::kOrder;
+ static const MapOrder kRhsOrder = RhsType::kOrder;
+ static const MapOrder kResultOrder = ResultType::kOrder;
+ ResultType ref_result(rows, cols);
+ ReferenceEightBitIntGemmContext reference_context;
+ ReferenceEightBitIntGemmWrapper<Scalar, kLhsOrder, kRhsOrder,
+ kResultOrder>::Gemm(&reference_context,
+ lhs.const_map(),
+ rhs.const_map(),
+ &ref_result.map(),
+ lhs_offset, rhs_offset,
+ result_offset,
+ result_mult_int,
+ result_shift);
+
+ const bool good = *result == ref_result;
+ printf("%s: %dx%dx%d, %s, offsets %d/%d/%d, mult %d, shift %d\n",
+ good ? "PASS" : "FAIL", rows, depth, cols, GemmWrapper::Name(),
+ lhs_offset, rhs_offset, result_offset, result_mult_int, result_shift);
+
+ if (!good) {
+ int maxdiff = 0;
+ int countdiff = 0;
+ for (int c = 0; c < result->cols(); c++) {
+ for (int r = 0; r < result->rows(); r++) {
+ int a = (*result)(r, c);
+ int b = ref_result(r, c);
+ if (a != b) {
+ countdiff++;
+ maxdiff = std::max(maxdiff, std::abs(a - b));
+ }
+ }
+ }
+ printf("max difference: %d\n", maxdiff);
+ printf("number of different places: %d\n", countdiff);
+ int bad_coeffs_printed = 0;
+ for (int c = 0; c < result->cols() && bad_coeffs_printed < 20; c++) {
+ for (int r = 0; r < result->rows() && bad_coeffs_printed < 20; r++) {
+ if (ref_result(r, c) != (*result)(r, c)) {
+ printf("bad coeff: at (%d, %d), expected %d, got %d\n", r, c,
+ ref_result(r, c), (*result)(r, c));
+ bad_coeffs_printed++;
+ }
+ }
+ }
+ }
+
+ Check(good);
+
+ if (result_shift_min != result_shift_max) {
+ const int max_allowed_saturated_values = result->size() / 16;
+
+ int new_result_shift_min = result_shift_min;
+ int new_result_shift_max = result_shift_max;
+ bool retry = false;
+
+ if (reference_context.saturated_0_values > max_allowed_saturated_values) {
+ new_result_shift_max = (result_shift_min + result_shift_max) / 2;
+ retry = true;
+ }
+
+ if (reference_context.saturated_255_values > max_allowed_saturated_values) {
+ new_result_shift_min = (result_shift_min + result_shift_max) / 2;
+ retry = true;
+ }
+
+ if (retry) {
+ test_gemm_impl<GemmWrapper>(context, lhs, rhs, result, lhs_offset,
+ rhs_offset, result_offset, result_mult_int,
+ new_result_shift_min, new_result_shift_max);
+ }
+ }
+}
+
+template <typename GemmWrapper, typename LhsType, typename RhsType,
+ typename ResultType>
+void test_gemm(typename GemmWrapper::Context* context, const LhsType& lhs,
+ const RhsType& rhs, ResultType* result, int lhs_offset,
+ int rhs_offset, int result_offset, int result_mult_int) {
+ test_gemm_impl<GemmWrapper>(context, lhs, rhs, result, lhs_offset, rhs_offset,
+ result_offset, result_mult_int, 0, 32);
+}
+
+enum class WhatParamsToTest {
+ AllCombos,
+ OnlyGenericCase,
+};
+
+template <typename GemmWrapper>
+void test_gemm(typename GemmWrapper::Context* context, int rows, int depth,
+ int cols,
+ WhatParamsToTest what_to_test = WhatParamsToTest::AllCombos) {
+ typedef std::uint8_t Scalar;
+ typedef Matrix<Scalar, MapOrder::RowMajor> LhsType;
+ LhsType lhs(rows, depth);
+ MakeRandom(&lhs, GemmWrapper::kLhsBitDepth);
+ typedef Matrix<Scalar, MapOrder::ColMajor> RhsType;
+ RhsType rhs(depth, cols);
+ MakeRandom(&rhs, GemmWrapper::kRhsBitDepth);
+ typedef Matrix<Scalar, MapOrder::ColMajor> ResultType;
+ ResultType result(rows, cols);
+ MakeZero(&result);
+
+ if (what_to_test == WhatParamsToTest::AllCombos) {
+ test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 0, 0, 1);
+ test_gemm<GemmWrapper>(context, lhs, rhs, &result, 10, 0, 0, 1);
+ test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 10, 0, 1);
+ test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 0, 10, 1);
+ test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 0, 0, 10);
+ test_gemm<GemmWrapper>(context, lhs, rhs, &result, 10, 10, 10, 10);
+ test_gemm<GemmWrapper>(context, lhs, rhs, &result, 256, 1, 17, 4);
+ }
+ test_gemm<GemmWrapper>(context, lhs, rhs, &result, -75, -91, 74980, 123);
+}
+
+template <typename Kernel>
+void test_gemm_kernel(MultiThreadGemmContext* context) {
+ typedef MultiThreadGemmWrapper<Kernel, uint8_t, MapOrder::RowMajor,
+ MapOrder::ColMajor,
+ MapOrder::ColMajor> GemmWrapper;
+ test_gemm<GemmWrapper>(context, 1, 1, 1, WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 2, 2, 2, WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 3, 3, 3, WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 4, 4, 4, WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 5, 5, 5, WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 9, 11, 13, WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 50, 50, 50, WhatParamsToTest::AllCombos);
+ test_gemm<GemmWrapper>(context, 500, 500, 500,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 100, 5000, 100,
+ WhatParamsToTest::OnlyGenericCase);
+}
+
+template <typename GemmWrapper>
+void test_gemm(typename GemmWrapper::Context* context) {
+ test_gemm<GemmWrapper>(context, 1, 1, 1);
+ test_gemm<GemmWrapper>(context, 2, 1, 1);
+ test_gemm<GemmWrapper>(context, 1, 2, 1);
+ test_gemm<GemmWrapper>(context, 1, 1, 2);
+ test_gemm<GemmWrapper>(context, 2, 2, 2);
+ test_gemm<GemmWrapper>(context, 3, 3, 3);
+ test_gemm<GemmWrapper>(context, 4, 4, 4);
+ test_gemm<GemmWrapper>(context, 5, 5, 5);
+ test_gemm<GemmWrapper>(context, 6, 6, 6);
+ test_gemm<GemmWrapper>(context, 3, 5, 7);
+ test_gemm<GemmWrapper>(context, 7, 3, 5);
+ test_gemm<GemmWrapper>(context, 5, 7, 3);
+ test_gemm<GemmWrapper>(context, 8, 8, 8);
+
+ test_gemm<GemmWrapper>(context, 16, 16, 16);
+ test_gemm<GemmWrapper>(context, 32, 32, 32);
+ test_gemm<GemmWrapper>(context, 64, 64, 64);
+ test_gemm<GemmWrapper>(context, 128, 128, 128);
+
+ test_gemm<GemmWrapper>(context, 17, 24, 31);
+ test_gemm<GemmWrapper>(context, 37, 55, 73);
+ test_gemm<GemmWrapper>(context, 57, 87, 117);
+ test_gemm<GemmWrapper>(context, 93, 83, 73);
+ test_gemm<GemmWrapper>(context, 109, 89, 99);
+ test_gemm<GemmWrapper>(context, 78, 101, 82);
+
+ test_gemm<GemmWrapper>(context, 512, 512, 512,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 1024, 1024, 1024,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 567, 2345, 123,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 100, 5000, 100,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 1, 1, 1000,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 1000, 1, 1,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 1, 1000, 1,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 1, 1000, 1000,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 1000, 1, 1000,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 1000, 1000, 1,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 777, 3456, 1,
+ WhatParamsToTest::OnlyGenericCase);
+ test_gemm<GemmWrapper>(context, 4567, 555, 1,
+ WhatParamsToTest::OnlyGenericCase);
+}
+
+template <typename GemmWrapper>
+void test_gemv(typename GemmWrapper::Context* context) {
+ test_gemm<GemmWrapper>(context, 2, 2, 1);
+ test_gemm<GemmWrapper>(context, 3, 3, 1);
+ test_gemm<GemmWrapper>(context, 4, 4, 1);
+ test_gemm<GemmWrapper>(context, 5, 5, 1);
+ test_gemm<GemmWrapper>(context, 6, 6, 1);
+ test_gemm<GemmWrapper>(context, 3, 5, 1);
+ test_gemm<GemmWrapper>(context, 7, 3, 1);
+ test_gemm<GemmWrapper>(context, 5, 7, 1);
+ test_gemm<GemmWrapper>(context, 8, 8, 1);
+ test_gemm<GemmWrapper>(context, 32, 32, 1);
+ test_gemm<GemmWrapper>(context, 128, 128, 1);
+ test_gemm<GemmWrapper>(context, 321, 123, 1);
+}
+
+void test() {
+#ifdef GEMMLOWP_TEST_PROFILE
+ RegisterCurrentThreadForProfiling();
+ StartProfiling();
+#endif
+
+ GemmContext context;
+
+ // Test the internal GEMM interfaces
+ test_gemm<
+ SingleThreadGemmWrapper<DefaultKernelForGEMM, uint8_t, MapOrder::RowMajor,
+ MapOrder::ColMajor, MapOrder::ColMajor>>(
+ &context);
+
+ test_gemm<
+ MultiThreadGemmWrapper<DefaultKernelForGEMM, uint8_t, MapOrder::RowMajor,
+ MapOrder::ColMajor, MapOrder::ColMajor>>(&context);
+
+ // Test the public GEMM interfaces
+ test_gemm<PublicGemmWrapper<uint8_t, MapOrder::RowMajor, MapOrder::ColMajor,
+ MapOrder::ColMajor>>(&context);
+
+ test_gemm<EightBitIntGemmWrapper<uint8_t, MapOrder::RowMajor,
+ MapOrder::ColMajor, MapOrder::ColMajor>>(
+ &context);
+
+ // Test GEMV cases (internal interfaces)
+ test_gemv<
+ SingleThreadGemmWrapper<DefaultKernelForGEMV, uint8_t, MapOrder::RowMajor,
+ MapOrder::ColMajor, MapOrder::ColMajor>>(
+ &context);
+
+ test_gemv<
+ MultiThreadGemmWrapper<DefaultKernelForGEMV, uint8_t, MapOrder::RowMajor,
+ MapOrder::ColMajor, MapOrder::ColMajor>>(&context);
+
+ // Test GEMV cases (public interfaces)
+ test_gemv<PublicGemmWrapper<uint8_t, MapOrder::RowMajor, MapOrder::ColMajor,
+ MapOrder::ColMajor>>(&context);
+
+ test_gemv<EightBitIntGemmWrapper<uint8_t, MapOrder::RowMajor,
+ MapOrder::ColMajor, MapOrder::ColMajor>>(
+ &context);
+
+  // Test specific kernels with various formats,
+  // to exercise corner cases, especially in the packing code.
+ test_gemm_kernel<
+ ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<1, 1>, 1>,
+ KernelSideFormat<CellFormat<1, 1>, 1>>>>(
+ &context);
+
+ test_gemm_kernel<
+ ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<3, 4>, 2>,
+ KernelSideFormat<CellFormat<5, 4>, 3>>>>(
+ &context);
+
+ test_gemm_kernel<
+ ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<5, 3>, 3>,
+ KernelSideFormat<CellFormat<4, 3>, 2>>>>(
+ &context);
+
+ test_gemm_kernel<
+ ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 3>, 3>,
+ KernelSideFormat<CellFormat<4, 3>, 1>>>>(
+ &context);
+
+ test_gemm_kernel<
+ ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 3>, 3>,
+ KernelSideFormat<CellFormat<2, 3>, 2>>>>(
+ &context);
+
+// Test all our optimized kernels, even if they are not used
+// at the moment, as they might be handy later and so it's
+// useful to keep them functional for now.
+#ifdef GEMMLOWP_NEON
+ test_gemm_kernel<gemmlowp::NEONKernel12x4Depth2>(&context);
+ test_gemm_kernel<gemmlowp::NEONKernel20x1Depth4>(&context);
+ test_gemm_kernel<gemmlowp::NEONKernel8x1Depth4>(&context);
+#endif
+
+#ifdef GEMMLOWP_TEST_PROFILE
+ FinishProfiling();
+#endif
+
+ std::cerr << "All tests passed." << std::endl;
+
+  // We have been testing the eight_bit_int_gemm, so we should free its
+  // persistent resources now to avoid having leak-checking tools report
+  // leaks.
+ eight_bit_int_gemm::FreePersistentResources();
+}
+
+} // end namespace gemmlowp
+
+int main() { gemmlowp::test(); }
diff --git a/test/test.h b/test/test.h
new file mode 100644
index 0000000..f630674
--- /dev/null
+++ b/test/test.h
@@ -0,0 +1,118 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// test.h: shared testing helpers.
+
+#ifndef GEMMLOWP_TEST_TEST_H_
+#define GEMMLOWP_TEST_TEST_H_
+
+#ifdef GEMMLOWP_TEST_PROFILE
+#define GEMMLOWP_PROFILING
+#include "profiling/profiler.h"
+#endif
+
+#include <iostream>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+#include "public/map.h"
+
+namespace gemmlowp {
+
+inline int Random() {
+ // Use ugly old rand() since this doesn't need to be high quality.
+ return rand();
+}
+
+inline void Check(bool b) { ReleaseBuildAssertion(b, "test failed"); }
+
+// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class,
+// since it only maps existing data. In tests though, we need to
+// create our own matrices.
+template <typename tScalar, MapOrder tOrder>
+class Matrix : public MatrixMap<tScalar, tOrder> {
+ public:
+ typedef MatrixMap<tScalar, tOrder> Map;
+ typedef MatrixMap<const tScalar, tOrder> ConstMap;
+ typedef typename Map::Scalar Scalar;
+ using Map::kOrder;
+ using Map::rows_;
+ using Map::cols_;
+ using Map::stride_;
+ using Map::data_;
+
+ public:
+ Matrix() : Map(nullptr, 0, 0, 0) {}
+
+ Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }
+
+  Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; }
+
+ Matrix& operator=(const Matrix& other) {
+ Resize(other.rows_, other.cols_);
+ memcpy(data_, other.data_, size() * sizeof(Scalar));
+ return *this;
+ }
+
+  friend bool operator==(const Matrix& a, const Matrix& b) {
+    return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
+           !memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
+  }
+
+ void Resize(int rows, int cols) {
+ rows_ = rows;
+ cols_ = cols;
+ stride_ = kOrder == MapOrder::ColMajor ? rows : cols;
+ storage.resize(size());
+ data_ = storage.data();
+ }
+
+ int size() const { return rows_ * cols_; }
+
+ Map& map() { return *static_cast<Map*>(this); }
+
+ ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }
+
+ protected:
+ std::vector<Scalar> storage;
+};
+
+template <typename MatrixType>
+void MakeRandom(MatrixType* m, int bits) {
+ typedef typename MatrixType::Scalar Scalar;
+ const Scalar mask = (1 << bits) - 1;
+ for (int c = 0; c < m->cols(); c++) {
+ for (int r = 0; r < m->rows(); r++) {
+ (*m)(r, c) = Random() & mask;
+ }
+ }
+}
+
+template <typename MatrixType>
+void MakeConstant(MatrixType* m, typename MatrixType::Scalar val) {
+ for (int c = 0; c < m->cols(); c++) {
+ for (int r = 0; r < m->rows(); r++) {
+ (*m)(r, c) = val;
+ }
+ }
+}
+
+template <typename MatrixType>
+void MakeZero(MatrixType* m) {
+ MakeConstant(m, 0);
+}
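+
+// Illustrative sketch (test-only): how the helpers above are typically
+// combined to build inputs for a GEMM test. The sizes here are arbitrary.
+//
+//   Matrix<std::uint8_t, MapOrder::RowMajor> lhs(3, 4);
+//   MakeRandom(&lhs, 8);                   // entries uniform in [0, 255]
+//   Matrix<std::uint8_t, MapOrder::ColMajor> rhs(4, 2);
+//   MakeRandom(&rhs, 8);
+//   Matrix<std::uint8_t, MapOrder::ColMajor> result(3, 2);
+//   MakeZero(&result);
+//   // lhs.const_map(), rhs.const_map() and result.map() are then passed to
+//   // the Gemm implementation under test.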
+
+} // namespace gemmlowp
+
+#endif // GEMMLOWP_TEST_TEST_H_
diff --git a/test/test_allocator.cc b/test/test_allocator.cc
new file mode 100644
index 0000000..9e76b79
--- /dev/null
+++ b/test/test_allocator.cc
@@ -0,0 +1,54 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstring>
+
+#include "test/test.h"
+#include "internal/allocator.h"
+
+namespace gemmlowp {
+
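+// Exercises the Allocator's two-phase protocol: Reserve() handles during the
+// layout phase, Commit() to perform the single underlying allocation,
+// GetPointer() to resolve handles into aligned, non-overlapping pointers,
+// and Decommit() once the storage is no longer needed.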
+void test_allocator(Allocator* a, int max_array_size) {
+ const std::size_t int32_array_size = Random() % max_array_size;
+ auto handle_to_int32_array = a->Reserve<std::int32_t>(int32_array_size);
+ const std::size_t int8_array_size = Random() % max_array_size;
+ auto handle_to_int8_array = a->Reserve<std::int8_t>(int8_array_size);
+ a->Commit();
+ std::int32_t* int32_array =
+ a->GetPointer<std::int32_t>(handle_to_int32_array);
+ std::int8_t* int8_array = a->GetPointer<std::int8_t>(handle_to_int8_array);
+ Check(int32_array == a->GetPointer<std::int32_t>(handle_to_int32_array));
+ Check(int8_array == a->GetPointer<std::int8_t>(handle_to_int8_array));
+ Check(
+ !(reinterpret_cast<std::uintptr_t>(int32_array) % Allocator::kAlignment));
+ Check(
+ !(reinterpret_cast<std::uintptr_t>(int8_array) % Allocator::kAlignment));
+ Check(reinterpret_cast<std::uintptr_t>(int8_array) >=
+ reinterpret_cast<std::uintptr_t>(int32_array + int32_array_size));
+ memset(int32_array, 0, sizeof(*int32_array) * int32_array_size);
+ memset(int8_array, 0, sizeof(*int8_array) * int8_array_size);
+ a->Decommit();
+}
+
+void test_allocator() {
+ Allocator allocator;
+
+ // Test allocating increasingly large sizes on the same allocator,
+ // starting with size 0.
+ for (int i = 1; i < 1000; i += 10) {
+ test_allocator(&allocator, i);
+ }
+}
+
+} // namespace gemmlowp
+
+int main() { gemmlowp::test_allocator(); }
diff --git a/test/test_blocking_counter.cc b/test/test_blocking_counter.cc
new file mode 100644
index 0000000..1fa3c9d
--- /dev/null
+++ b/test/test_blocking_counter.cc
@@ -0,0 +1,102 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/test.h"
+
+#include <pthread.h>
+#include <vector>
+
+#include "internal/multi_thread_gemm.h"
+
+namespace gemmlowp {
+
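+// Test helper: each Thread decrements the shared BlockingCounter a fixed
+// number of times and records whether one of its decrements was the final
+// one (i.e. DecrementCount() returned true). Across all threads, exactly
+// one decrement should turn out to be the final one.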
+class Thread {
+ public:
+ Thread(BlockingCounter* blocking_counter, int number_of_times_to_decrement)
+ : blocking_counter_(blocking_counter),
+ number_of_times_to_decrement_(number_of_times_to_decrement),
+ made_the_last_decrement_(false) {
+ pthread_create(&thread_, nullptr, ThreadFunc, this);
+ }
+
+ ~Thread() { Join(); }
+
+ bool Join() const {
+ pthread_join(thread_, nullptr);
+ return made_the_last_decrement_;
+ }
+
+ private:
+ Thread(const Thread& other) = delete;
+
+ void ThreadFunc() {
+ for (int i = 0; i < number_of_times_to_decrement_; i++) {
+ Check(!made_the_last_decrement_);
+ made_the_last_decrement_ = blocking_counter_->DecrementCount();
+ }
+ }
+
+ static void* ThreadFunc(void* ptr) {
+ static_cast<Thread*>(ptr)->ThreadFunc();
+ return nullptr;
+ }
+
+ BlockingCounter* const blocking_counter_;
+ const int number_of_times_to_decrement_;
+ pthread_t thread_;
+ bool made_the_last_decrement_;
+};
+
+void test_blocking_counter(BlockingCounter* blocking_counter, int num_threads,
+ int num_decrements_per_thread,
+ int num_decrements_to_wait_for) {
+ std::vector<Thread*> threads;
+ blocking_counter->Reset(num_decrements_to_wait_for);
+ for (int i = 0; i < num_threads; i++) {
+ threads.push_back(new Thread(blocking_counter, num_decrements_per_thread));
+ }
+ blocking_counter->Wait();
+
+ int num_threads_that_made_the_last_decrement = 0;
+ for (int i = 0; i < num_threads; i++) {
+ if (threads[i]->Join()) {
+ num_threads_that_made_the_last_decrement++;
+ }
+ delete threads[i];
+ }
+ Check(num_threads_that_made_the_last_decrement == 1);
+}
+
+void test_blocking_counter() {
+ BlockingCounter* blocking_counter = new BlockingCounter;
+
+  // Repeating the entire test sequence ensures that we also test
+  // non-monotonic changes, i.e. reusing the same counter with Reset()
+  // after its count has already gone back down to zero.
+ for (int repeat = 1; repeat <= 2; repeat++) {
+ for (int num_threads = 1; num_threads <= 16; num_threads++) {
+ for (int num_decrements_per_thread = 1;
+ num_decrements_per_thread <= 64 * 1024;
+ num_decrements_per_thread *= 4) {
+ test_blocking_counter(blocking_counter, num_threads,
+ num_decrements_per_thread,
+ num_threads * num_decrements_per_thread);
+ }
+ }
+ }
+ delete blocking_counter;
+}
+
+} // end namespace gemmlowp
+
+int main() { gemmlowp::test_blocking_counter(); }
diff --git a/test/test_math_helpers.cc b/test/test_math_helpers.cc
new file mode 100644
index 0000000..c1482f7
--- /dev/null
+++ b/test/test_math_helpers.cc
@@ -0,0 +1,134 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "test/test.h"
+
+#include <limits>
+
+#include "internal/common.h"
+
+namespace gemmlowp {
+
+// Our math helpers are not intended to be reliable all the way up to the
+// limit of the representable range, wrt overflow; we don't care about
+// 2G-sized matrices. So this test stops at half of the representable range.
+template <typename Integer>
+Integer ValueRangeCutoff() {
+ return std::numeric_limits<Integer>::max() / 2;
+}
+
+int RandomNonnegativeFarAwayFromOverflow() { return Random() % (1 << 24); }
+
+template <int Modulus>
+void test_round_up_down(int x) {
+ Check(x >= RoundDown<Modulus>(x));
+ Check(x < RoundDown<Modulus>(x) + Modulus);
+ Check(RoundDown<Modulus>(x) % Modulus == 0);
+
+ Check(x <= RoundUp<Modulus>(x));
+ Check(x > RoundUp<Modulus>(x) - Modulus);
+ Check(RoundUp<Modulus>(x) % Modulus == 0);
+}
+
+template <int Modulus>
+void test_round_up_down() {
+ for (int i = 0; i < 100; i++) {
+ test_round_up_down<Modulus>(i);
+ const int N = ValueRangeCutoff<int>();
+ test_round_up_down<Modulus>(Random() % N);
+ }
+}
+
+template <typename Integer>
+void test_ceil_quotient(Integer x, Integer y) {
+ Check(CeilQuotient(x, y) * y >= x);
+ Check(CeilQuotient(x, y) * y < x + y);
+}
+
+template <typename Integer>
+void test_ceil_quotient() {
+ const Integer N = ValueRangeCutoff<Integer>();
+ const Integer K = std::min(N, Integer(100));
+ for (Integer x = 0; x < K; x++) {
+ for (Integer y = 1; y < K; y++) {
+ test_ceil_quotient(x, y);
+ test_ceil_quotient(x, Integer(1 + (Random() % (N - 1))));
+ test_ceil_quotient(Integer(Random() % N), y);
+ test_ceil_quotient(Integer(Random() % N),
+ Integer(1 + (Random() % (N - 1))));
+ }
+ }
+}
+
+template <typename Integer>
+void test_round_up_to_next_power_of_two(Integer x) {
+  Check(RoundUpToPowerOfTwo(RoundUpToPowerOfTwo(x)) == RoundUpToPowerOfTwo(x));
+ Check(RoundUpToPowerOfTwo(x) >= x);
+ Check(x == 0 || RoundUpToPowerOfTwo(x) < 2 * x);
+ Check((RoundUpToPowerOfTwo(x) & (RoundUpToPowerOfTwo(x) - 1)) == 0);
+}
+
+template <typename Integer>
+void test_round_up_to_next_power_of_two() {
+ const Integer N = ValueRangeCutoff<Integer>();
+ const Integer K = std::min(N, Integer(100));
+ for (Integer x = 0; x < K; x++) {
+ test_round_up_to_next_power_of_two(x);
+ test_round_up_to_next_power_of_two(Random() % N);
+ }
+}
+
+void test_math_helpers() {
+ test_round_up_down<1>();
+ test_round_up_down<2>();
+ test_round_up_down<3>();
+ test_round_up_down<4>();
+ test_round_up_down<5>();
+ test_round_up_down<6>();
+ test_round_up_down<7>();
+ test_round_up_down<8>();
+ test_round_up_down<9>();
+ test_round_up_down<10>();
+ test_round_up_down<11>();
+ test_round_up_down<12>();
+ test_round_up_down<13>();
+ test_round_up_down<14>();
+ test_round_up_down<15>();
+ test_round_up_down<16>();
+
+ test_round_up_down<50>();
+ test_round_up_down<51>();
+
+ test_round_up_down<500>();
+ test_round_up_down<501>();
+
+ test_ceil_quotient<std::int8_t>();
+ test_ceil_quotient<std::uint8_t>();
+ test_ceil_quotient<std::int16_t>();
+ test_ceil_quotient<std::uint16_t>();
+ test_ceil_quotient<std::int32_t>();
+ test_ceil_quotient<std::uint32_t>();
+
+ test_round_up_to_next_power_of_two<std::int8_t>();
+ test_round_up_to_next_power_of_two<std::uint8_t>();
+ test_round_up_to_next_power_of_two<std::int16_t>();
+ test_round_up_to_next_power_of_two<std::uint16_t>();
+ test_round_up_to_next_power_of_two<std::int32_t>();
+ test_round_up_to_next_power_of_two<std::uint32_t>();
+}
+
+} // end namespace gemmlowp
+
+int main() { gemmlowp::test_math_helpers(); }