author     Benoit Jacob <benoitjacob@google.com>    2015-06-25 15:50:59 -0400
committer  Benoit Jacob <benoitjacob@google.com>    2015-06-25 15:53:04 -0400
commit     75c4ec0ba4dd86e4f763a54e01002ff29f1d57ae (patch)
tree       c8e35a06c7d959e6ad0a90b4929305055919e3f8
download   gemmlowp-75c4ec0ba4dd86e4f763a54e01002ff29f1d57ae.tar.gz
initial import
35 files changed, 6372 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4ff62a0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +*.o +*.ii +*.s +**/.DS_Store +? +?? +??? diff --git a/AUTHORS.txt b/AUTHORS.txt new file mode 100644 index 0000000..13a49e0 --- /dev/null +++ b/AUTHORS.txt @@ -0,0 +1,9 @@ +# This is the official list of gemmlowp authors for copyright purposes. +# This file is distinct from the CONTRIBUTORS.txt file. +# See the latter for an explanation. + +# Names should be added to this file as: +# Name or Organization <email address> +# The email address is not required for organizations. + +Google Inc. diff --git a/CONTRIBUTING.txt b/CONTRIBUTING.txt new file mode 100644 index 0000000..e7244b3 --- /dev/null +++ b/CONTRIBUTING.txt @@ -0,0 +1,33 @@ +Want to contribute? Great! First, read this page (including the small print at the end). + +Before you contribute +===================== + +Before we can use your code, you must sign the Google Individual Contributor +License Agreement (CLA), + + https://developers.google.com/open-source/cla/individual?csw=1 + +which you can do online. The CLA is necessary mainly because you own the +copyright to your changes, even after your contribution becomes part of our +codebase, so we need your permission to use and distribute your code. We also +need to be sure of various other things—for instance that you'll tell us if you +know that your code infringes on other people's patents. You don't have to sign +the CLA until after you've submitted your code for review and a member has +approved it, but you must do it before we can put your code into our codebase. +Before you start working on a larger contribution, you should get in touch with +us first through the issue tracker with your idea so that we can help out and +possibly guide you. Coordinating up front makes it much easier to avoid +frustration later on. + +Code reviews +============ + +All submissions, including submissions by project members, require review. We +use Github pull requests for this purpose. + +The small print +=============== + +Contributions made by corporations are covered by a different agreement than +the one above, the Software Grant and Corporate Contributor License Agreement. diff --git a/CONTRIBUTORS.txt b/CONTRIBUTORS.txt new file mode 100644 index 0000000..e9396d6 --- /dev/null +++ b/CONTRIBUTORS.txt @@ -0,0 +1,13 @@ +# People who have agreed to one of the CLAs and can contribute patches. +# The AUTHORS.txt file lists the copyright holders; this file +# lists people. For example, Google employees are listed here +# but not in AUTHORS.txt, because Google holds the copyright. +# +# https://developers.google.com/open-source/cla/individual +# https://developers.google.com/open-source/cla/corporate +# +# Names should be added to this file as: +# Name <email address> + +Benoit Jacob <benoitjacob@google.com> +Pete Warden <petewarden@google.com> diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..29fcdc2 --- /dev/null +++ b/README.txt @@ -0,0 +1,126 @@ +gemmlowp: a small self-contained low-precision GEMM library +=========================================================== + +This is not a full linear algebra library, only a GEMM library: it only does +general matrix multiplication ("GEMM"). + + +Disclaimer +========== + +This is not an official Google product (experimental or otherwise), it is just +code that happens to be owned by Google. + + +Portability, target platforms/architectures +=========================================== + +Should be portable to any platform with some C++11 and POSIX support, +while we have optional optimized code paths for specifica architectures. + +Required: + C++11 (a small conservative subset of it) + +Required for some features: + * Some POSIX interfaces: + * pthreads (for multi-threaded operation and for profiling). + * sysconf (for multi-threaded operation to detect number of cores; + may be bypassed). + +Optional optimized code paths: +At the moment, we only have optimized code paths for ARM NEON SIMD. +Some are written in inline assembly, some are written in C++ using +intrinsics. Both GCC and Clang are supported. + + +Public interfaces +================= + +1. gemmlowp public interface +---------------------------- + + gemmlowp's main public interface is in the public/ subdirectory. The + header to include is + public/gemmlowp.h. + This is a headers-only library, so there is nothing to link to. + +2. EightBitIntGemm standard interface +------------------------------------- + + Additionally, the eight_bit_int_gemm/ subdirectory provides an + implementation of the standard EightBitIntGemm interface. The header + to include is + eight_bit_int_gemm/eight_bit_int_gemm.h + This is *NOT* a headers-only library, users need to link to + eight_bit_int_gemm/eight_bit_int_gemm.cc. + + +Testing +======= + +The test/ directory contains unit tests. The primary unit test is + test/test.cc +Since it covers also the EightBitIntGemm interface, it needs to be +linked against + eight_bit_int_gemm/eight_bit_int_gemm.cc + +The scripts/ directory contains a script to build and run a program +on an Android device: + scripts/test-android.sh + +It expects the CXX environment variable to point to an Android toolchain's +C++ compiler, and expects source files (and optionally, cflags) as +command-line parameters. To build and run the above-mentioned main unit test, +first set CXX e.g.: + +$ export CXX=/some/toolchains/arm-linux-androideabi-4.8/bin/arm-linux-androideabi-g++ + +Then run: + +$ ./scripts/test-android.sh test/test.cc eight_bit_int_gemm/eight_bit_int_gemm.cc + + +Profiling +========= + +The profiling/ subdirectory offers a very simple non-interrupting sampling +profiler that only requires pthreads (no signals). + +It relies on source code being instrumented with pseudo-stack labels. +See profiling/instrumentation.h. +A full example of using this profiler is given in profiling/profiler.h. + + +Low-precision? +============== + +"Low-precision" means that the input and output matrix entries are integers +on at most 8 bits. The scalar type is uint8_t. + +This isn't the same as just doing plain matrix arithmetic over uint8_t, +because that would overflow. To avoid overflow, we internally accumulate +results on more than 8 bits, and at the end we keep only some significant +8 bits. 
This relies on the caller providing suitable offset/multiplier/shift +parameters, which effectively govern how we extract some significant 8 bit +from our more-than-8bit temporary accumulators. See the extra function +parameters taken by Gemm() in public/gemmlowp.h or by EightBitIntGemm() in +eight_bit_int_gemm/eight_bit_int_gemm.h. + + +Performance goals +============================ + +Our performance goals differ from typical GEMM performance goals in the +following ways: + +1. We care not only about speed, but also about minimizing power usage. + We specifically care about charge usage in mobile/embedded devices. + This implies that we care doubly about minimizing memory bandwidth usage: + we care about it, like any GEMM, because of the impact on speed, and we + also care about it because it is a key factor of power usage. + +2. Most GEMMs are optimized primarily for large dense matrix sizes (>= 1000). + We do care about large sizes, but we also care specifically about the + typically smaller matrix sizes encountered in various mobile applications. + This means that we have to optimize for all sizes, not just for large enough + sizes. diff --git a/eight_bit_int_gemm/eight_bit_int_gemm.cc b/eight_bit_int_gemm/eight_bit_int_gemm.cc new file mode 100644 index 0000000..4c52dee --- /dev/null +++ b/eight_bit_int_gemm/eight_bit_int_gemm.cc @@ -0,0 +1,89 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "eight_bit_int_gemm/eight_bit_int_gemm.h" + +// gemmlowp symbols should have hidden visibility. +// currently this is ensured in the build system by +// passing -finlines-visibility-hidden. TODO: it would be +// safer to hardcode it here with some #pragma's. +#include "public/gemmlowp.h" + +namespace gemmlowp { + +namespace eight_bit_int_gemm { + +namespace { + +// To be used as template parameter for GlobalLock. +// GlobalLock<EightBitIntGemmLockId> is the global lock +// on EightBitIntGemm entry points, protecting +// EightBitIntGemm's global state. +struct EightBitIntGemmLockId; + +// Global state: consists of one global GemmContext instance. 
+GemmContext* global_context; + +GemmContext* GetOrCreateGlobalContext() { + if (!global_context) { + global_context = new GemmContext; + } + return global_context; +} + +void DestroyGlobalContext() { + delete global_context; + global_context = nullptr; +} + +} // end anonymous namespace + +// Public interface entry points + +void EightBitIntGemm(int m, int n, int k, const std::uint8_t* a, + std::int32_t a_offset, int lda, const std::uint8_t* b, + std::int32_t b_offset, int ldb, std::uint8_t* c, + std::int32_t c_offset, std::int32_t c_mult_int, + std::int32_t c_shift, int ldc) { + AutoGlobalLock<EightBitIntGemmLockId> lock; + GemmContext* context = GetOrCreateGlobalContext(); + + MatrixMap<const std::uint8_t, MapOrder::RowMajor> lhs(b, n, k, ldb); + MatrixMap<const std::uint8_t, MapOrder::ColMajor> rhs(a, k, m, lda); + MatrixMap<std::uint8_t, MapOrder::ColMajor> result(c, n, m, ldc); + + const int lhs_offset = b_offset; + const int rhs_offset = a_offset; + const int result_offset = c_offset; + const int result_mult_int = c_mult_int; + const int result_shift = c_shift; + + Gemm(context, lhs, rhs, &result, lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); +} + +void SetMaxNumThreads(int n) { + AutoGlobalLock<EightBitIntGemmLockId> lock; + GemmContext* context = GetOrCreateGlobalContext(); + context->set_max_num_threads(n); +} + +void FreePersistentResources() { + AutoGlobalLock<EightBitIntGemmLockId> lock; + DestroyGlobalContext(); +} + +} // namespace eight_bit_int_gemm + +} // namespace gemmlowp diff --git a/eight_bit_int_gemm/eight_bit_int_gemm.h b/eight_bit_int_gemm/eight_bit_int_gemm.h new file mode 100644 index 0000000..1d2b1cb --- /dev/null +++ b/eight_bit_int_gemm/eight_bit_int_gemm.h @@ -0,0 +1,66 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// eight_bit_int_gemm.h: exposes the standard EightBitIntGemm interface. + +#ifndef GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ +#define GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ + +#include <cstdint> + +namespace gemmlowp { + +namespace eight_bit_int_gemm { + +// Concurrency / reentrancy notice +// =============================== +// +// This eight_bit_int_gemm has global singleton persistent state. +// A global lock ensures serialization of calls, so this library +// is fully reentrant but only one calling thread gets to actually run +// at a time, while other calling threads would wait. So it is safe +// albeit potentially slow to call the functions exposed here on +// multiple threads concurrently. +// +// Users who prefer a state-less, singleton-less interface, +// should use the main gemmlowp interface (public/gemmlowp.h) instead. + +// The main entry point to compute a Gemm. This is the standard +// EightBitIntGemm interface. 
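For orientation, here is a hedged sketch of what a call to this entry point looks like. The buffer names, sizes, leading dimensions and quantization parameters below are placeholders invented for illustration; the exact storage conventions and the rounding of the offset/multiplier/shift post-processing are defined by the implementation in eight_bit_int_gemm.cc and public/gemmlowp.h, not by this sketch:

    // Assumes #include <cstdint>, #include <vector> and
    // #include "eight_bit_int_gemm/eight_bit_int_gemm.h".
    const int m = 64, n = 64, k = 128;              // placeholder sizes
    std::vector<std::uint8_t> a(m * k), b(k * n), c(m * n);
    // ... fill a and b with quantized 8-bit inputs ...
    const std::int32_t a_offset = 0, b_offset = 0;  // placeholder quantization params
    const std::int32_t c_offset = 0, c_mult_int = 1, c_shift = 0;
    const int lda = k, ldb = k, ldc = n;            // placeholder leading dimensions
    gemmlowp::eight_bit_int_gemm::SetMaxNumThreads(0);  // 0: query the system
    gemmlowp::eight_bit_int_gemm::EightBitIntGemm(
        m, n, k,
        a.data(), a_offset, lda,
        b.data(), b_offset, ldb,
        c.data(), c_offset, c_mult_int, c_shift, ldc);
    gemmlowp::eight_bit_int_gemm::FreePersistentResources();

The offset/multiplier/shift parameters are how the caller tells the library to map its wide internal accumulators back down to 8 bits, as described in the "Low-precision?" section of README.txt.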
+void EightBitIntGemm(int m, int n, int k, const std::uint8_t *a, + std::int32_t a_offset, int lda, const std::uint8_t *b, + std::int32_t b_offset, int ldb, std::uint8_t *c, + std::int32_t c_offset, std::int32_t c_mult_int, + std::int32_t c_shift, int ldc); + +// Frees any persistent resources +// (threads, thread pools, allocators, buffers, ...) +// that gemmlowp might hold. This is called automatically +// on thread exit, but one may also call it earlier, at any time. +void FreePersistentResources(); + +// Allows specifying the number of hardware threads, as a hint as to +// how many worker threads to use for sufficiently large Gemm's. +// We will never use more threads than that, but may use fewer, +// for instance on Gemm's that are too small to benefit from all +// available threads. The value 0 lets the implementation query +// the system to determine the number of hardware threads. +// Default value: 0. +void SetMaxNumThreads(int n); + +} // namespace eight_bit_int_gemm + +} // namespace gemmlowp + +#endif // GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ diff --git a/internal/allocator.h b/internal/allocator.h new file mode 100644 index 0000000..30b2bd8 --- /dev/null +++ b/internal/allocator.h @@ -0,0 +1,211 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// allocator.h: a buffer allocator that allows avoiding most of the +// malloc/free overhead, by: +// 1. Requiring all N allocations to be reserved in advance, and +// then commited at once, turning N allocations into 1. +// 2. Being persistent, the allocated storage is reused across commits, +// and only reallocated as needed when the commit size gets larger. +// +// This is driven by Android-specific needs: +// 1. On Android, the default (Bionic) allocator tends to aggressively +// unmap pages, which means that malloc/free can be surprisingly expensive. +// 2. On Android, stack allocations with alloca() can't be as large as on +// desktop platforms. +// +// General usage: +// 1. Reserve blocks by calling Reserve(), which returns a Handle. +// 2. Call Commit() once. +// 3. Now it is possible to get pointers to allocated buffers by calling +// GetPointer(). +// 4. Call Decommit() once. +// 5. The allocator is now reverted to its original state, except that +// it retained its allocated storage, so the next Commit() will be faster. +// The allocated storage is only freed when the Allocator object is +// destroyed. 
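A minimal sketch of the Reserve/Commit/GetPointer/Decommit pattern described above. Buffer names and sizes are hypothetical; the only assumption is the Allocator class defined below in this header:

    gemmlowp::Allocator allocator;
    // 1. Reserve all blocks up front; no allocation happens yet.
    auto packed_lhs_handle = allocator.Reserve<std::uint8_t>(12 * 1024);
    auto packed_rhs_handle = allocator.Reserve<std::uint8_t>(4 * 1024);
    auto accumulator_handle = allocator.Reserve<std::int32_t>(12 * 4);
    // 2. Commit once: at most one underlying (re)allocation for all blocks.
    allocator.Commit();
    // 3. Get pointers into the committed storage.
    std::uint8_t* packed_lhs = allocator.GetPointer<std::uint8_t>(packed_lhs_handle);
    std::uint8_t* packed_rhs = allocator.GetPointer<std::uint8_t>(packed_rhs_handle);
    std::int32_t* accumulators = allocator.GetPointer<std::int32_t>(accumulator_handle);
    // ... use the buffers ...
    // 4. Decommit: handles become stale, but the storage is retained, so the
    //    next Commit() of similar total size is just a bookkeeping operation.
    allocator.Decommit();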
+ +#ifndef GEMMLOWP_INTERNAL_ALLOCATOR_H_ +#define GEMMLOWP_INTERNAL_ALLOCATOR_H_ + +#include "internal/common.h" + +#if defined ANDROID || defined __ANDROID__ +#include <android/api-level.h> +#if __ANDROID_API__ < 16 +#include <malloc.h> +#define GEMMLOWP_USE_MEMALIGN +#endif +#endif + +namespace gemmlowp { + +enum class TypeId : std::uint8_t { Uint8, Int8, Uint16, Int16, Uint32, Int32 }; + +template <typename T> +struct GetTypeIdImpl {}; + +template <typename T> +inline TypeId GetTypeId() { + return GetTypeIdImpl<T>::Value; +} + +template <typename T> +struct GetTypeIdImpl<const T> : GetTypeIdImpl<T> {}; + +#define GEMMLOWP_REGISTER_TYPEID(type_, id) \ + template <> \ + struct GetTypeIdImpl<type_> { \ + static const TypeId Value = TypeId::id; \ + }; + +GEMMLOWP_REGISTER_TYPEID(std::uint8_t, Uint8) +GEMMLOWP_REGISTER_TYPEID(std::int8_t, Int8) +GEMMLOWP_REGISTER_TYPEID(std::uint16_t, Uint16) +GEMMLOWP_REGISTER_TYPEID(std::int16_t, Int16) +GEMMLOWP_REGISTER_TYPEID(std::uint32_t, Uint32) +GEMMLOWP_REGISTER_TYPEID(std::int32_t, Int32) + +class Allocator { + public: + Allocator() + : committed_(false), + storage_size_(0), + storage_(nullptr), + reserved_blocks_(0), + reserved_bytes_(0), + generation_(0) {} + + ~Allocator() { + assert(!committed_); + assert(!reserved_blocks_); + DeallocateStorage(); + } + + // Alignment of allocated blocks. + static const std::size_t kAlignment = kDefaultCacheLineSize; + + // This is all we need so far, and since the usage pattern is fixed, + // there is no point in allowing more until we need to. + static const std::size_t kMaxBlocks = 5; + + void Commit() { + assert(!committed_); + + if (reserved_bytes_ > storage_size_) { + DeallocateStorage(); + storage_size_ = RoundUpToPowerOfTwo(reserved_bytes_); +#ifdef GEMMLOWP_USE_MEMALIGN + storage_ = memalign(kAlignment, storage_size_); +#else + if (posix_memalign(&storage_, kAlignment, storage_size_)) { + storage_ = nullptr; + } +#endif + } + + ReleaseBuildAssertion(!storage_size_ || storage_, "allocation failure"); + committed_ = true; + } + + void Decommit() { + assert(committed_); + committed_ = false; + generation_++; + + reserved_blocks_ = 0; + reserved_bytes_ = 0; + } + + // See generation_ + typedef std::size_t generation_t; + + // A handle on a reserved block. The user obtains + // one by calling Reserve() and, after committing, + // passes it to GetPointer(). + class Handle { + std::uint8_t index_; + generation_t generation_; + TypeId type_; + + friend class Allocator; + }; + + // Reserves a block sized for n elements of type T, and + // returns a handle to it. Must be called before committing. + template <typename T> + Handle Reserve(std::size_t n) { + assert(!committed_ && "can't reserve blocks while committed"); + assert(reserved_blocks_ < kMaxBlocks && + "didn't expect to allocate this many blocks"); + const std::size_t bytes = RoundUp<kAlignment>(n * sizeof(T)); + const std::size_t offset = reserved_bytes_; + const std::size_t index = reserved_blocks_; + + reserved_blocks_offsets_[index] = offset; + Handle h; + h.index_ = index; + h.generation_ = generation_; + h.type_ = GetTypeId<T>(); + + reserved_blocks_++; + reserved_bytes_ += bytes; + + return h; + } + + // Returns the pointer to the allocated buffer for the given handle. + // Must be called after committing. 
+ template <typename T> + T* GetPointer(const Handle& h) const { + assert(committed_ && "can't get block pointers unless committed"); + assert(h.index_ < reserved_blocks_ && + "bad handle, points to inexistant block"); + assert(h.generation_ == generation_ && + "handle from earlier generation, have decommitted since"); + assert(h.type_ == GetTypeId<T>() && "type mismatch"); + std::size_t offset = reserved_blocks_offsets_[h.index_]; + std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(storage_) + offset; + return reinterpret_cast<T*>(addr); + } + + private: + void DeallocateStorage() { + assert(!committed_); + free(storage_); + storage_size_ = 0; + } + + // Set to true by Commit() and to false by Decommit(). Initially false. + bool committed_; + + // The actually allocated storage size and buffer pointer. + std::size_t storage_size_; + mutable void* storage_; + + // The number of blocks that have been reserved by Reserve(). + std::size_t reserved_blocks_; + // The number of bytes that have been reserved by Reserve(). + std::size_t reserved_bytes_; + // The offsets of reserved blocks into the storage buffer. + std::size_t reserved_blocks_offsets_[kMaxBlocks]; + + // The 'generation' is incremented on Decommit() and allows catching + // bad GetPointer() calls still referring to a previous commit. + generation_t generation_; +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_ALLOCATOR_H_ diff --git a/internal/block_params.h b/internal/block_params.h new file mode 100644 index 0000000..8545f94 --- /dev/null +++ b/internal/block_params.h @@ -0,0 +1,166 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// block_params.h: Logic to choose L1 and L2 block sizes +// to optimize cache-friendliness. + +#ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ +#define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ + +#include "internal/common.h" + +namespace gemmlowp { + +// A BlockParams instance contains a full description of all the block size +// parameters to be used by a Gemm. +// There are two nested levels of block subdivisions: first a subdivision +// into large blocks that should fit in last-level cache (what we call L2 here) +// and then another subdivision into smaller blocks that should fit in +// L1 cache. There is then actually a third level of subdivision to fit +// in registers, but we are not concerned with that here. +struct BlockParams { + // L1 block parameters determine the size of small blocks that should + // fit in L1 cache. + int l1_rows; + int l1_cols; + int l1_depth; + + // L2 block parameters determine the size of larger blocks that should + // fit in L2 cache. 
+ int l2_rows; + int l2_cols; + int l2_depth; + + template <typename KernelFormat> + void Init(int rows, int cols, int depth, int num_threads) { + FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads, &l2_rows, + &l2_cols, &l2_depth); + FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth, &l1_rows, + &l1_cols, &l1_depth); + } + + template <typename KernelFormat> + static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads, + int* out_l2_rows, int* out_l2_cols, + int* out_l2_depth) { + int l2_rows = 0; + int l2_cols = 0; + int l2_depth = 0; + // No L2 blocking in the depth dimension at the moment. + // Too much loss of accuracy due to storing intermediate results in + // low precision. + // However, we still want to round l2_depth up to the next multiple + // of kernel depth, so as to avoid having to special-case unaligned depths. + l2_depth = RoundUp<KernelFormat::kDepth>(depth); + + const int l2_bytes_to_use = kDefaultL2CacheSize; + const float l2_rhs_factor = kDefaultL2RhsFactor; + + { + int max_cache_friendly_l2_cols = std::max( + 1, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth))); + int min_l2_cols_blocks = CeilQuotient(cols, max_cache_friendly_l2_cols); + l2_cols = + RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks)); + } + + { + int max_cache_friendly_l2_rows = + std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) / + (num_threads * (l2_depth + 4 * l2_cols))); + int min_l2_rows_blocks = CeilQuotient(rows, max_cache_friendly_l2_rows); + l2_rows = + RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l2_rows_blocks)); + } + + *out_l2_rows = l2_rows; + *out_l2_cols = l2_cols; + *out_l2_depth = l2_depth; + } + + template <typename KernelFormat> + static void FindL1BlockSizes(int rows, int cols, int depth, int* out_l1_rows, + int* out_l1_cols, int* out_l1_depth) { + int l1_rows = 0; + int l1_cols = 0; + int l1_depth = 0; + + // L2 block sizes should already be multiples of kernel block sizes. + assert(rows % KernelFormat::kRows == 0); + assert(cols % KernelFormat::kCols == 0); + assert(depth % KernelFormat::kDepth == 0); + + // No L1 blocking in the columns dimension at the moment. + // Thought not to be needed. Similar to Eigen. + l1_cols = cols; + + const int l1_bytes_to_use = kDefaultL1CacheSize; + + { + int max_cache_friendly_l1_depth = std::max( + 1, (l1_bytes_to_use - 4 * KernelFormat::kRows * KernelFormat::kCols) / + (KernelFormat::kRows + KernelFormat::kCols)); + int min_l1_depth_blocks = + CeilQuotient(depth, max_cache_friendly_l1_depth); + l1_depth = RoundUp<KernelFormat::kDepth>( + CeilQuotient(depth, min_l1_depth_blocks)); + } + + { + int max_cache_friendly_l1_rows = + std::max(1, l1_bytes_to_use / (l1_depth + 4 * l1_cols)); + int min_l1_rows_blocks = CeilQuotient(rows, max_cache_friendly_l1_rows); + l1_rows = + RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks)); + } + + *out_l1_rows = l1_rows; + *out_l1_cols = l1_cols; + *out_l1_depth = l1_depth; + } +}; + +// A SideBlockParams instance contains only the block params relevant to +// one side (LHS or RHS), expressed in terms of 'width' instead of +// rows/colums. See the explanation in kernel.h: in the LHS, 'width' means +// the number of rows, while in the RHS, 'width' means the number of columns. +// That allows us to write generic code that applies to either LHS or RHS. +struct SideBlockParams { + // L1 block parameters determine the size of small blocks that should + // fit in L1 cache. 
+ int l1_width; + int l1_depth; + + // L2 block parameters determine the size of larger blocks that should + // fit in L2 cache. + int l2_width; + int l2_depth; +}; + +enum class Side { Lhs, Rhs }; + +inline void GetSideBlockParams(Side side, SideBlockParams* side_block_params, + const BlockParams& block_params) { + side_block_params->l1_width = + side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols; + side_block_params->l2_width = + side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols; + + side_block_params->l1_depth = block_params.l1_depth; + side_block_params->l2_depth = block_params.l2_depth; +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ diff --git a/internal/common.h b/internal/common.h new file mode 100644 index 0000000..f7e643e --- /dev/null +++ b/internal/common.h @@ -0,0 +1,117 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// common.h: contains stuff that's used throughout gemmlowp +// and should always be available. + +#ifndef GEMMLOWP_INTERNAL_COMMON_H_ +#define GEMMLOWP_INTERNAL_COMMON_H_ + +#include <pthread.h> +#include <cstdint> +#include <cassert> +#include <cmath> +#include <cstdlib> +#include <algorithm> + +#include "profiling/instrumentation.h" + +#ifdef GEMMLOWP_PROFILING +#include <set> +#include <cstdio> +#include <cstring> +#endif + +// Detect NEON. It's important to check for both tokens. +#if (defined __ARM_NEON) || (defined __ARM_NEON__) +#define GEMMLOWP_NEON +#endif + +namespace gemmlowp { + +// Standard cache line size. Useful to optimize alignment and +// prefetches. Ideally we would query this at runtime, however +// 64 byte cache lines are the vast majority, and even if it's +// wrong on some device, it will be wrong by no more than a 2x factor, +// which should be acceptable. +const int kDefaultCacheLineSize = 64; + +// Default L1 and L2 data cache sizes. On x86, we should ideally query this at +// runtime. On ARM, the instruction to query this is privileged and +// Android kernels do not expose it to userspace. Fortunately, the majority +// of ARM devices have roughly comparable values: +// Nexus 5: L1 16k, L2 1M +// Android One: L1 32k, L2 512k +// The following values are equal to or somewhat lower than that, and were +// found to perform well on both the Nexus 5 and Android One. +// Of course, they would be too low for typical x86 CPUs where we would want +// to set the L2 value to (L3 cache size / number of cores) at least. +const int kDefaultL1CacheSize = 16 * 1024; +const int kDefaultL2CacheSize = 256 * 1024; + +// The proportion of the cache that we intend to use for storing +// RHS blocks. This should be between 0 and 1, and typically closer to 1, +// as we typically want to use most of the L2 cache for storing a large +// RHS block. +const float kDefaultL2RhsFactor = 0.90f; + +// Hints the CPU to prefetch the cache line containing ptr. 
+inline void Prefetch(const void* ptr) { +#ifdef __GNUC__ // Clang and GCC define __GNUC__ and have __builtin_prefetch. + __builtin_prefetch(ptr); +#else + (void)ptr; +#endif +} + +// Returns the runtime argument rounded down to the nearest multiple of +// the fixed Modulus. +template <int Modulus> +int RoundDown(int i) { + return i - (i % Modulus); +} + +// Returns the runtime argument rounded up to the nearest multiple of +// the fixed Modulus. +template <int Modulus> +int RoundUp(int i) { + return RoundDown<Modulus>(i + Modulus - 1); +} + +// Returns the quotient a / b rounded up ('ceil') to the nearest integer. +template <typename Integer> +Integer CeilQuotient(Integer a, Integer b) { + return (a + b - 1) / b; +} + +// Returns the argument rounded up to the nearest power of two. +template <typename Integer> +Integer RoundUpToPowerOfTwo(Integer n) { + Integer i = n - 1; + i |= i >> 1; + i |= i >> 2; + i |= i >> 4; + i |= i >> 8; + i |= i >> 16; + return i + 1; +} + +template <int N> +struct IsPowerOfTwo { + static const bool value = !(N & (N - 1)); +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_COMMON_H_ diff --git a/internal/compute.h b/internal/compute.h new file mode 100644 index 0000000..4b2f56a --- /dev/null +++ b/internal/compute.h @@ -0,0 +1,103 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// compute.h: the central stage of the Gemm computation, operates +// on already-packed LHS and RHS blocks and calls the Gemm kernel +// to compute a block of the product. 
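Before moving on to compute.h: the integer rounding helpers just defined in common.h drive all of the blocking arithmetic in block_params.h. A few hand-checked values, given as an illustrative snippet rather than part of the commit, show what they do:

    static_assert(gemmlowp::IsPowerOfTwo<64>::value, "64 is a power of two");
    const int down = gemmlowp::RoundDown<8>(37);        // 32: largest multiple of 8 <= 37
    const int up = gemmlowp::RoundUp<8>(37);            // 40: smallest multiple of 8 >= 37
    const int blocks = gemmlowp::CeilQuotient(37, 8);   // 5:  ceil(37 / 8)
    const int pot = gemmlowp::RoundUpToPowerOfTwo(37);  // 64: next power of two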
+ +#ifndef GEMMLOWP_INTERNAL_COMPUTE_H_ +#define GEMMLOWP_INTERNAL_COMPUTE_H_ + +#include "internal/kernel.h" +#include "internal/block_params.h" +#include "internal/pack.h" + +namespace gemmlowp { + +template <typename KernelFormat, typename PackedResult> +class ComputeImpl { + typedef typename KernelFormat::Lhs KernelLhsFormat; + typedef typename KernelFormat::Rhs KernelRhsFormat; + + const KernelBase& kernel_; + const BlockParams& block_params_; + + PackedResult* const packed_result_; + const PackedSideBlock<KernelLhsFormat>& packed_lhs_; + const PackedSideBlock<KernelRhsFormat>& packed_rhs_; + + public: + ComputeImpl(const KernelBase& _kernel, const BlockParams& _block_params, + PackedResult* _packed_result, + const PackedSideBlock<KernelLhsFormat>& _packed_lhs, + const PackedSideBlock<KernelRhsFormat>& _packed_rhs) + : kernel_(_kernel), + block_params_(_block_params), + packed_result_(_packed_result), + packed_lhs_(_packed_lhs), + packed_rhs_(_packed_rhs) {} + + void Compute() { + for (int d = 0; d < block_params_.l2_depth; d += block_params_.l1_depth) { + int ds = std::min(block_params_.l1_depth, block_params_.l2_depth - d); + + for (int r = 0; r < block_params_.l2_rows; r += block_params_.l1_rows) { + int rs = std::min(block_params_.l1_rows, block_params_.l2_rows - r); + + ComputeL1(r, rs, 0, block_params_.l2_cols, d, ds); + } + } + } + + private: + void ComputeRun(int start_row, int start_col, int start_depth, int depth) { + packed_lhs_.seek_run(start_row, start_depth); + packed_rhs_.seek_run(start_col, start_depth); + auto packed_result_block = packed_result_->Map().block( + start_row, start_col, KernelFormat::kRows, KernelFormat::kCols); + kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(), + packed_result_block.cols_stride(), packed_lhs_.current_data(), + packed_rhs_.current_data(), start_depth, depth); + } + + void ComputeL1(int start_row, int rows, int start_col, int cols, + int start_depth, int depth) { + assert(rows % KernelFormat::kRows == 0); + assert(cols % KernelFormat::kCols == 0); + assert(depth % KernelFormat::kDepth == 0); + + for (int c = 0; c < cols; c += KernelFormat::kCols) { + for (int r = 0; r < rows; r += KernelFormat::kRows) { + ComputeRun(start_row + r, start_col + c, start_depth, depth); + } + } + } +}; + +template <typename PackedResult, typename KernelLhsFormat, + typename KernelRhsFormat> +void Compute(const KernelBase& kernel, const BlockParams& block_params, + PackedResult* packed_result, + const PackedSideBlock<KernelLhsFormat>& packed_lhs, + const PackedSideBlock<KernelRhsFormat>& packed_rhs) { + ScopedProfilingLabel label("compute"); + ComputeImpl<KernelFormat<KernelLhsFormat, KernelRhsFormat>, PackedResult> + impl(kernel, block_params, packed_result, packed_lhs, packed_rhs); + + impl.Compute(); +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_COMPUTE_H_ diff --git a/internal/kernel.h b/internal/kernel.h new file mode 100644 index 0000000..5cb5e69 --- /dev/null +++ b/internal/kernel.h @@ -0,0 +1,217 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// kernel.h: general definitions for kernels. + +#ifndef GEMMLOWP_INTERNAL_KERNEL_H_ +#define GEMMLOWP_INTERNAL_KERNEL_H_ + +#include "internal/common.h" + +namespace gemmlowp { + +// Explanation of general gemmlowp terminology +// =========================================== +// +// We use the following abbreviations: +// LHS = "left-hand side" +// RHS = "right-hand side" +// Sometimes when referring to either LHS or RHS, we just say a "Side". +// +// In a matrix product of a MxK matrix times a KxN matrix, +// we call K the 'depth'. Note that M is the number of rows +// of the result (and of the LHS), and N is the number of columns +// of the result (and of the RHS). +// +// In each of the LHS and RHS matrices, we call 'width' the +// other dimension, besides the depth. So in the LHS, 'width' +// is the number of rows, while in the RHS, 'width' is the number +// of columns. +// +// So in the LHS MxK matrix, the depth is K and the width in M. +// And in the RHS KxN matrix, the depth is K and the width in N. +// +// This is illustrated in this picture: +// +// RHS width +// <-----------------> +// +-----------------+ ^ +// | RHS | | Depth +// +-----------------+ v +// ^ +--+ +-----------------+ +// | |L | | | +// LHS width | |H | | Result | +// | |S | | | +// v +--+ +-----------------+ +// <--> +// Depth + +// Explanation of gemmlowp kernel formats and "cells" +// ================================================== +// +// Kernels operate on small LHS and RHS blocks that fit in registers. +// These blocks are stored contiguously in memory, but not always +// in a traditional column-major or row-major order; instead, +// they consist of a number of sub-blocks, which we call "cells", +// that are stored in column-major or row-major order. However, +// what really matters to us is not so much rows vs columns, but +// rather width vs depth. So we refer to "width-major" and "depth-major" +// storage orders. In the LHS, width-major means row-major, +// while in the RHS, width-major means column-major. +// There is also a third possibility, "diagonal order", +// which is unused at the moment. +// +// We aim to treat both sides, LHS and RHS, on an equal footing, +// so we call them both 'sides'. A KernelFormat thus is just a pair +// of KernelSideFormat's, one for LHS and one for RHS; each KernelSideFormat +// contains a CellFormat and a number of cells; cells are only ever +// stacked in the width dimension, which means stacked vertically in the +// LHS and stacked horizondally in the RHS. +// +// Example +// ======= +// +// Let's work out the data layout expected by a kernel having the +// following format (the struct names here are defined below in this file): +// +// KernelFormat< +// KernelSideFormat<CellFormat<3, 4>, 3>, +// KernelSideFormat<CellFormat<5, 4>, 2> +// > +// +// The LHS format, KernelSideFormat<CellFormat<3, 4>, 3>, means: +// 3 cells, each cell having dimensions (width=3, depth=4), laid out in +// DepthMajor order (the default value, see CellFormat). 
In the LHS, +// DepthMajor means column-major, so the LHS cells are of size 3x4 in +// column-major order, so the LHS layout is: +// +// 0 3 6 9 +// 1 4 8 10 +// 2 5 9 11 +// 12 15 18 21 +// 13 16 19 22 +// 14 17 20 23 +// 24 27 30 33 +// 25 28 31 34 +// 26 29 32 35 +// +// The RHS format, KernelSideFormat<CellFormat<5, 4>, 2>, means: +// 2 cells each having dimensions (width=5, depth=4), laid out in +// DepthMajor order (the default value, see CellFormat). In the RHS, +// DepthMajor means row-major, so the RHS cells are of size 4x5 in +// row-major order, so the RHS layout is: +// +// 0 1 2 3 4 20 21 22 23 24 +// 5 6 7 8 9 25 26 27 28 29 +// 10 11 12 13 14 30 31 32 33 34 +// 15 16 17 18 19 35 36 37 38 39 + +// CellOrder enumerates the possible storage orders (=layouts) for +// a cell (see explanation above). +enum class CellOrder { DepthMajor, WidthMajor, Diagonal }; + +// CellFormat describes how data is laid +// out in a cell. That is, a CellOrder together with actual dimensions. +template <int tWidth, int tDepth, CellOrder tOrder = CellOrder::DepthMajor> +struct CellFormat { + static const int kWidth = tWidth; + static const int kDepth = tDepth; + static const CellOrder kOrder = tOrder; + + static const int kSize = kWidth * kDepth; +}; + +// KernelSideFormat describes how data is laid out in a kernel side +// (i.e. LHS or RHS). That is, a CellFormat together with a number of +// cells. These cells are always stacked in the Width dimension. +// For example, in the LHS case, the Width dimension is the rows dimension, +// se we're saying that in the LHS, cells are stacked vertically. +// We never stack cells in the Depth dimension. +template <typename tCellFormat, int tCells> +struct KernelSideFormat { + typedef tCellFormat Cell; + static const int kCells = tCells; + static const int kWidth = kCells * Cell::kWidth; + static const int kDepth = Cell::kDepth; +}; + +// KernelFormat describes fully the input data layout that a kernel expects. +// It consists of two KernelSideFormat's, one for LHS and one for RHS. +template <typename tLhs, typename tRhs> +struct KernelFormat { + typedef tLhs Lhs; + typedef tRhs Rhs; + + static_assert(Lhs::Cell::kDepth == Rhs::Cell::kDepth, ""); + static const int kDepth = Lhs::Cell::kDepth; + static const int kRows = Lhs::Cell::kWidth * Lhs::kCells; + static const int kCols = Rhs::Cell::kWidth * Rhs::kCells; +}; + +inline const char* CellOrderName(CellOrder o) { + switch (o) { + case CellOrder::DepthMajor: + return "DepthMajor"; + case CellOrder::WidthMajor: + return "WidthMajor"; + case CellOrder::Diagonal: + return "Diagonal"; + default: + assert(false); + return nullptr; + } +} + +// Returns the offset into a cell, at which a given coefficient is stored. +template <typename CellFormat> +inline int OffsetIntoCell(int w, int d) { + switch (CellFormat::kOrder) { + case CellOrder::DepthMajor: + return w + d * CellFormat::kWidth; + case CellOrder::WidthMajor: + return d + w * CellFormat::kDepth; + case CellOrder::Diagonal: + assert(CellFormat::kWidth == CellFormat::kDepth); + static const int cell_width = CellFormat::kWidth; + return w + ((d + cell_width - w) % cell_width) * cell_width; + default: + assert(false); + return 0; + } +} + +// KernelBase is the virtual base class below all kernels. +// The idea is that we don't need to templatize all our code on the exact +// kernel type; we only need to templatize on kernel format. Kernels +// sharing the same format can thus share the same packing/unpacking code. 
+struct KernelBase { + virtual const char* Name() const = 0; + + // This is the kernel implementation. We use the word 'run' consistently + // throughout gemmlowp to mean an inner loop, the implementation of which + // is to be provided by a separate optimized function. + virtual void Run(std::int32_t* dst_ptr, int dst_row_stride, + int dst_col_stride, const std::uint8_t* lhs_ptr, + const std::uint8_t* rhs_ptr, int start_depth, + int run_depth) const = 0; + + static const int kLhsBitDepth = 8; + static const int kRhsBitDepth = 8; + + virtual ~KernelBase() {} +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_KERNEL_H_ diff --git a/internal/kernel_default.h b/internal/kernel_default.h new file mode 100644 index 0000000..3e7cd5d --- /dev/null +++ b/internal/kernel_default.h @@ -0,0 +1,41 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// kernel_default.h: Chooses default GEMM and GEMV kernels for the +// host platform. + +#ifndef GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_ +#define GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_ + +#include "internal/common.h" + +#ifdef GEMMLOWP_NEON +#include "internal/kernel_neon.h" +namespace gemmlowp { +typedef NEONKernel12x4Depth2 DefaultKernelForGEMM; +typedef NEONKernel8x1Depth4 DefaultKernelForGEMV; +} +#else +#include "internal/kernel_reference.h" +namespace gemmlowp { +typedef ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 4>, 2>, + KernelSideFormat<CellFormat<4, 4>, 2> > > + DefaultKernelForGEMM; +typedef ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 4>, 2>, + KernelSideFormat<CellFormat<1, 4>, 1> > > + DefaultKernelForGEMV; +} +#endif + +#endif // GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_ diff --git a/internal/kernel_neon.h b/internal/kernel_neon.h new file mode 100644 index 0000000..2177fc2 --- /dev/null +++ b/internal/kernel_neon.h @@ -0,0 +1,570 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// kernel_neon.h: a collection of NEON optimized kernels. +// Check in kernel_default.h which one(s) are actually used by default. +// Others are mere experiments; they are still covered by tests +// in case they might be useful some day. + +#ifndef GEMMLOWP_INTERNAL_KERNEL_NEON_H_ +#define GEMMLOWP_INTERNAL_KERNEL_NEON_H_ + +#include "internal/kernel.h" + +#include <cassert> + +namespace gemmlowp { + +// Our main GEMM kernel. 
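As a quick sanity check of how the format machinery from internal/kernel.h composes, the 12x4, depth-2 kernel defined just below advertises a Format whose dimensions match its name. This is an illustrative snippet, not part of the commit:

    static_assert(gemmlowp::NEONKernel12x4Depth2::Format::kRows == 12,
                  "3 LHS cells of width 4, stacked in the width dimension");
    static_assert(gemmlowp::NEONKernel12x4Depth2::Format::kCols == 4,
                  "1 RHS cell of width 4");
    static_assert(gemmlowp::NEONKernel12x4Depth2::Format::kDepth == 2,
                  "cell depth shared by the LHS and RHS cell formats");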
+struct NEONKernel12x4Depth2 : KernelBase { + typedef KernelFormat<KernelSideFormat<CellFormat<4, 2>, 3>, + KernelSideFormat<CellFormat<4, 2>, 1> > Format; + + const char* Name() const override { return "NEON, 12x4, straight, depth 2"; } + + // TODO(benoitjacob): reorder function arguments so dst comes last + void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride, + const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, + int start_depth, int run_depth) const override { + ScopedProfilingLabel label("optimized kernel"); + + assert(dst_row_stride == 1); + + asm volatile( + // Clear accumulator registers (see layout below) + "vmov.s32 q4, #0\n" + "vmov.s32 q8, q4\n" + "vmov.s32 q12, q4\n" + "vmov.s32 q5, q4\n" + "vmov.s32 q9, q4\n" + "vmov.s32 q13, q4\n" + "vmov.s32 q6, q4\n" + "vmov.s32 q10, q4\n" + "vmov.s32 q14, q4\n" + "vmov.s32 q7, q4\n" + "vmov.s32 q11, q4\n" + "vmov.s32 q15, q4\n" + + /* Main loop */ + + "loop_NEONKernel12x4Depth2_%=:\n" + + // Overview of register layout: + // + // A 2x4 cell of Rhs is stored in 16bit in d0--d1 (q0). + // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in d2--d7 + // (q1--q3). + // A 12x4 block of accumulators is stored in 32bit in q4--q15. + // + // +-----+-----+-----+-----+ + // |d0[0]|d0[1]|d0[2]|d0[3]| + // Rhs +-----+-----+-----+-----+ + // |d1[0]|d1[1]|d1[2]|d1[3]| + // +-----+-----+-----+-----+ + // + // | | | | | + // + // Lhs | | | | | + // + // +--+--+ - - - - +-----+-----+-----+-----+ + // |d2|d3| | q4 | q5 | q6 | q7 | + // |d2|d3| | q4 | q5 | q6 | q7 | + // |d2|d3| | q4 | q5 | q6 | q7 | + // |d2|d3| | q4 | q5 | q6 | q7 | + // +--+--+ - - - - +-----+-----+-----+-----+ + // |d4|d5| | q8 | q9 | q10 | q11 | + // |d4|d5| | q8 | q9 | q10 | q11 | + // |d4|d5| | q8 | q9 | q10 | q11 | + // |d4|d5| | q8 | q9 | q10 | q11 | + // +--+--+ - - - - +-----+-----+-----+-----+ + // |d6|d7| | q12 | q13 | q14 | q15 | + // |d6|d7| | q12 | q13 | q14 | q15 | + // |d6|d7| | q12 | q13 | q14 | q15 | + // |d6|d7| | q12 | q13 | q14 | q15 | + // +--+--+ - - - - +-----+-----+-----+-----+ + // + // Accumulator + + // Load 1 Rhs cell of size 2x4 + "vld1.8 {d0}, [%[rhs_ptr]:64]!\n" + + // Load 3 Lhs cells of size 4x2 each + "vld1.8 {d2}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d4}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d6}, [%[lhs_ptr]:64]!\n" + + // Expand Lhs/Rhs cells to 16 bit. + "vmovl.u8 q0, d0\n" + "vmovl.u8 q1, d2\n" + "vmovl.u8 q2, d4\n" + "vmovl.u8 q3, d6\n" + + // Multiply-accumulate, level of depth 0 + "vmlal.u16 q4, d2, d0[0]\n" + "vmlal.u16 q5, d2, d0[1]\n" + "vmlal.u16 q6, d2, d0[2]\n" + "vmlal.u16 q7, d2, d0[3]\n" + "vmlal.u16 q8, d4, d0[0]\n" + "vmlal.u16 q9, d4, d0[1]\n" + "vmlal.u16 q10, d4, d0[2]\n" + "vmlal.u16 q11, d4, d0[3]\n" + "vmlal.u16 q12, d6, d0[0]\n" + "vmlal.u16 q13, d6, d0[1]\n" + "vmlal.u16 q14, d6, d0[2]\n" + "vmlal.u16 q15, d6, d0[3]\n" + + // Multiply-accumulate, level of depth 1 + "vmlal.u16 q4, d3, d1[0]\n" + "vmlal.u16 q5, d3, d1[1]\n" + "vmlal.u16 q6, d3, d1[2]\n" + "vmlal.u16 q7, d3, d1[3]\n" + "vmlal.u16 q8, d5, d1[0]\n" + "vmlal.u16 q9, d5, d1[1]\n" + "vmlal.u16 q10, d5, d1[2]\n" + "vmlal.u16 q11, d5, d1[3]\n" + "vmlal.u16 q12, d7, d1[0]\n" + "vmlal.u16 q13, d7, d1[1]\n" + "vmlal.u16 q14, d7, d1[2]\n" + "vmlal.u16 q15, d7, d1[3]\n" + + // Loop. Decrement loop index (depth) by 2, since we just handled 2 + // levels of depth (Kernel::kDepth=2). 
+ "subs %[run_depth], #2\n" + "bne loop_NEONKernel12x4Depth2_%=\n" + + /* end of main loop */ + + /* Accumulate our local accumulator registers into the destination block + */ + + // Compute stride between consecutive columns, in bytes + "mov r0, #4\n" // multiply by 4 = sizeof(int32) + "mul %[dst_col_stride], r0\n" + + // If start_depth == 0, then there is no preexisting accumulator + // to accumulate, so we can simply store our result. + "cmp %[start_depth], #0\n" + "beq store_result_NEONKernel12x4Depth2_%=\n" + + "mov r0, %[dst_ptr]\n" + + // Load a column + "mov r1, r0\n" + "vld1.32 {d0, d1}, [r1]!\n" + "vld1.32 {d2, d3}, [r1]!\n" + "vld1.32 {d4, d5}, [r1]!\n" + // Accumulate a column + "vadd.s32 q4, q4, q0\n" + "vadd.s32 q8, q8, q1\n" + "vadd.s32 q12, q12, q2\n" + + "add r0, %[dst_col_stride]\n" + // Load a column + "mov r1, r0\n" + "vld1.32 {d0, d1}, [r1]!\n" + "vld1.32 {d2, d3}, [r1]!\n" + "vld1.32 {d4, d5}, [r1]!\n" + // Accumulate a column + "vadd.s32 q5, q5, q0\n" + "vadd.s32 q9, q9, q1\n" + "vadd.s32 q13, q13, q2\n" + + "add r0, %[dst_col_stride]\n" + // Load a column + "mov r1, r0\n" + "vld1.32 {d0, d1}, [r1]!\n" + "vld1.32 {d2, d3}, [r1]!\n" + "vld1.32 {d4, d5}, [r1]!\n" + // Accumulate a column + "vadd.s32 q6, q6, q0\n" + "vadd.s32 q10, q10, q1\n" + "vadd.s32 q14, q14, q2\n" + + "add r0, %[dst_col_stride]\n" + // Load a column + "mov r1, r0\n" + "vld1.32 {d0, d1}, [r1]!\n" + "vld1.32 {d2, d3}, [r1]!\n" + "vld1.32 {d4, d5}, [r1]!\n" + // Accumulate a column + "vadd.s32 q7, q7, q0\n" + "vadd.s32 q11, q11, q1\n" + "vadd.s32 q15, q15, q2\n" + + "store_result_NEONKernel12x4Depth2_%=:\n" + + "mov r0, %[dst_ptr]\n" + // Store a column + "mov r1, r0\n" + "vst1.32 {d8, d9}, [r1]!\n" + "vst1.32 {d16, d17}, [r1]!\n" + "vst1.32 {d24, d25}, [r1]!\n" + // Store a column + "add r0, %[dst_col_stride]\n" + "mov r1, r0\n" + "vst1.32 {d10, d11}, [r1]!\n" + "vst1.32 {d18, d19}, [r1]!\n" + "vst1.32 {d26, d27}, [r1]!\n" + // Store a column + "add r0, %[dst_col_stride]\n" + "mov r1, r0\n" + "vst1.32 {d12, d13}, [r1]!\n" + "vst1.32 {d20, d21}, [r1]!\n" + "vst1.32 {d28, d29}, [r1]!\n" + // Store a column + "add r0, %[dst_col_stride]\n" + "mov r1, r0\n" + "vst1.32 {d14, d15}, [r1]!\n" + "vst1.32 {d22, d23}, [r1]!\n" + "vst1.32 {d30, d31}, [r1]!\n" + : // outputs + [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), + [dst_ptr] "+r"(dst_ptr), + [run_depth] "+r"(run_depth) + : // inputs + [start_depth] "r"(start_depth), + [dst_col_stride] "r"(dst_col_stride) + : // clobbers + "cc", "memory", "r0", "r1", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + } +}; + +// Our main GEMV kernel. 
+struct NEONKernel8x1Depth4 : KernelBase { + typedef KernelFormat<KernelSideFormat<CellFormat<8, 4>, 1>, + KernelSideFormat<CellFormat<1, 4>, 1> > Format; + + const char* Name() const override { return "NEON, 8x1, straight, depth 4"; } + + void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride, + const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, + int start_depth, int run_depth) const override { + ScopedProfilingLabel label("optimized kernel"); + + assert(dst_row_stride == 1); + + asm volatile( + // Clear accumulator registers (see layout below) + "vmov.s32 q11, #0\n" + "vmov.s32 q12, q11\n" + + /* Main loop */ + + "loop_NEONKernel8x1Depth4_%=:\n" + + // Overview of register layout: + // + // A 4x1 cell of Rhs is stored in 16bit in d0. + // A 8x4 cell Lhs is stored in 16bit in d2--d9 (q1--q4). + // A block of accumulators of size 8x1 is stored in 32bit in q11--q12 + // + // +-----+ + // |d0[0]| + // +-----+ + // |d0[1]| + // +-----+ Rhs + // |d0[2]| + // +-----+ + // |d0[3]| + // +-----+ + // | | + // + // Lhs | | + // + // +--+--+--+--+ - - - - +-----+ + // |d2|d4|d6|d8| | q11 | + // |d2|d4|d6|d8| | q11 | + // |d2|d4|d6|d8| | q11 | + // |d2|d4|d6|d8| | q11 | + // +--+--+--+--+ - - - - +-----+ Accumulator + // |d3|d5|d7|d9| | q12 | + // |d3|d5|d7|d9| | q12 | + // |d3|d5|d7|d9| | q12 | + // |d3|d5|d7|d9| | q12 | + // +--+--+--+--+ - - - - +-----+ + + // Load 1 Rhs cell of size 4x1. + "vldr.32 d0, [%[rhs_ptr]]\n" + "add %[rhs_ptr], #4\n" + + // Load 1 Lhs cell of size 8x4. Each vld1 instruction loads 1 col. + "vld1.8 {d2}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d4}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d6}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d8}, [%[lhs_ptr]:64]!\n" + + // Expand Lhs/Rhs cells to 16 bit. + "vmovl.u8 q0, d0\n" // d1 is unused. + "vmovl.u8 q1, d2\n" + "vmovl.u8 q2, d4\n" + "vmovl.u8 q3, d6\n" + "vmovl.u8 q4, d8\n" + + // Multiply-accumulate, level of depth 0 + "vmlal.u16 q11, d2, d0[0]\n" + "vmlal.u16 q12, d3, d0[0]\n" + + // Multiply-accumulate, level of depth 1 + "vmlal.u16 q11, d4, d0[1]\n" + "vmlal.u16 q12, d5, d0[1]\n" + + // Multiply-accumulate, level of depth 2 + "vmlal.u16 q11, d6, d0[2]\n" + "vmlal.u16 q12, d7, d0[2]\n" + + // Multiply-accumulate, level of depth 3 + "vmlal.u16 q11, d8, d0[3]\n" + "vmlal.u16 q12, d9, d0[3]\n" + + // Loop. Decrement loop index (depth) by 4, since we just handled 4 + // levels + // of depth (Kernel::kDepth=4). + "subs %[run_depth], #4\n" + "bne loop_NEONKernel8x1Depth4_%=\n" + + /* end of main loop */ + + /* Accumulate our local accumulator registers into the destination block + */ + + "cmp %[start_depth], #0\n" + "beq store_result_NEONKernel8x1Depth4_%=\n" + + "mov r0, %[dst_ptr]\n" + // Load a column + "vld1.32 {d0, d1}, [r0]!\n" + "vld1.32 {d2, d3}, [r0]!\n" + + // Accumulate a column + "vadd.s32 q11, q11, q0\n" + "vadd.s32 q12, q12, q1\n" + + "store_result_NEONKernel8x1Depth4_%=:\n" + + "mov r0, %[dst_ptr]\n" + "vst1.32 {d22, d23}, [r0]!\n" + "vst1.32 {d24, d25}, [r0]!\n" + : // outputs + [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), + [run_depth] "+r"(run_depth) + : // inputs + [dst_col_stride] "r"(dst_col_stride), [start_depth] "r"(start_depth), + [dst_ptr] "r"(dst_ptr) + : // clobbers + "cc", "memory", "r0", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! 
+ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + } +}; + +// Another GEMV kernel, that performs best by itself; +// unfortunately GEMV is dominated by LHS packing and we haven't succeeded +// so far in making efficient packing code for this very wide (width 20) +// format. +struct NEONKernel20x1Depth4 : KernelBase { + typedef KernelFormat<KernelSideFormat<CellFormat<4, 4>, 5>, + KernelSideFormat<CellFormat<1, 4>, 1> > Format; + + const char* Name() const override { return "NEON, 20x1, straight, depth 4"; } + + void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride, + const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, + int start_depth, int run_depth) const override { + ScopedProfilingLabel label("optimized kernel"); + + assert(dst_row_stride == 1); + + asm volatile( + // Clear accumulator registers (see layout below) + "vmov.s32 q11, #0\n" + "vmov.s32 q12, q11\n" + "vmov.s32 q13, q11\n" + "vmov.s32 q14, q11\n" + "vmov.s32 q15, q11\n" + + /* Main loop */ + + "loop_NEONKernel20x1Depth4_%=:\n" + + // Overview of register layout: + // + // A 4x1 cell of Rhs is stored in 16bit in d0. + // 5 cells, of size 4x4 each, are stored in 16bit in d2--d21. + // A block of accumulators of size 20x1 is stored in 32bit + // in q11--q15. + // + // +-----+ + // |d0[0]| + // +-----+ + // |d0[1]| + // +-----+ Rhs + // |d0[2]| + // +-----+ + // |d0[3]| + // +-----+ + // | | + // + // Lhs | | + // + // +---+---+---+---+ - - - - +-----+ + // |d2 |d3 |d4 |d5 | | q11 | + // |d2 |d3 |d4 |d5 | | q11 | + // |d2 |d3 |d4 |d5 | | q11 | + // |d2 |d3 |d4 |d5 | | q11 | + // +---+---+---+---+ - - - - +-----+ + // |d6 |d7 |d8 |d9 | | q12 | + // |d6 |d7 |d8 |d9 | | q12 | + // |d6 |d7 |d8 |d9 | | q12 | + // |d6 |d7 |d8 |d9 | | q12 | + // +---+---+---+---+ - - - - +-----+ + // |d10|d11|d12|d13| | q13 | + // |d10|d11|d12|d13| | q13 | Accumulator + // |d10|d11|d12|d13| | q13 | + // |d10|d11|d12|d13| | q13 | + // +---+---+---+---+ - - - - +-----+ + // |d14|d15|d16|d17| | q14 | + // |d14|d15|d16|d17| | q14 | + // |d14|d15|d16|d17| | q14 | + // |d14|d15|d16|d17| | q14 | + // +---+---+---+---+ - - - - +-----+ + // |d18|d19|d20|d21| | q15 | + // |d18|d19|d20|d21| | q15 | + // |d18|d19|d20|d21| | q15 | + // |d18|d19|d20|d21| | q15 | + // +---+---+---+---+ - - - - +-----+ + + // Load 1 Rhs cell + "vldr.32 d0, [%[rhs_ptr]]\n" + "add %[rhs_ptr], #4\n" + + // Load 10 Lhs cells + "vld1.8 {d2}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d4}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d6}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d8}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d10}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d12}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d14}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d16}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d18}, [%[lhs_ptr]:64]!\n" + "vld1.8 {d20}, [%[lhs_ptr]:64]!\n" + + // Expand Lhs/Rhs cells to 16 bit. 
+ "vmovl.u8 q0, d0\n" + "vmovl.u8 q1, d2\n" + "vmovl.u8 q2, d4\n" + "vmovl.u8 q3, d6\n" + "vmovl.u8 q4, d8\n" + "vmovl.u8 q5, d10\n" + "vmovl.u8 q6, d12\n" + "vmovl.u8 q7, d14\n" + "vmovl.u8 q8, d16\n" + "vmovl.u8 q9, d18\n" + "vmovl.u8 q10, d20\n" + + // Multiply-accumulate, level of depth 0 + "vmlal.u16 q11, d2, d0[0]\n" + "vmlal.u16 q12, d6, d0[0]\n" + "vmlal.u16 q13, d10, d0[0]\n" + "vmlal.u16 q14, d14, d0[0]\n" + "vmlal.u16 q15, d18, d0[0]\n" + + // Multiply-accumulate, level of depth 1 + "vmlal.u16 q11, d3, d0[1]\n" + "vmlal.u16 q12, d7, d0[1]\n" + "vmlal.u16 q13, d11, d0[1]\n" + "vmlal.u16 q14, d15, d0[1]\n" + "vmlal.u16 q15, d19, d0[1]\n" + + // Multiply-accumulate, level of depth 2 + "vmlal.u16 q11, d4, d0[2]\n" + "vmlal.u16 q12, d8, d0[2]\n" + "vmlal.u16 q13, d12, d0[2]\n" + "vmlal.u16 q14, d16, d0[2]\n" + "vmlal.u16 q15, d20, d0[2]\n" + + // Multiply-accumulate, level of depth 3 + "vmlal.u16 q11, d5, d0[3]\n" + "vmlal.u16 q12, d9, d0[3]\n" + "vmlal.u16 q13, d13, d0[3]\n" + "vmlal.u16 q14, d17, d0[3]\n" + "vmlal.u16 q15, d21, d0[3]\n" + + // Loop. Decrement loop index (depth) by 4, since we just handled 4 + // levels + // of depth (Kernel::kDepth=4). + "subs %[run_depth], #4\n" + "bne loop_NEONKernel20x1Depth4_%=\n" + + /* end of main loop */ + + /* Accumulate our local accumulator registers into the destination block + */ + + "cmp %[start_depth], #0\n" + "beq store_result_NEONKernel20x1Depth4_%=\n" + + "mov r0, %[dst_ptr]\n" + // Load a column + "vld1.32 {d0, d1}, [r0]!\n" + "vld1.32 {d2, d3}, [r0]!\n" + "vld1.32 {d4, d5}, [r0]!\n" + "vld1.32 {d6, d7}, [r0]!\n" + "vld1.32 {d8, d9}, [r0]!\n" + + // Accumulate a column + "vadd.s32 q11, q11, q0\n" + "vadd.s32 q12, q12, q1\n" + "vadd.s32 q13, q13, q2\n" + "vadd.s32 q14, q14, q3\n" + "vadd.s32 q15, q15, q4\n" + + "store_result_NEONKernel20x1Depth4_%=:\n" + + "mov r0, %[dst_ptr]\n" + "vst1.32 {d22, d23}, [r0]!\n" + "vst1.32 {d24, d25}, [r0]!\n" + "vst1.32 {d26, d27}, [r0]!\n" + "vst1.32 {d28, d29}, [r0]!\n" + "vst1.32 {d30, d31}, [r0]!\n" + : // outputs + [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr), + [run_depth] "+r"(run_depth) + : // inputs + [dst_ptr] "r"(dst_ptr), [dst_col_stride] "r"(dst_col_stride), + [start_depth] "r"(start_depth) + : // clobbers + "cc", "memory", "r0", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + } +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_KERNEL_NEON_H_ diff --git a/internal/kernel_reference.h b/internal/kernel_reference.h new file mode 100644 index 0000000..2c65087 --- /dev/null +++ b/internal/kernel_reference.h @@ -0,0 +1,119 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// kernel_reference.h: a reference kernel for CPU architectures where we don't +// have optimized kernels yet. Also useful for testing, as it's templatized +// to have any arbitrary format, allowing tests to cover all sorts of corner +// cases. + +#ifndef GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_ +#define GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_ + +#include "internal/kernel.h" + +#include <cstring> +#include <cstdio> + +namespace gemmlowp { + +// This kernel is templatized in an arbitrary Format template parameter, +// allowing it to have any arbitrary format. +template <typename tFormat> +struct ReferenceKernel : KernelBase { + typedef tFormat Format; + + const char* Name() const override { + static char buf[256]; + snprintf(buf, sizeof(buf), + "reference(Lhs: %d cells %dx%d %s, Rhs: %d cells %dx%d %s)", + Format::Lhs::kCells, Format::Lhs::Cell::kWidth, + Format::Lhs::Cell::kDepth, + CellOrderName(Format::Lhs::Cell::kOrder), Format::Rhs::kCells, + Format::Rhs::Cell::kDepth, Format::Rhs::Cell::kWidth, + CellOrderName(Format::Rhs::Cell::kOrder)); + return buf; + } + + void Run(std::int32_t* dst_ptr, int dst_row_stride, int dst_col_stride, + const std::uint8_t* lhs_ptr, const std::uint8_t* rhs_ptr, + int start_depth, int run_depth) const override { + std::int32_t accumulator[Format::kRows * Format::kCols]; + memset(accumulator, 0, sizeof(accumulator)); + + const int run_depth_cells = run_depth / Format::kDepth; + + // The outer loop is over the depth dimension. + for (int dc = 0; dc < run_depth_cells; dc++) { + // The next two loops are over cells of the Lhs (stacked vertically), + // and over cells of the Rhs (stacked horizontally). + for (int rc = 0; rc < Format::Lhs::kCells; rc++) { + const std::uint8_t* lhs_cell_ptr = lhs_ptr + + (dc * Format::Lhs::kCells + rc) * + Format::Lhs::Cell::kWidth * + Format::kDepth; + for (int cc = 0; cc < Format::Rhs::kCells; cc++) { + const std::uint8_t* rhs_cell_ptr = rhs_ptr + + (dc * Format::Rhs::kCells + cc) * + Format::Rhs::Cell::kWidth * + Format::kDepth; + + // Now we are inside one cell of the Lhs and inside one cell + // of the Rhs, so the remaining inner loops are just + // traditional three loops of matrix multiplication. + for (int di = 0; di < Format::kDepth; di++) { + for (int ri = 0; ri < Format::Lhs::Cell::kWidth; ri++) { + for (int ci = 0; ci < Format::Rhs::Cell::kWidth; ci++) { + const std::uint8_t* lhs_coeff_ptr = + lhs_cell_ptr + + OffsetIntoCell<typename Format::Lhs::Cell>(ri, di); + const std::uint8_t* rhs_coeff_ptr = + rhs_cell_ptr + + OffsetIntoCell<typename Format::Rhs::Cell>(ci, di); + std::int32_t* accumulator_coeff_ptr = + accumulator + (ri + rc * Format::Lhs::Cell::kWidth) + + (ci + cc * Format::Rhs::Cell::kWidth) * Format::kRows; + *accumulator_coeff_ptr += + std::int32_t(*lhs_coeff_ptr) * std::int32_t(*rhs_coeff_ptr); + } + } + } + } + } + } + + if (start_depth == 0) { + // start_depth == 0 means we haven't accumulated anything yet, so we need + // to overwrite the accumulator, as it hasn't been initialized to zero. + for (int r = 0; r < Format::kRows; r++) { + for (int c = 0; c < Format::kCols; c++) { + dst_ptr[r * dst_row_stride + c * dst_col_stride] = + accumulator[r + c * Format::kRows]; + } + } + } else { + // We have already accumulated stuff, so we need to continue accumulating + // instead of just overwriting. 
+ for (int r = 0; r < Format::kRows; r++) { + for (int c = 0; c < Format::kCols; c++) { + dst_ptr[r * dst_row_stride + c * dst_col_stride] += + accumulator[r + c * Format::kRows]; + } + } + } + } +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_ diff --git a/internal/multi_thread_gemm.h b/internal/multi_thread_gemm.h new file mode 100644 index 0000000..b008034 --- /dev/null +++ b/internal/multi_thread_gemm.h @@ -0,0 +1,496 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// multi_thread_gemm.h: Multi-threaded GEMM entry point. +// Readers note: To understand this file, it is useful to first +// read and understand the much simpler single_thread_gemm.h. + +#ifndef GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ +#define GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ + +#include <pthread.h> +#include <unistd.h> +#include <vector> + +#include "internal/single_thread_gemm.h" + +namespace gemmlowp { + +// A BlockingCounter lets one thread to wait for N events to occur. +// This is how the master thread waits for all the worker threads +// to have finished working. +class BlockingCounter { + public: + BlockingCounter() + : cond_(PTHREAD_COND_INITIALIZER), + mutex_(PTHREAD_MUTEX_INITIALIZER), + count_(0) {} + + // Sets/resets the counter; initial_count is the number of + // decrementing events that the Wait() call will be waiting for. + void Reset(int initial_count) { + pthread_mutex_lock(&mutex_); + assert(count_ == 0); + count_ = initial_count; + pthread_mutex_unlock(&mutex_); + } + + // Decrements the counter; if the counter hits zero, signals + // the thread that was waiting for that, and returns true. + // Otherwise (if the decremented count is still nonzero), + // returns false. + bool DecrementCount() { + pthread_mutex_lock(&mutex_); + assert(count_ > 0); + count_--; + if (count_ == 0) { + pthread_cond_signal(&cond_); + } + bool retval = count_ == 0; + pthread_mutex_unlock(&mutex_); + return retval; + } + + // Waits for the N other threads (N having been set by Reset()) + // to hit the BlockingCounter. + void Wait() { + ScopedProfilingLabel label("BlockingCounter::Wait"); + pthread_mutex_lock(&mutex_); + while (count_) { + pthread_cond_wait(&cond_, &mutex_); + } + pthread_mutex_unlock(&mutex_); + } + + private: + pthread_cond_t cond_; + pthread_mutex_t mutex_; + int count_; +}; + +// A workload for a worker. +struct Task { + Task() : local_allocator(nullptr) {} + virtual ~Task() {} + virtual void Run() const = 0; + Allocator* local_allocator; +}; + +// A worker thread. +class Worker { + public: + enum class State { + ThreadStartup, // The initial state before the thread main loop runs. + Ready, // Is not working, has not yet received new work to do. + HasWork, // Has work to do. + ExitAsSoonAsPossible // Should exit at earliest convenience. 
+  };
+
+  explicit Worker(BlockingCounter* counter_to_decrement_when_ready)
+      : task_(nullptr),
+        state_cond_(PTHREAD_COND_INITIALIZER),
+        state_mutex_(PTHREAD_MUTEX_INITIALIZER),
+        state_(State::ThreadStartup),
+        counter_to_decrement_when_ready_(counter_to_decrement_when_ready) {
+    pthread_create(&thread_, nullptr, ThreadFunc, this);
+  }
+
+  ~Worker() {
+    ChangeState(State::ExitAsSoonAsPossible);
+    pthread_join(thread_, nullptr);
+  }
+
+  // Changes State; may be called from either the worker thread
+  // or the master thread; however, not all state transitions are legal,
+  // which is guarded by assertions.
+  void ChangeState(State new_state) {
+    ScopedProfilingLabel label("Worker::ChangeState");
+    pthread_mutex_lock(&state_mutex_);
+    assert(new_state != state_);
+    switch (state_) {
+      case State::ThreadStartup:
+        assert(new_state == State::Ready);
+        break;
+      case State::Ready:
+        assert(new_state == State::HasWork ||
+               new_state == State::ExitAsSoonAsPossible);
+        break;
+      case State::HasWork:
+        assert(new_state == State::Ready ||
+               new_state == State::ExitAsSoonAsPossible);
+        break;
+      default:
+        abort();
+    }
+    state_ = new_state;
+    pthread_cond_signal(&state_cond_);
+    if (state_ == State::Ready) {
+      counter_to_decrement_when_ready_->DecrementCount();
+    }
+    pthread_mutex_unlock(&state_mutex_);
+  }
+
+  // Thread entry point.
+  void ThreadFunc() {
+    ScopedProfilingLabel label("Worker::ThreadFunc");
+    RegisterCurrentThreadForProfiling();
+
+    ChangeState(State::Ready);
+
+    // Thread main loop
+    while (true) {
+      // Get a state to act on
+      pthread_mutex_lock(&state_mutex_);
+      switch (state_) {
+        case State::ExitAsSoonAsPossible:
+        case State::HasWork:
+          break;
+        case State::Ready:
+          // In the 'Ready' state, we have nothing to do but to wait until
+          // we switch to another state.
+          while (state_ == State::Ready) {
+            ScopedProfilingLabel label("Worker::ThreadFunc waiting");
+            pthread_cond_wait(&state_cond_, &state_mutex_);
+          }
+          break;
+        default:
+          abort();
+      }
+      State state_to_act_upon = state_;
+      pthread_mutex_unlock(&state_mutex_);
+
+      // We now have a state to act on, so act.
+      switch (state_to_act_upon) {
+        case State::HasWork:
+          // Got work to do! So do it, and then revert to 'Ready' state.
+          assert(task_);
+          task_->Run();
+          delete task_;
+          task_ = nullptr;
+          ChangeState(State::Ready);
+          break;
+        case State::ExitAsSoonAsPossible:
+          return;
+        default:
+          abort();
+      }
+    }
+  }
+
+  static void* ThreadFunc(void* arg) {
+    static_cast<Worker*>(arg)->ThreadFunc();
+    return nullptr;
+  }
+
+  // Called by the master thread to give this worker work to do.
+  // It is only legal to call this if the worker is currently in the
+  // 'Ready' state.
+  void StartWork(Task* task) {
+    assert(!task_);
+    task->local_allocator = &local_allocator_;
+    task_ = task;
+    assert(state_ == State::Ready);
+    ChangeState(State::HasWork);
+  }
+
+ private:
+  // The underlying thread.
+  pthread_t thread_;
+
+  // The task to be worked on.
+  const Task* task_;
+
+  // The condition variable and mutex guarding state changes.
+  pthread_cond_t state_cond_;
+  pthread_mutex_t state_mutex_;
+
+  // The state enum tells if we're currently working, waiting for work, etc.
+  State state_;
+
+  // Each thread has a local allocator so it can allocate temporary
+  // buffers without blocking the other threads.
+  Allocator local_allocator_;
+
+  // Pointer to the master thread's BlockingCounter object, used to notify
+  // the master thread when this worker switches to the 'Ready' state.
+ BlockingCounter* const counter_to_decrement_when_ready_; +}; + +// A very simple pool of workers, that only allows the very +// specific parallelization pattern that we use here: +// a fixed number of workers can be given work, and one then +// waits for all of them to finish. +class WorkersPool { + public: + WorkersPool() {} + + ~WorkersPool() { + for (auto w : workers_) { + delete w; + } + } + + BlockingCounter& counter_to_decrement_when_ready() { + return counter_to_decrement_when_ready_; + } + + // Give work to a specific worker. + void StartWorker(int index, Task* task_) { + assert(static_cast<std::size_t>(index) < workers_.size()); + workers_[index]->StartWork(task_); + } + + // Ensures that the pool has at least the given count of workers. + // If any new worker has to be created, this function waits for it to + // be ready. + void CreateWorkers(std::size_t workers_count) { + if (workers_.size() >= workers_count) { + return; + } + counter_to_decrement_when_ready_.Reset(workers_count - workers_.size()); + while (workers_.size() < workers_count) { + workers_.push_back(new Worker(&counter_to_decrement_when_ready_)); + } + counter_to_decrement_when_ready_.Wait(); + } + + private: + // copy construction disallowed + WorkersPool(const WorkersPool&) = delete; + + // The workers in this pool. They are owned by the pool: + // the pool creates workers and destroys them in its destructor. + std::vector<Worker*> workers_; + + // The BlockingCounter used to wait for the workers. + BlockingCounter counter_to_decrement_when_ready_; +}; + +// The task we use to implement a multi-threaded Gemm: a block of the +// RHS has been packed by the master thread; each worker thread +// then has to pack a block of the LHS and accumulate the Gemm of these +// packed LHS and RHS blocks. 
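As a minimal sketch of how the pieces above fit together (the helper DispatchAndWait and the DummyTask type here are hypothetical; the real driver is MultiThreadGemm further down in this file), the master thread's handshake with the pool might look like this:

  struct DummyTask : Task {
    void Run() const override { /* one worker's share of the work */ }
  };

  void DispatchAndWait(WorkersPool* pool, int n) {
    pool->CreateWorkers(n);                            // ensure n workers exist and are Ready
    pool->counter_to_decrement_when_ready().Reset(n);  // expect n 'Ready' notifications
    for (int i = 0; i < n; i++) {
      pool->StartWorker(i, new DummyTask);             // the worker deletes the task after running it
    }
    pool->counter_to_decrement_when_ready().Wait();    // block until every worker is Ready again
  }

GemmWithPackedRhsTask, defined next, is the concrete Task that carries one worker's share of the Gemm.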
+template <typename KernelFormat, typename Scalar, MapOrder LhsOrder,
+          MapOrder RhsOrder, MapOrder ResultOrder>
+struct GemmWithPackedRhsTask : Task {
+  GemmWithPackedRhsTask(
+      const KernelBase& _kernel, const MatrixMap<const Scalar, LhsOrder>& _lhs,
+      const PackedSideBlock<typename KernelFormat::Rhs>& _packed_rhs,
+      MatrixMap<Scalar, ResultOrder>* _result, int _lhs_offset, int _rhs_offset,
+      int _result_offset, int _result_mult_int, int _result_shift)
+      : kernel(_kernel),
+        lhs(_lhs),
+        packed_rhs(_packed_rhs),
+        result(*_result),
+        lhs_offset(_lhs_offset),
+        rhs_offset(_rhs_offset),
+        result_offset(_result_offset),
+        result_mult_int(_result_mult_int),
+        result_shift(_result_shift) {}
+
+  void Run() const override {
+    ScopedProfilingLabel label("GemmWithPackedRhsTask");
+
+    const int rows = result.rows();
+    const int cols = result.cols();
+    const int depth = lhs.cols();
+
+    BlockParams block_params;
+    block_params.Init<KernelFormat>(rows, cols, depth, 1);
+
+    PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(
+        Side::Lhs, local_allocator, block_params, rhs_offset);
+
+    PackedResultInt32 packed_result(local_allocator, block_params);
+
+    local_allocator->Commit();
+
+    for (int c = 0; c < cols; c += block_params.l2_cols) {
+      int cs = std::min(block_params.l2_cols, cols - c);
+
+      for (int r = 0; r < rows; r += block_params.l2_rows) {
+        int rs = std::min(block_params.l2_rows, rows - r);
+
+        PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));
+
+        Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs);
+
+        auto result_block = result.block(r, c, rs, cs);
+        UnpackResult(&result_block, packed_result, packed_lhs, packed_rhs,
+                     depth, result_offset, result_mult_int, result_shift);
+      }
+    }
+
+    local_allocator->Decommit();
+  }
+
+  const KernelBase& kernel;
+  const MatrixMap<const Scalar, LhsOrder> lhs;
+  const PackedSideBlock<typename KernelFormat::Rhs> packed_rhs;
+  MatrixMap<Scalar, ResultOrder> result;
+  int lhs_offset;
+  int rhs_offset;
+  int result_offset;
+  int result_mult_int;
+  int result_shift;
+};
+
+class MultiThreadGemmContext : public SingleThreadGemmContext {
+ public:
+  MultiThreadGemmContext() : max_num_threads_(0) {}
+
+  void set_max_num_threads(int n) { max_num_threads_ = n; }
+
+  int max_num_threads() const { return max_num_threads_; }
+
+  WorkersPool* workers_pool() { return &workers_pool_; }
+
+ protected:
+  // The workers pool used by MultiThreadGemm. Making
+  // this part of the context allows it to be persistent,
+  // avoiding recreating threads on every Gemm.
+  WorkersPool workers_pool_;
+
+  // The maximum number of worker threads to use (in addition
+  // to the master thread).
+  // The default value 0 means the default behavior of
+  // detecting the number of hardware threads. Nonzero values mean
+  // skipping and overriding hardware detection.
+  int max_num_threads_;
+};
+
+// Determines how many worker threads should be used for a given Gemm
+// operation.
+template <int KernelRows>
+inline int HowManyWorkers(MultiThreadGemmContext* context, int rows, int cols,
+                          int depth) {
+  // First check if the user set an explicit maximum number of threads.
+  int max_count = context->max_num_threads();
+  if (!max_count) {
+    // No user-set maximum number of threads, so we need to
+    // do some hardware detection.
+    // This is expensive to query so we do it only once.
+    // Too bad for dynamicness. Also, we don't use the c++11 standard getter
+    // because Google's coding style currently bans #include <thread>.
+ static const int hardware_threads_count = + static_cast<int>(sysconf(_SC_NPROCESSORS_CONF)); + + max_count = hardware_threads_count; + } + + // Basic calculation: take into account max pool size, and + // how many rows we have to feed our kernel. + int workers_count = std::min(max_count, CeilQuotient(rows, KernelRows)); + + // At this point for small products we already have workers_count==1 so + // we can avoid doing more work; otherwise, we still want to check + // that the cubic size (rows*cols*depth) is big enough to keep + // workers_ busy. + if (workers_count > 1) { + // Empirically determined value. + static const int min_cubic_size_per_thread = 32 * 1024; + + // We can only multiply two out of three sizes without risking overflow + int cols_times_depth = cols * depth; + + if (cols_times_depth < min_cubic_size_per_thread) { + // in that case, we can multiply by rows without risking overflow + int cubic_size = rows * cols_times_depth; + workers_count = std::min( + workers_count, CeilQuotient(cubic_size, min_cubic_size_per_thread)); + } + } + + assert(workers_count > 0 && workers_count <= max_count); + return workers_count; +} + +// The main multi-threaded Gemm function. +// To understand it, first read the code of SingleThreadedGemm(). +// The parallelization scheme used here is to have this master function +// pack a block of RHS and then start worker threads to pack a block of LHS +// each, and accumulate the corresponding products. +template <typename KernelFormat, typename Scalar, MapOrder LhsOrder, + MapOrder RhsOrder, MapOrder ResultOrder> +void MultiThreadGemm(MultiThreadGemmContext* context, const KernelBase& kernel, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + ScopedProfilingLabel label("gemmlowp::MultiThreadGemm"); + + assert(lhs.cols() == rhs.rows()); + + int rows = result->rows(); + int cols = result->cols(); + int depth = lhs.cols(); + + const int workers_count = + HowManyWorkers<KernelFormat::kRows>(context, rows, cols, depth); + if (workers_count == 1) { + return SingleThreadGemm<KernelFormat, Scalar, LhsOrder, RhsOrder, + ResultOrder>(context, kernel, lhs, rhs, result, + lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); + } + assert(workers_count > 1); + + Allocator* allocator = context->allocator(); + WorkersPool* workers_pool = context->workers_pool(); + + workers_pool->CreateWorkers(workers_count); + + BlockParams block_params; + block_params.Init<KernelFormat>(rows, cols, depth, workers_count); + + PackedSideBlock<typename KernelFormat::Rhs> packed_rhs( + Side::Rhs, allocator, block_params, lhs_offset); + allocator->Commit(); + + // We loop over large blocks of the RHS. + for (int c = 0; c < cols; c += block_params.l2_cols) { + int cs = std::min(block_params.l2_cols, cols - c); + + // Pack a large block of the RHS. + PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); + + // Give work to each worker. 
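      // For illustration (hypothetical numbers): with rows = 100,
      // KernelFormat::kRows = 12 and workers_count = 3, the partition computed
      // below is next_start_row = 36, 72, 100, i.e. row blocks of 36, 36 and
      // 28 rows. Every boundary is rounded up to a multiple of the kernel
      // height, so only the last block can have a ragged number of rows.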
+ int next_start_row = 0; + workers_pool->counter_to_decrement_when_ready().Reset(workers_count); + for (int w = 0; w < workers_count; w++) { + int start_row = next_start_row; + next_start_row = std::min( + rows, RoundUp<KernelFormat::kRows>(rows * (w + 1) / workers_count)); + + int block_rows = next_start_row - start_row; + auto lhs_block = lhs.block(start_row, 0, block_rows, depth); + auto result_block = result->block(start_row, c, block_rows, cs); + typedef GemmWithPackedRhsTask<KernelFormat, Scalar, LhsOrder, RhsOrder, + ResultOrder> TaskType; + + auto task = new TaskType(kernel, lhs_block, packed_rhs, &result_block, + lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); + workers_pool->StartWorker(w, task); + } + // Wait for the workers. + workers_pool->counter_to_decrement_when_ready().Wait(); + } + + allocator->Decommit(); +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_MULTI_THREAD_GEMM_H_ diff --git a/internal/pack.h b/internal/pack.h new file mode 100644 index 0000000..b37d505 --- /dev/null +++ b/internal/pack.h @@ -0,0 +1,350 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// pack.h: packing blocks of the LHS and RHS into the data layout +// that is expected by compute.h and eventually by kernels. +// Because this data layout depends on the kernel format, code here +// is templated in KernelLhsFormat/KernelRhsFormat. +// +// Readers note: an important theme around here is that we try hard +// to handle both Lhs and Rhs with a single piece of code. We indifferently +// refer to the Lhs and Rhs as a 'Side'. Instead of addressing matrices +// by (row, column) indices, we address them by (width, depth), as explained +// in kernel.h. This allows us to handle both Lhs and Rhs on an equal footing, +// at once. + +#ifndef GEMMLOWP_INTERNAL_PACK_H_ +#define GEMMLOWP_INTERNAL_PACK_H_ + +#include <cstring> + +#include "internal/block_params.h" +#include "internal/kernel.h" +#include "internal/common.h" +#include "internal/allocator.h" + +namespace gemmlowp { + +// A PackedSideBlock instance is a packed block of either the LHS or RHS +// (whence the generic 'Side' name). +// +// 'Packed' means that it is laid out in the storage order that +// is expected by the specified kernel format. From a block of the input +// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs() +// or PackRhs(). 
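One remark on the 'rank one update' buffers that travel with each packed block (inferred from how they are filled here and from the multipliers passed in multi_thread_gemm.h; their consumer, UnpackResult, is not part of this file): the packing stage also accumulates, for each width index, the sum of that row's (Lhs) or column's (Rhs) entries, scaled by the other side's offset (rhs_offset for the packed Lhs, lhs_offset for the packed Rhs). That is enough to apply the quantization offsets cheaply later, because

  sum_d (lhs(r,d) + lhs_offset) * (rhs(d,c) + rhs_offset)
    = sum_d lhs(r,d) * rhs(d,c)
      + rhs_offset * sum_d lhs(r,d)       // the packed Lhs block's rank_one_update[r]
      + lhs_offset * sum_d rhs(d,c)       // the packed Rhs block's rank_one_update[c]
      + lhs_offset * rhs_offset * depth

and the two middle terms only need the per-row / per-column sums that PackUnalignedCell below (and the NEON paths in pack_neon.h) compute on the fly.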
+template <typename KernelSideFormat> +class PackedSideBlock { + public: + PackedSideBlock(Side side, Allocator* allocator, + const BlockParams& block_params, + int rank_one_update_multiplier) + : allocator_(allocator), + rank_one_update_multiplier_(rank_one_update_multiplier), + pos_(0) { + GetSideBlockParams(side, ¶ms_, block_params); + data_handle_ = + allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth); + rank_one_update_handle_ = + allocator_->Reserve<std::int32_t>(params_.l2_width); + } + + ~PackedSideBlock() {} + + void seek_run(int start_width, int start_depth) const { + int kernel_run_depth = + std::min<int>(params_.l1_depth, params_.l2_depth - start_depth); + pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth; + } + + void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; } + + void seek_forward_n_cells(int n) const { + pos_ += n * KernelSideFormat::Cell::kSize; + } + + const std::uint8_t* current_data() const { + return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_; + } + + std::uint8_t* current_data() { + return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_; + } + + std::int32_t* rank_one_update() { + return allocator_->GetPointer<std::int32_t>(rank_one_update_handle_); + } + + const std::int32_t* rank_one_update() const { + return allocator_->GetPointer<const std::int32_t>(rank_one_update_handle_); + } + + std::int32_t rank_one_update_multiplier() const { + return rank_one_update_multiplier_; + } + + const SideBlockParams& params() const { return params_; } + + private: + // The block size parameters that this PackedSizeBlock follows. + // The L2 parameters determine its overall size, while the L1 parameters, + // together with the kernel format template parameter, determine + // the fine details of the storage/traversal order. + SideBlockParams params_; + + // Pointer to the allocator provided by the caller. Not owned. + // The Allocator is assumed to outlive the PackedSideBlock. + Allocator* const allocator_; + + // Handle on the buffer backing this packed block. Owned. + Allocator::Handle data_handle_; + + // Handle on the additional buffer backing the rank-one-update vector + // associated with this block. Owned. + Allocator::Handle rank_one_update_handle_; + + // The constant multiplier of the rank one update vector. + std::int32_t rank_one_update_multiplier_; + + // pos_ is the current position in the buffer, which we access + // sequentially, like a file. + // The idea is that we pack data in the same order as it is + // going to be traversed during the computation, which for + // cache-friendliness reasons is complicated to random-access, + // as the offsets calculations would be intricate. So we + // give up random-access addressing, and instead content ourselves + // with sequential access. + // + // pos_ is mutable because during the computation we will want to + // be able to iterate on the data in a const PackedSideBlock. + mutable int pos_; +}; + +// WidthMajor and DepthMajor are custom phrases modelled after the +// standard terminology 'row-major' and 'column-major'. Their meaning +// should be transparent once one has read the explanation in kernel.h: +// for example, in the Lhs, the 'width' dimension is the rows dimension, +// so there WidthMajor means RowMajor, while in the Rhs it is the opposite. +// Another way to put it: WidthMajor means that contiguous storage is used +// for entries having the same 'width' index. 
+enum class SideMapOrder { WidthMajor, DepthMajor }; + +// Similar to MatrixMap from map.h, but in terms of width/depth instead of +// rows/columns. Used to address blocks of the input LHS/RHS matrices when +// packing them. +template <typename tScalar, SideMapOrder tOrder> +class SideMap { + public: + typedef tScalar Scalar; + static const SideMapOrder kOrder = tOrder; + + SideMap(Scalar* data, int width, int depth, int stride) + : data_(data), width_(width), depth_(depth), stride_(stride) {} + + SideMap(const SideMap& other) + : data_(other.data_), + width_(other.width_), + depth_(other.depth_), + stride_(other.stride_) {} + + int width() const { return width_; } + int depth() const { return depth_; } + int stride() const { return stride_; } + int width_stride() const { + return kOrder == SideMapOrder::DepthMajor ? 1 : stride_; + } + int depth_stride() const { + return kOrder == SideMapOrder::WidthMajor ? 1 : stride_; + } + Scalar* data() const { return data_; } + Scalar* data(int w, int d) const { + return data_ + w * width_stride() + d * depth_stride(); + } + Scalar operator()(int w, int d) const { return *data(w, d); } + Scalar& operator()(int w, int d) { return *data(w, d); } + + SideMap block(int start_width, int start_depth, int block_width, + int block_depth) const { + assert(start_width >= 0); + assert(start_width + block_width <= width_); + assert(start_depth >= 0); + assert(start_depth + block_depth <= depth_); + + return SideMap(data(start_width, start_depth), block_width, block_depth, + stride_); + } + + private: + Scalar* data_; // not owned. + int width_, depth_, stride_; +}; + +// Generic (slow) packing code. +template <typename SrcMapType, typename KernelSideFormat> +class PackSideBlockImplGeneric { + public: + typedef typename KernelSideFormat::Cell CellFormat; + static const int kLhsCells = KernelSideFormat::kCells; + static const int kCellWidth = CellFormat::kWidth; + static const int kKernelWidth = CellFormat::kWidth * kLhsCells; + static const int kCellDepth = CellFormat::kDepth; + + virtual ~PackSideBlockImplGeneric() {} + + PackSideBlockImplGeneric(PackedSideBlock<KernelSideFormat>* packed_side_block, + const SrcMapType& src_map) + : packed_side_block_(packed_side_block), src_map_(src_map) {} + + // The public entry point to pack a block. + void PackL2() { + memset(packed_side_block_->rank_one_update(), 0, + sizeof(std::int32_t) * packed_side_block_->params().l2_width); + for (int d = 0; d < src_map_.depth(); + d += packed_side_block_->params().l1_depth) { + int ds = std::min<int>(packed_side_block_->params().l1_depth, + src_map_.depth() - d); + + for (int w = 0; w < src_map_.width(); + w += packed_side_block_->params().l1_width) { + int ws = std::min<int>(packed_side_block_->params().l1_width, + src_map_.width() - w); + + PackL1(w, ws, d, ds); + } + } + } + + PackedSideBlock<KernelSideFormat>* packed_side_block() const { + return packed_side_block_; + } + + const SrcMapType& src_map() const { return src_map_; } + + protected: + // PackRun packs only a run i.e. is the inner loop in the depth dimension. + // This is what subclasses may override to provide optimized code paths. + // Optimized implementations may still fall back to this generic code + // to handle unaligned boundaries. 
+ virtual void PackRun(int start_width, int width, int start_depth, int depth) { + for (int d = 0; d < depth; d += kDefaultCacheLineSize) { + for (int w = 0; w < width; w++) { + Prefetch(src_map_.data(start_width + w, start_depth + d)); + } + } + for (int d = 0; d < depth; d += kCellDepth) { + // The next loop's boundary is kKernelWidth, not width, + // because we always pack whole kernels so that the + // compute stage doesn't need to worry about unaligned kernel sizes. + for (int w = 0; w < +kKernelWidth; w += kCellWidth) { + PackUnalignedCell(start_width + w, start_depth + d); + } + } + } + + private: + // The intermediate-level loops, between PackL2 and PackRun. + void PackL1(int start_width, int width, int start_depth, int depth) { + for (int w = 0; w < width; w += kKernelWidth) { + int ws = std::min(+kKernelWidth, width - w); + packed_side_block_->seek_run(start_width + w, start_depth); + PackRun(start_width + w, ws, start_depth, depth); + } + } + + // Reference un-optimized implementation of the packing of a cell; + // also serves as a fallback to handle unaligned edges. + void PackUnalignedCell(int start_width, int start_depth) { + std::uint8_t* dst_ptr = packed_side_block_->current_data(); + std::int32_t* dst_rank_one_update = + packed_side_block_->rank_one_update() + start_width; + std::int32_t dst_rank_one_update_multiplier = + packed_side_block_->rank_one_update_multiplier(); + + memset(dst_ptr, 0, sizeof(std::uint8_t) * CellFormat::kSize); + + if (start_width < src_map_.width() && start_depth < src_map_.depth()) { + int width = std::min<int>(+kCellWidth, src_map_.width() - start_width); + int depth = std::min<int>(+kCellDepth, src_map_.depth() - start_depth); + auto src_block = src_map_.block(start_width, start_depth, width, depth); + + for (int w = 0; w < width; w++) { + for (int d = 0; d < depth; d++) { + std::uint8_t s = src_block(w, d); + dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = s; + dst_rank_one_update[w] += s * dst_rank_one_update_multiplier; + } + } + } + + packed_side_block_->seek_next_cell(); + } + + // The PackedSideBlock being packed, i.e. the 'destination'. + PackedSideBlock<KernelSideFormat>* const packed_side_block_; + + // A map on the block of the original matrix block being packed, + // i.e. the 'source'. + const SrcMapType& src_map_; +}; + +// The packing code that we actually use. Defaults to using the above +// generic code; optimized paths can be inserted by specializing this +// template. See e.g. pack_neon.h. 
+template <typename SrcMapType, typename KernelSideFormat> +class PackSideBlockImpl + : public PackSideBlockImplGeneric<SrcMapType, KernelSideFormat> { + public: + typedef PackSideBlockImplGeneric<SrcMapType, KernelSideFormat> Base; + + PackSideBlockImpl(PackedSideBlock<KernelSideFormat>* packed_side_block, + const SrcMapType& src_map) + : Base(packed_side_block, src_map) {} +}; + +// Packs a block of the input LHS matrix, into a PackedSideBlock +template <typename KernelSideFormat, typename MatrixMapType> +void PackLhs(PackedSideBlock<KernelSideFormat>* dst, const MatrixMapType& src) { + ScopedProfilingLabel label("pack LHS"); + static_assert(MatrixMapType::kOrder == MapOrder::RowMajor, + "only row-major LHS is supported at the moment."); + static const SideMapOrder kSideMapOrder = SideMapOrder::WidthMajor; + typedef typename MatrixMapType::Scalar Scalar; + typedef SideMap<Scalar, kSideMapOrder> SideMapType; + SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride()); + typedef PackSideBlockImpl<SideMapType, KernelSideFormat> ImplType; + ImplType impl(dst, src_side_map); + impl.PackL2(); +} + +// Packs a block of the input RHS matrix, into a PackedSideBlock +template <typename KernelSideFormat, typename MatrixMapType> +void PackRhs(PackedSideBlock<KernelSideFormat>* dst, const MatrixMapType& src) { + ScopedProfilingLabel label("pack RHS"); + static_assert(MatrixMapType::kOrder == MapOrder::ColMajor, + "only col-major RHS is supported at the moment."); + static const SideMapOrder kSideMapOrder = SideMapOrder::WidthMajor; + typedef typename MatrixMapType::Scalar Scalar; + typedef SideMap<Scalar, kSideMapOrder> SideMapType; + SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride()); + typedef PackSideBlockImpl<SideMapType, KernelSideFormat> ImplType; + ImplType impl(dst, src_side_map); + impl.PackL2(); +} + +} // namespace gemmlowp + +#ifdef GEMMLOWP_NEON +#include "internal/pack_neon.h" +#endif + +#endif // GEMMLOWP_INTERNAL_PACK_H_ diff --git a/internal/pack_neon.h b/internal/pack_neon.h new file mode 100644 index 0000000..c437489 --- /dev/null +++ b/internal/pack_neon.h @@ -0,0 +1,938 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// pack_neon.h: optimized NEON specializations of the templates in pack.h. + +#ifndef GEMMLOWP_INTERNAL_PACK_NEON_H_ +#define GEMMLOWP_INTERNAL_PACK_NEON_H_ + +#include "internal/pack.h" + +#include <arm_neon.h> + +namespace gemmlowp { + +// Specialization for 3 Cells of width 4, depth 2. +// This is the LHS format used by NEONKernel12x4Depth2. 
+typedef KernelSideFormat<CellFormat<4, 2>, 3> SideFormat3Cells4x2; +template <typename SrcMapType> +class PackSideBlockImpl<SrcMapType, SideFormat3Cells4x2> + : public PackSideBlockImplGeneric<SrcMapType, SideFormat3Cells4x2> { + public: + typedef SideFormat3Cells4x2 SideFormat; + typedef PackSideBlockImplGeneric<SrcMapType, SideFormat> Base; + + PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block, + const SrcMapType& src_map) + : Base(packed_side_block, src_map) {} + + protected: + static const int KernelRows = SideFormat::kWidth; + + virtual void PackRun(int start_width, int width, int start_depth, int depth) { + // Fall back to generic path for packing too narrow runs. + if (width < SideFormat::kWidth) { + Base::PackRun(start_width, width, start_depth, depth); + return; + } + + const std::uint8_t* src_ptr = + Base::src_map().data(start_width, start_depth); + const int stride = Base::src_map().stride(); + assert(src_ptr + stride == + Base::src_map().data(start_width + 1, start_depth)); + assert(src_ptr + 1 == Base::src_map().data(start_width, start_depth + 1)); + + // Prefetch data. + for (int d = 0; d < depth; d += kDefaultCacheLineSize) { + for (int i = 0; i < KernelRows; i++) { + Prefetch(src_ptr + i * stride + d); + } + } + + const int AlignedDepth16 = RoundDown<16>(depth); + if (AlignedDepth16) { + // Fast inner loop for handling multiples of 16 levels of depth + ScopedProfilingLabel label("optimized kernel"); + + std::int32_t* rank_one_update_ptr = + Base::packed_side_block()->rank_one_update() + start_width; + + std::uint8_t* dst_ptr = Base::packed_side_block()->current_data(); + const std::uint8_t* dst_end_ptr = dst_ptr + KernelRows * AlignedDepth16; + + __attribute__((aligned(32))) std::uint8_t buf[KernelRows * 16]; + __attribute__((aligned(16))) std::int32_t sumsbuf[12]; + asm volatile( + "mov r4, %[src_ptr]\n" + "mov r3, %[dst_ptr]\n" + + // We will accumulate the rank_one_update sums in q12--q14. + "vmov.s32 q12, #0\n" + "vmov.s32 q13, q12\n" + "vmov.s32 q14, q12\n" + + // Main loop. + "loop_PackSideBlockImplNEON4x3x2_%=:\n" + + // Load a 12x16 block into the 12 registers q0--q11. + // So each of these 12 registers contains 16 entries of + // one line, and there are 12 lines being processed. + "mov r0, r4\n" + "add r4, #16\n" + "vld1.8 {d0,d1}, [r0], %[stride]\n" + "vld1.8 {d2,d3}, [r0], %[stride]\n" + "vld1.8 {d4,d5}, [r0], %[stride]\n" + "vld1.8 {d6,d7}, [r0], %[stride]\n" + "vld1.8 {d8,d9}, [r0], %[stride]\n" + "vld1.8 {d10,d11}, [r0], %[stride]\n" + "vld1.8 {d12,d13}, [r0], %[stride]\n" + "vld1.8 {d14,d15}, [r0], %[stride]\n" + "vld1.8 {d16,d17}, [r0], %[stride]\n" + "vld1.8 {d18,d19}, [r0], %[stride]\n" + "vld1.8 {d20,d21}, [r0], %[stride]\n" + "vld1.8 {d22,d23}, [r0], %[stride]\n" + + // The CellOrder is the opposite of the MapOrder here + // so we need to transpose the data in each cell. + // We do so using an auxiliary buffer and the vst4 instruction, + // which takes 4 registers and stores them interleaved. + "mov r1, %[buf]\n" + "vst4.8 {d0, d2, d4, d6}, [r1:256]!\n" + "vst4.8 {d1, d3, d5, d7}, [r1:256]!\n" + "vst4.8 {d8, d10, d12, d14}, [r1:256]!\n" + "vst4.8 {d9, d11, d13, d15}, [r1:256]!\n" + "vst4.8 {d16, d18, d20, d22}, [r1:256]!\n" + "vst4.8 {d17, d19, d21, d23}, [r1:256]!\n" + + // Reload the data from our auxiliary buffer back into + // the same regisers; it is now transposed so each of the + // 24 d-registers from d0 to d23 now contains one cell, + // just not yet in the right sequence. We will still have + // to permute those cells. 
+ "mov r1, %[buf]\n" + "vld1.8 {d0,d1,d2,d3}, [r1:256]!\n" + "vld1.8 {d4,d5,d6,d7}, [r1:256]!\n" + "vld1.8 {d8,d9,d10,d11}, [r1:256]!\n" + "vld1.8 {d12,d13,d14,d15}, [r1:256]!\n" + "vld1.8 {d16,d17,d18,d19}, [r1:256]!\n" + "vld1.8 {d20,d21,d22,d23}, [r1:256]!\n" + + // Store these cells back to memory, now in the right + // sequence. + "vst1.8 d0, [r3:64]!\n" + "vst1.8 d8, [r3:64]!\n" + "vst1.8 d16, [r3:64]!\n" + "vst1.8 d1, [r3:64]!\n" + "vst1.8 d9, [r3:64]!\n" + "vst1.8 d17, [r3:64]!\n" + "vst1.8 d2, [r3:64]!\n" + "vst1.8 d10, [r3:64]!\n" + "vst1.8 d18, [r3:64]!\n" + "vst1.8 d3, [r3:64]!\n" + "vst1.8 d11, [r3:64]!\n" + "vst1.8 d19, [r3:64]!\n" + "vst1.8 d4, [r3:64]!\n" + "vst1.8 d12, [r3:64]!\n" + "vst1.8 d20, [r3:64]!\n" + "vst1.8 d5, [r3:64]!\n" + "vst1.8 d13, [r3:64]!\n" + "vst1.8 d21, [r3:64]!\n" + "vst1.8 d6, [r3:64]!\n" + "vst1.8 d14, [r3:64]!\n" + "vst1.8 d22, [r3:64]!\n" + "vst1.8 d7, [r3:64]!\n" + "vst1.8 d15, [r3:64]!\n" + "vst1.8 d23, [r3:64]!\n" + + // Now we are done packing this 12x16 block. We still have + // to accumulate the rank-one-update sums for it. + + // Add 8-bit values pair-wise into 16-bit values. + "vaddl.u8 q0, d0, d1\n" + "vaddl.u8 q1, d2, d3\n" + "vaddl.u8 q2, d4, d5\n" + "vaddl.u8 q3, d6, d7\n" + "vaddl.u8 q4, d8, d9\n" + "vaddl.u8 q5, d10, d11\n" + "vaddl.u8 q6, d12, d13\n" + "vaddl.u8 q7, d14, d15\n" + "vaddl.u8 q8, d16, d17\n" + "vaddl.u8 q9, d18, d19\n" + "vaddl.u8 q10, d20, d21\n" + "vaddl.u8 q11, d22, d23\n" + + // Add 16-bit values pair-wise into 32-bit values. + "vaddl.u16 q0, d0, d1\n" + "vaddl.u16 q1, d2, d3\n" + "vaddl.u16 q2, d4, d5\n" + "vaddl.u16 q3, d6, d7\n" + "vaddl.u16 q4, d8, d9\n" + "vaddl.u16 q5, d10, d11\n" + "vaddl.u16 q6, d12, d13\n" + "vaddl.u16 q7, d14, d15\n" + "vaddl.u16 q8, d16, d17\n" + "vaddl.u16 q9, d18, d19\n" + "vaddl.u16 q10, d20, d21\n" + "vaddl.u16 q11, d22, d23\n" + + // Accumulate the 32-bit sums into our accumulators q12--q14. + "vadd.s32 q12, q12, q0\n" + "vadd.s32 q13, q13, q4\n" + "vadd.s32 q14, q14, q8\n" + "vadd.s32 q12, q12, q1\n" + "vadd.s32 q13, q13, q5\n" + "vadd.s32 q14, q14, q9\n" + "vadd.s32 q12, q12, q2\n" + "vadd.s32 q13, q13, q6\n" + "vadd.s32 q14, q14, q10\n" + "vadd.s32 q12, q12, q3\n" + "vadd.s32 q13, q13, q7\n" + "vadd.s32 q14, q14, q11\n" + + // End of main loop. + "cmp r3, %[dst_end_ptr]\n" + "bne loop_PackSideBlockImplNEON4x3x2_%=\n" + + // Store our rank-one-update accumulator registers to the + // sums buffer. + "mov r0, %[sumsbuf]\n" + "vst1.32 {d24, d25}, [r0:128]!\n" + "vst1.32 {d26, d27}, [r0:128]!\n" + "vst1.32 {d28, d29}, [r0:128]!\n" + + : // no outputs + : // inputs + [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), + [dst_end_ptr] "r"(dst_end_ptr), [stride] "r"(stride), [buf] "r"(buf), + [sumsbuf] "r"(sumsbuf) + : // clobbers + "cc", "memory", "r0", "r1", "r3", "r4", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + + // Accumulate the final rank_one_update vector. 
+ int32x4x3_t sums; + sums.val[0] = vld1q_s32(sumsbuf); + sums.val[1] = vld1q_s32(sumsbuf + 4); + sums.val[2] = vld1q_s32(sumsbuf + 8); + sums.val[0] = vmulq_n_s32( + sums.val[0], Base::packed_side_block()->rank_one_update_multiplier()); + sums.val[1] = vmulq_n_s32( + sums.val[1], Base::packed_side_block()->rank_one_update_multiplier()); + sums.val[2] = vmulq_n_s32( + sums.val[2], Base::packed_side_block()->rank_one_update_multiplier()); + + int32x4x3_t old_sums; + old_sums.val[0] = vld1q_s32(rank_one_update_ptr + 0); + old_sums.val[1] = vld1q_s32(rank_one_update_ptr + 4); + old_sums.val[2] = vld1q_s32(rank_one_update_ptr + 8); + sums.val[0] = vaddq_s32(sums.val[0], old_sums.val[0]); + sums.val[1] = vaddq_s32(sums.val[1], old_sums.val[1]); + sums.val[2] = vaddq_s32(sums.val[2], old_sums.val[2]); + vst1q_s32(rank_one_update_ptr + 0, sums.val[0]); + vst1q_s32(rank_one_update_ptr + 4, sums.val[1]); + vst1q_s32(rank_one_update_ptr + 8, sums.val[2]); + } + + // We are done handling groups of 16 levels of depth; there may be + // a leftover for which we use the generic path. + Base::packed_side_block()->seek_forward_n_cells( + SideFormat::kCells * AlignedDepth16 / SideFormat::kDepth); + Base::PackRun(start_width, width, start_depth + AlignedDepth16, + depth - AlignedDepth16); + } +}; + +// Specialization for 5 Cells of width 4, depth 4. +// This is the LHS format used by NEONKernel20x1Depth4. +typedef KernelSideFormat<CellFormat<4, 4>, 5> SideFormat5Cells4x4; +template <typename LhsMapType> +class PackSideBlockImpl<LhsMapType, SideFormat5Cells4x4> + : public PackSideBlockImplGeneric<LhsMapType, SideFormat5Cells4x4> { + public: + typedef SideFormat5Cells4x4 SideFormat; + typedef PackSideBlockImplGeneric<LhsMapType, SideFormat> Base; + + PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block, + const LhsMapType& src_map) + : Base(packed_side_block, src_map) {} + + protected: + virtual void PackRun(int start_width, int width, int start_depth, int depth) { + // Fall back to generic path for packing too narrow runs. + if (width < SideFormat::kWidth) { + Base::PackRun(start_width, width, start_depth, depth); + return; + } + + const std::uint8_t* src_ptr = + Base::src_map().data(start_width, start_depth); + const int stride = Base::src_map().stride(); + assert(src_ptr + stride == + Base::src_map().data(start_width + 1, start_depth)); + assert(src_ptr + 1 == Base::src_map().data(start_width, start_depth + 1)); + + // Prefetch data. + for (int d = 0; d < depth; d += kDefaultCacheLineSize) { + for (int i = 0; i < SideFormat::kWidth; i++) { + Prefetch(src_ptr + i * stride + d); + } + } + + const int AlignedDepth8 = RoundDown<8>(depth); + if (AlignedDepth8) { + // Fast inner loop for handling multiples of 8 levels of depth + ScopedProfilingLabel label("optimized kernel"); + + std::int32_t* rank_one_update_ptr = + Base::packed_side_block()->rank_one_update() + start_width; + + std::uint8_t* dst_ptr = Base::packed_side_block()->current_data(); + const std::uint8_t* dst_end_ptr = + dst_ptr + SideFormat::kWidth * AlignedDepth8; + + __attribute__((aligned(32))) std::uint8_t buf[SideFormat::kWidth * 8]; + __attribute__((aligned(16))) std::int32_t sumsbuf[20]; + asm volatile( + "mov r4, %[src_ptr]\n" + "mov r3, %[dst_ptr]\n" + + // We will accumulate the rank_one_update sums in q10--q14. + "vmov.s32 q10, #0\n" + "vmov.s32 q11, q10\n" + "vmov.s32 q12, q10\n" + "vmov.s32 q13, q10\n" + "vmov.s32 q14, q10\n" + + // Main loop. 
+ "loop_PackSideBlockImplNEON4x5x4_%=:\n" + + // Load a 20x8 block into the 20 registers d0--d19. + // So each of these 20 registers contains 8 entries of + // one line, and there are 20 lines being processed. + "mov r0, r4\n" + "add r4, #8\n" + "vld1.8 d0, [r0], %[stride]\n" + "vld1.8 d1, [r0], %[stride]\n" + "vld1.8 d2, [r0], %[stride]\n" + "vld1.8 d3, [r0], %[stride]\n" + "vld1.8 d4, [r0], %[stride]\n" + "vld1.8 d5, [r0], %[stride]\n" + "vld1.8 d6, [r0], %[stride]\n" + "vld1.8 d7, [r0], %[stride]\n" + "vld1.8 d8, [r0], %[stride]\n" + "vld1.8 d9, [r0], %[stride]\n" + "vld1.8 d10, [r0], %[stride]\n" + "vld1.8 d11, [r0], %[stride]\n" + "vld1.8 d12, [r0], %[stride]\n" + "vld1.8 d13, [r0], %[stride]\n" + "vld1.8 d14, [r0], %[stride]\n" + "vld1.8 d15, [r0], %[stride]\n" + "vld1.8 d16, [r0], %[stride]\n" + "vld1.8 d17, [r0], %[stride]\n" + "vld1.8 d18, [r0], %[stride]\n" + "vld1.8 d19, [r0], %[stride]\n" + + // The CellOrder is the opposite of the MapOrder here + // so we need to transpose the data in each cell. + // We do so using an auxiliary buffer and the vst4 instruction, + // which takes 4 registers and stores them interleaved. + "mov r1, %[buf]\n" + "vst4.8 {d0, d1, d2, d3}, [r1:256]!\n" + "vst4.8 {d4, d5, d6, d7}, [r1:256]!\n" + "vst4.8 {d8, d9, d10, d11}, [r1:256]!\n" + "vst4.8 {d12, d13, d14, d15}, [r1:256]!\n" + "vst4.8 {d16, d17, d18, d19}, [r1:256]!\n" + + // Reload the data from our auxiliary buffer back into + // the same regisers; it is now transposed so each of the + // 20 d-registers from d0 to d19 now contains half of one cell, + // just not yet in the right sequence. We will still have + // to permute those cells. + "mov r1, %[buf]\n" + "vld1.8 {d0,d1,d2,d3}, [r1:256]!\n" + "vld1.8 {d4,d5,d6,d7}, [r1:256]!\n" + "vld1.8 {d8,d9,d10,d11}, [r1:256]!\n" + "vld1.8 {d12,d13,d14,d15}, [r1:256]!\n" + "vld1.8 {d16,d17,d18,d19}, [r1:256]!\n" + + // Store these cells back to memory, now in the right + // sequence. + "vst1.8 d0, [r3:64]!\n" + "vst1.8 d1, [r3:64]!\n" + "vst1.8 d4, [r3:64]!\n" + "vst1.8 d5, [r3:64]!\n" + "vst1.8 d8, [r3:64]!\n" + "vst1.8 d9, [r3:64]!\n" + "vst1.8 d12, [r3:64]!\n" + "vst1.8 d13, [r3:64]!\n" + "vst1.8 d16, [r3:64]!\n" + "vst1.8 d17, [r3:64]!\n" + "vst1.8 d2, [r3:64]!\n" + "vst1.8 d3, [r3:64]!\n" + "vst1.8 d6, [r3:64]!\n" + "vst1.8 d7, [r3:64]!\n" + "vst1.8 d10, [r3:64]!\n" + "vst1.8 d11, [r3:64]!\n" + "vst1.8 d14, [r3:64]!\n" + "vst1.8 d15, [r3:64]!\n" + "vst1.8 d18, [r3:64]!\n" + "vst1.8 d19, [r3:64]!\n" + + // Now we are done packing this 20x8 block. We still have + // to accumulate the rank-one-update sums for it. + + // Add 8-bit values pair-wise into 16-bit values. + "vaddl.u8 q0, d0, d1\n" + "vaddl.u8 q1, d2, d3\n" + "vaddl.u8 q2, d4, d5\n" + "vaddl.u8 q3, d6, d7\n" + "vaddl.u8 q4, d8, d9\n" + "vaddl.u8 q5, d10, d11\n" + "vaddl.u8 q6, d12, d13\n" + "vaddl.u8 q7, d14, d15\n" + "vaddl.u8 q8, d16, d17\n" + "vaddl.u8 q9, d18, d19\n" + + // Add 16-bit values pair-wise into 32-bit values. + "vaddl.u16 q0, d0, d1\n" + "vaddl.u16 q1, d2, d3\n" + "vaddl.u16 q2, d4, d5\n" + "vaddl.u16 q3, d6, d7\n" + "vaddl.u16 q4, d8, d9\n" + "vaddl.u16 q5, d10, d11\n" + "vaddl.u16 q6, d12, d13\n" + "vaddl.u16 q7, d14, d15\n" + "vaddl.u16 q8, d16, d17\n" + "vaddl.u16 q9, d18, d19\n" + + // Accumulate the 32-bit sums into our accumulators q10--q14. 
+ "vadd.s32 q10, q10, q0\n" + "vadd.s32 q11, q11, q2\n" + "vadd.s32 q12, q12, q4\n" + "vadd.s32 q13, q13, q6\n" + "vadd.s32 q14, q14, q8\n" + "vadd.s32 q10, q10, q1\n" + "vadd.s32 q11, q11, q3\n" + "vadd.s32 q12, q12, q5\n" + "vadd.s32 q13, q13, q7\n" + "vadd.s32 q14, q14, q9\n" + + // End of main loop. + "cmp r3, %[dst_end_ptr]\n" + "bne loop_PackSideBlockImplNEON4x5x4_%=\n" + + // Store our rank-one-update accumulator registers to the + // sums buffer. + "mov r0, %[sumsbuf]\n" + "vst1.32 {d20, d21}, [r0:128]!\n" + "vst1.32 {d22, d23}, [r0:128]!\n" + "vst1.32 {d24, d25}, [r0:128]!\n" + "vst1.32 {d26, d27}, [r0:128]!\n" + "vst1.32 {d28, d29}, [r0:128]!\n" + : // no outputs + : // inputs + [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), + [dst_end_ptr] "r"(dst_end_ptr), [stride] "r"(stride), [buf] "r"(buf), + [sumsbuf] "r"(sumsbuf) + : // clobbers + "cc", "memory", "r0", "r1", "r3", "r4", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + + // Accumulate the final rank_one_update vector. + int32x4_t sums[5]; + sums[0] = vld1q_s32(sumsbuf); + sums[1] = vld1q_s32(sumsbuf + 4); + sums[2] = vld1q_s32(sumsbuf + 8); + sums[3] = vld1q_s32(sumsbuf + 12); + sums[4] = vld1q_s32(sumsbuf + 16); + + sums[0] = vmulq_n_s32( + sums[0], Base::packed_side_block()->rank_one_update_multiplier()); + sums[1] = vmulq_n_s32( + sums[1], Base::packed_side_block()->rank_one_update_multiplier()); + sums[2] = vmulq_n_s32( + sums[2], Base::packed_side_block()->rank_one_update_multiplier()); + sums[3] = vmulq_n_s32( + sums[3], Base::packed_side_block()->rank_one_update_multiplier()); + sums[4] = vmulq_n_s32( + sums[4], Base::packed_side_block()->rank_one_update_multiplier()); + + int32x4_t old_sums[5]; + old_sums[0] = vld1q_s32(rank_one_update_ptr + 0); + old_sums[1] = vld1q_s32(rank_one_update_ptr + 4); + old_sums[2] = vld1q_s32(rank_one_update_ptr + 8); + old_sums[3] = vld1q_s32(rank_one_update_ptr + 12); + old_sums[4] = vld1q_s32(rank_one_update_ptr + 16); + + sums[0] = vaddq_s32(sums[0], old_sums[0]); + sums[1] = vaddq_s32(sums[1], old_sums[1]); + sums[2] = vaddq_s32(sums[2], old_sums[2]); + sums[3] = vaddq_s32(sums[3], old_sums[3]); + sums[4] = vaddq_s32(sums[4], old_sums[4]); + + vst1q_s32(rank_one_update_ptr + 0, sums[0]); + vst1q_s32(rank_one_update_ptr + 4, sums[1]); + vst1q_s32(rank_one_update_ptr + 8, sums[2]); + vst1q_s32(rank_one_update_ptr + 12, sums[3]); + vst1q_s32(rank_one_update_ptr + 16, sums[4]); + } + + // We are done handling groups of 8 levels of depth; there may be + // a leftover for which we use the generic path. + Base::packed_side_block()->seek_forward_n_cells( + SideFormat::kCells * AlignedDepth8 / SideFormat::kDepth); + Base::PackRun(start_width, width, start_depth + AlignedDepth8, + depth - AlignedDepth8); + } +}; + +// Specialization for 1 Cell of width 8, depth 4. +// This is the LHS format used by NEONKernel8x1Depth4. 
+typedef KernelSideFormat<CellFormat<8, 4>, 1> SideFormat1Cell8x4;
+template <typename LhsMapType>
+class PackSideBlockImpl<LhsMapType, SideFormat1Cell8x4>
+    : public PackSideBlockImplGeneric<LhsMapType, SideFormat1Cell8x4> {
+ public:
+  typedef SideFormat1Cell8x4 SideFormat;
+
+  typedef PackSideBlockImplGeneric<LhsMapType, SideFormat> Base;
+
+  PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block,
+                    const LhsMapType& src_map)
+      : Base(packed_side_block, src_map) {}
+
+ protected:
+  virtual void PackRun(int start_width, int width, int start_depth, int depth) {
+    // Fall back to generic path for packing too narrow runs.
+    if (width < SideFormat::kWidth) {
+      Base::PackRun(start_width, width, start_depth, depth);
+      return;
+    }
+
+    const std::uint8_t* src_ptr =
+        Base::src_map().data(start_width, start_depth);
+    const int stride = Base::src_map().stride();
+    assert(src_ptr + stride ==
+           Base::src_map().data(start_width + 1, start_depth));
+    assert(src_ptr + 1 == Base::src_map().data(start_width, start_depth + 1));
+
+    // Prefetch data.
+    for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
+      for (int i = 0; i < SideFormat::kWidth; i++) {
+        Prefetch(src_ptr + i * stride + d);
+      }
+    }
+
+    const int AlignedDepth8 = RoundDown<8>(depth);
+    if (AlignedDepth8) {
+      // Fast inner loop for handling multiples of 8 levels of depth
+      ScopedProfilingLabel label("optimized kernel");
+
+      std::int32_t* rank_one_update_ptr =
+          Base::packed_side_block()->rank_one_update() + start_width;
+
+      std::uint8_t* dst_ptr = Base::packed_side_block()->current_data();
+      const std::uint8_t* dst_end_ptr =
+          dst_ptr + SideFormat::kWidth * AlignedDepth8;
+
+      __attribute__((aligned(16))) std::int32_t sumsbuf[8];
+      asm volatile(
+          "mov r4, %[src_ptr]\n"
+          "mov r3, %[dst_ptr]\n"
+
+          // We will accumulate the rank_one_update sums in q10--q11.
+          "vmov.s32 q10, #0\n"
+          "vmov.s32 q11, q10\n"
+
+          // Main loop.
+          "loop_PackSideBlockImplNEON8x1x4_%=:\n"
+
+          // Load an 8x8 block into the 8 registers d0--d7.
+          // So each of these 8 registers contains 8 entries of
+          // one line, and there are 8 lines being processed.
+          "mov r0, r4\n"
+          "add r4, #8\n"
+          "vld1.8 d0, [r0], %[stride]\n"
+          "vld1.8 d1, [r0], %[stride]\n"
+          "vld1.8 d2, [r0], %[stride]\n"
+          "vld1.8 d3, [r0], %[stride]\n"
+          "vld1.8 d4, [r0], %[stride]\n"
+          "vld1.8 d5, [r0], %[stride]\n"
+          "vld1.8 d6, [r0], %[stride]\n"
+          "vld1.8 d7, [r0], %[stride]\n"
+
+          // The CellOrder is the opposite of the MapOrder here
+          // so we need to transpose the data in each cell.
+          // Fortunately in this case, since what we have to transpose
+          // is an 8x8 block of 8-bit values, we can do so in-place with
+          // vtrn instructions; no need for an auxiliary buffer here.
+          "vtrn.8 d0, d1\n"
+          "vtrn.8 d2, d3\n"
+          "vtrn.8 d4, d5\n"
+          "vtrn.8 d6, d7\n"
+          "vtrn.16 q0, q1\n"
+          "vtrn.16 q2, q3\n"
+          "vtrn.32 q0, q2\n"
+          "vtrn.32 q1, q3\n"
+
+          // Store the packed data to memory.
+          "vst1.8 {d0, d1, d2, d3}, [r3:64]!\n"
+          "vst1.8 {d4, d5, d6, d7}, [r3:64]!\n"
+
+          // Now we are done packing this 8x8 block. We still have
+          // to accumulate the rank-one-update sums for it.
+
+          // Add 8-bit values pair-wise into 16-bit values.
+          "vaddl.u8 q0, d0, d1\n"
+          "vaddl.u8 q1, d2, d3\n"
+          "vaddl.u8 q2, d4, d5\n"
+          "vaddl.u8 q3, d6, d7\n"
+
+          // Add these 16-bit values into the 32-bit accumulators q10--q11.
+ "vaddw.u16 q10, q10, d0\n" + "vaddw.u16 q11, q11, d1\n" + "vaddw.u16 q10, q10, d2\n" + "vaddw.u16 q11, q11, d3\n" + "vaddw.u16 q10, q10, d4\n" + "vaddw.u16 q11, q11, d5\n" + "vaddw.u16 q10, q10, d6\n" + "vaddw.u16 q11, q11, d7\n" + + // End of main loop. + "cmp r3, %[dst_end_ptr]\n" + "bne loop_PackSideBlockImplNEON8x1x4_%=\n" + + // Store our rank-one-update accumulator registers to the + // sums buffer. + "mov r0, %[sumsbuf]\n" + "vst1.32 {d20, d21}, [r0:128]!\n" + "vst1.32 {d22, d23}, [r0:128]!\n" + : // no outputs + : // inputs + [dst_ptr] "r"(dst_ptr), [src_ptr] "r"(src_ptr), + [dst_end_ptr] "r"(dst_end_ptr), [stride] "r"(stride), + [sumsbuf] "r"(sumsbuf) + : // clobbers + "cc", "memory", "r0", "r3", "r4", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + + // Accumulate the final rank_one_update vector. + int32x4_t sums[2]; + sums[0] = vld1q_s32(sumsbuf); + sums[1] = vld1q_s32(sumsbuf + 4); + + sums[0] = vmulq_n_s32( + sums[0], Base::packed_side_block()->rank_one_update_multiplier()); + sums[1] = vmulq_n_s32( + sums[1], Base::packed_side_block()->rank_one_update_multiplier()); + + int32x4_t old_sums[2]; + old_sums[0] = vld1q_s32(rank_one_update_ptr + 0); + old_sums[1] = vld1q_s32(rank_one_update_ptr + 4); + + sums[0] = vaddq_s32(sums[0], old_sums[0]); + sums[1] = vaddq_s32(sums[1], old_sums[1]); + + vst1q_s32(rank_one_update_ptr + 0, sums[0]); + vst1q_s32(rank_one_update_ptr + 4, sums[1]); + } + + // We are done handling groups of 8 levels of depth; there may be + // a leftover for which we use the generic path. + Base::packed_side_block()->seek_forward_n_cells( + SideFormat::kCells * AlignedDepth8 / SideFormat::kDepth); + Base::PackRun(start_width, width, start_depth + AlignedDepth8, + depth - AlignedDepth8); + } +}; + +// Partial specialization for 1 Cell of width 4, and any Depth. +// This is the RHS format used by NEONKernel12x4Depth2. +template <int Depth> +using SideFormat1Cell4xD = KernelSideFormat<CellFormat<4, Depth>, 1>; +template <typename RhsMapType, int Depth> +class PackSideBlockImpl<RhsMapType, SideFormat1Cell4xD<Depth>> + : public PackSideBlockImplGeneric<RhsMapType, SideFormat1Cell4xD<Depth>> { + public: + typedef SideFormat1Cell4xD<Depth> SideFormat; + + typedef PackSideBlockImplGeneric<RhsMapType, SideFormat> Base; + + PackSideBlockImpl(PackedSideBlock<SideFormat>* packed_side_block, + const RhsMapType& src_map) + : Base(packed_side_block, src_map) {} + + protected: + virtual void PackRun(int start_width, int width, int start_depth, int depth) { + ScopedProfilingLabel label("optimized kernel"); + + std::int32_t* rank_one_update_ptr = + Base::packed_side_block()->rank_one_update() + start_width; + const int AlignedDepth16 = RoundDown<16>(depth); + const std::uint8_t* src_line0ptr = + width < 1 ? nullptr : Base::src_map().data(start_width, start_depth); + const std::uint8_t* src_line1ptr = + width < 2 ? nullptr + : Base::src_map().data(start_width + 1, start_depth); + const std::uint8_t* src_line2ptr = + width < 3 ? nullptr + : Base::src_map().data(start_width + 2, start_depth); + const std::uint8_t* src_line3ptr = + width < 4 ? 
nullptr + : Base::src_map().data(start_width + 3, start_depth); + std::uint8_t* dst_ptr = Base::packed_side_block()->current_data(); + const std::uint8_t* dst_end_ptr = + dst_ptr + SideFormat::kWidth * AlignedDepth16; + int32x4x4_t local_col_sums; + local_col_sums.val[0] = vdupq_n_s32(0); + local_col_sums.val[1] = vdupq_n_s32(0); + local_col_sums.val[2] = vdupq_n_s32(0); + local_col_sums.val[3] = vdupq_n_s32(0); + + switch (width) { + case 4: + while (dst_ptr != dst_end_ptr) { + uint8x16x4_t src_regs; + src_regs.val[0] = vld1q_u8(src_line0ptr); + src_regs.val[1] = vld1q_u8(src_line1ptr); + src_regs.val[2] = vld1q_u8(src_line2ptr); + src_regs.val[3] = vld1q_u8(src_line3ptr); + uint16x8x4_t s; + s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]), + vget_high_u8(src_regs.val[0])); + s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]), + vget_high_u8(src_regs.val[1])); + s.val[2] = vaddl_u8(vget_low_u8(src_regs.val[2]), + vget_high_u8(src_regs.val[2])); + s.val[3] = vaddl_u8(vget_low_u8(src_regs.val[3]), + vget_high_u8(src_regs.val[3])); + local_col_sums.val[0] = + vaddq_s32(local_col_sums.val[0], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[0]), vget_high_u16(s.val[0])))); + local_col_sums.val[1] = + vaddq_s32(local_col_sums.val[1], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[1]), vget_high_u16(s.val[1])))); + local_col_sums.val[2] = + vaddq_s32(local_col_sums.val[2], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[2]), vget_high_u16(s.val[2])))); + local_col_sums.val[3] = + vaddq_s32(local_col_sums.val[3], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[3]), vget_high_u16(s.val[3])))); + vst4q_u8(dst_ptr, src_regs); + src_line0ptr += 16; + src_line1ptr += 16; + src_line2ptr += 16; + src_line3ptr += 16; + dst_ptr += SideFormat::kWidth * 16; + } + break; + case 3: + while (dst_ptr != dst_end_ptr) { + uint8x16x4_t src_regs; + src_regs.val[0] = vld1q_u8(src_line0ptr); + src_regs.val[1] = vld1q_u8(src_line1ptr); + src_regs.val[2] = vld1q_u8(src_line2ptr); + src_regs.val[3] = vdupq_n_u8(0); + uint16x8x3_t s; + s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]), + vget_high_u8(src_regs.val[0])); + s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]), + vget_high_u8(src_regs.val[1])); + s.val[2] = vaddl_u8(vget_low_u8(src_regs.val[2]), + vget_high_u8(src_regs.val[2])); + local_col_sums.val[0] = + vaddq_s32(local_col_sums.val[0], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[0]), vget_high_u16(s.val[0])))); + local_col_sums.val[1] = + vaddq_s32(local_col_sums.val[1], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[1]), vget_high_u16(s.val[1])))); + local_col_sums.val[2] = + vaddq_s32(local_col_sums.val[2], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[2]), vget_high_u16(s.val[2])))); + vst4q_u8(dst_ptr, src_regs); + src_line0ptr += 16; + src_line1ptr += 16; + src_line2ptr += 16; + dst_ptr += SideFormat::kWidth * 16; + } + break; + case 2: + while (dst_ptr != dst_end_ptr) { + uint8x16x4_t src_regs; + src_regs.val[0] = vld1q_u8(src_line0ptr); + src_regs.val[1] = vld1q_u8(src_line1ptr); + src_regs.val[2] = vdupq_n_u8(0); + src_regs.val[3] = vdupq_n_u8(0); + uint16x8x2_t s; + s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]), + vget_high_u8(src_regs.val[0])); + s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]), + vget_high_u8(src_regs.val[1])); + local_col_sums.val[0] = + vaddq_s32(local_col_sums.val[0], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[0]), vget_high_u16(s.val[0])))); + local_col_sums.val[1] = + vaddq_s32(local_col_sums.val[1], 
+ vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[1]), vget_high_u16(s.val[1])))); + vst4q_u8(dst_ptr, src_regs); + src_line0ptr += 16; + src_line1ptr += 16; + dst_ptr += SideFormat::kWidth * 16; + } + break; + case 1: + while (dst_ptr != dst_end_ptr) { + uint8x16x4_t src_regs; + src_regs.val[0] = vld1q_u8(src_line0ptr); + src_regs.val[1] = vdupq_n_u8(0); + src_regs.val[2] = vdupq_n_u8(0); + src_regs.val[3] = vdupq_n_u8(0); + uint16x8x2_t s; + s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]), + vget_high_u8(src_regs.val[0])); + local_col_sums.val[0] = + vaddq_s32(local_col_sums.val[0], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[0]), vget_high_u16(s.val[0])))); + vst4q_u8(dst_ptr, src_regs); + src_line0ptr += 16; + dst_ptr += SideFormat::kWidth * 16; + } + break; + default: + abort(); + } + + int extra = depth - AlignedDepth16; + + if (extra) { + __attribute__((aligned(16))) std::uint8_t line0extra[16]; + __attribute__((aligned(16))) std::uint8_t line1extra[16]; + __attribute__((aligned(16))) std::uint8_t line2extra[16]; + __attribute__((aligned(16))) std::uint8_t line3extra[16]; + __attribute__((aligned(16))) std::uint8_t dstextra[64]; + + for (int i = 0; i < extra; i++) { + line0extra[i] = width >= 1 ? src_line0ptr[i] : 0; + line1extra[i] = width >= 2 ? src_line1ptr[i] : 0; + line2extra[i] = width >= 3 ? src_line2ptr[i] : 0; + line3extra[i] = width >= 4 ? src_line3ptr[i] : 0; + } + for (int i = extra; i < 16; i++) { + line0extra[i] = 0; + line1extra[i] = 0; + line2extra[i] = 0; + line3extra[i] = 0; + } + src_line0ptr = line0extra; + src_line1ptr = line1extra; + src_line2ptr = line2extra; + src_line3ptr = line3extra; + + { + uint8x16x4_t src_regs; + src_regs.val[0] = vld1q_u8(src_line0ptr); + src_regs.val[1] = vld1q_u8(src_line1ptr); + src_regs.val[2] = vld1q_u8(src_line2ptr); + src_regs.val[3] = vld1q_u8(src_line3ptr); + uint16x8x4_t s; + s.val[0] = vaddl_u8(vget_low_u8(src_regs.val[0]), + vget_high_u8(src_regs.val[0])); + s.val[1] = vaddl_u8(vget_low_u8(src_regs.val[1]), + vget_high_u8(src_regs.val[1])); + s.val[2] = vaddl_u8(vget_low_u8(src_regs.val[2]), + vget_high_u8(src_regs.val[2])); + s.val[3] = vaddl_u8(vget_low_u8(src_regs.val[3]), + vget_high_u8(src_regs.val[3])); + local_col_sums.val[0] = + vaddq_s32(local_col_sums.val[0], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[0]), vget_high_u16(s.val[0])))); + local_col_sums.val[1] = + vaddq_s32(local_col_sums.val[1], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[1]), vget_high_u16(s.val[1])))); + local_col_sums.val[2] = + vaddq_s32(local_col_sums.val[2], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[2]), vget_high_u16(s.val[2])))); + local_col_sums.val[3] = + vaddq_s32(local_col_sums.val[3], + vreinterpretq_s32_u32(vaddl_u16( + vget_low_u16(s.val[3]), vget_high_u16(s.val[3])))); + vst4q_u8(dstextra, src_regs); + } + + for (int i = 0; i < 4 * extra; i++) { + dst_ptr[i] = dstextra[i]; + } + } + + // Accumulate the final rank_one_update vector. 
+ std::int32_t r[4]; + if (width >= 1) { + vst1q_s32(r, local_col_sums.val[0]); + rank_one_update_ptr[0] += + Base::packed_side_block()->rank_one_update_multiplier() * + (r[0] + r[1] + r[2] + r[3]); + } + if (width >= 2) { + vst1q_s32(r, local_col_sums.val[1]); + rank_one_update_ptr[1] += + Base::packed_side_block()->rank_one_update_multiplier() * + (r[0] + r[1] + r[2] + r[3]); + } + if (width >= 3) { + vst1q_s32(r, local_col_sums.val[2]); + rank_one_update_ptr[2] += + Base::packed_side_block()->rank_one_update_multiplier() * + (r[0] + r[1] + r[2] + r[3]); + } + if (width >= 4) { + vst1q_s32(r, local_col_sums.val[3]); + rank_one_update_ptr[3] += + Base::packed_side_block()->rank_one_update_multiplier() * + (r[0] + r[1] + r[2] + r[3]); + } + } +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_PACK_NEON_H_ diff --git a/internal/single_thread_gemm.h b/internal/single_thread_gemm.h new file mode 100644 index 0000000..0ec4201 --- /dev/null +++ b/internal/single_thread_gemm.h @@ -0,0 +1,103 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// single_thread_gemm.h: Single-threaded GEMM implementation. +// This is a good place to start reading code, as it shows the overall +// structure of a GEMM and is much simpler than multi_thread_gemm.h. 
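The NEON packing fast paths above all end the same way: per-line byte sums are gathered (in NEON registers or in a sums buffer), scaled by rank_one_update_multiplier(), and accumulated into the block's rank_one_update vector. Before moving on to the single-threaded driver, here is a plain scalar restatement of that accumulation, using the same pointer-plus-stride layout that the asserts at the top of each PackRun establish. The helper name and signature below are illustrative only and are not part of the library; the cell re-layout of the packed data is deliberately omitted.

#include <cstdint>

// Scalar equivalent of the rank-one-update accumulation performed by the
// assembly fast paths: for each of the 'width' source lines of the run,
// sum its 'depth' uint8 entries and add multiplier * sum into the
// corresponding rank_one_update slot.
inline void AccumulateRankOneUpdateSums(const std::uint8_t* src_ptr, int stride,
                                        int width, int depth,
                                        std::int32_t multiplier,
                                        std::int32_t* rank_one_update) {
  for (int w = 0; w < width; w++) {
    std::int32_t sum = 0;
    for (int d = 0; d < depth; d++) {
      sum += src_ptr[w * stride + d];  // one uint8 entry of line w
    }
    rank_one_update[w] += multiplier * sum;  // mirrors vmulq_n_s32 + vaddq_s32
  }
}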
+ +#ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ +#define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ + +#include <cassert> + +#include "public/map.h" +#include "internal/allocator.h" +#include "internal/pack.h" +#include "internal/unpack.h" +#include "internal/compute.h" +#include "internal/kernel.h" + +namespace gemmlowp { + +class SingleThreadGemmContext { + public: + Allocator* allocator() { return &allocator_; } + + protected: + Allocator allocator_; +}; + +template <typename KernelFormat, typename Scalar, MapOrder LhsOrder, + MapOrder RhsOrder, MapOrder ResultOrder> +void SingleThreadGemm(SingleThreadGemmContext* context, + const KernelBase& kernel, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + ScopedProfilingLabel label("gemmlowp::SingleThreadGemm"); + + assert(lhs.cols() == rhs.rows()); + + int rows = result->rows(); + int cols = result->cols(); + int depth = lhs.cols(); + + Allocator* allocator = context->allocator(); + + BlockParams block_params; + block_params.Init<KernelFormat>(rows, cols, depth, 1); + + PackedSideBlock<typename KernelFormat::Lhs> packed_lhs( + Side::Lhs, allocator, block_params, rhs_offset); + PackedSideBlock<typename KernelFormat::Rhs> packed_rhs( + Side::Rhs, allocator, block_params, lhs_offset); + + PackedResultInt32 packed_result(allocator, block_params); + + allocator->Commit(); + + const bool pack_rhs_once = block_params.l2_cols == cols; + + if (pack_rhs_once) { + PackRhs(&packed_rhs, rhs); + } + + for (int r = 0; r < rows; r += block_params.l2_rows) { + int rs = std::min(block_params.l2_rows, rows - r); + + PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth)); + + for (int c = 0; c < cols; c += block_params.l2_cols) { + int cs = std::min(block_params.l2_cols, cols - c); + + if (!pack_rhs_once) { + PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); + } + + Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs); + + auto result_block = result->block(r, c, rs, cs); + UnpackResult(&result_block, packed_result, packed_lhs, packed_rhs, depth, + result_offset, result_mult_int, result_shift); + } + } + + allocator->Decommit(); +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ diff --git a/internal/unpack.h b/internal/unpack.h new file mode 100644 index 0000000..55ca204 --- /dev/null +++ b/internal/unpack.h @@ -0,0 +1,100 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// unpack.h: unpacking the result blocks computed by compute.h, +// storing them into the destination matrix. 
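The unpack stage that follows is where the offsets come back together with the rank-one updates gathered during packing. Note that SingleThreadGemm above passes rhs_offset when constructing packed_lhs and lhs_offset when constructing packed_rhs; the reason is the expansion of the offset product, written out here as a reader's aid (this derivation is not part of the original sources):

    sum_d (lhs(r,d) + lhs_offset) * (rhs(d,c) + rhs_offset)
        =   sum_d lhs(r,d) * rhs(d,c)         // the kernel's raw int32 accumulator
          + rhs_offset * sum_d lhs(r,d)       // packed_lhs.rank_one_update()[r]
          + lhs_offset * sum_d rhs(d,c)       // packed_rhs.rank_one_update()[c]
          + lhs_offset * rhs_offset * depth   // rank0update in unpack.h

The final uint8 destination entry is then

    clamp to [0, 255] of
        ((above + result_offset) * result_mult_int + (1 << (result_shift - 1)))
            >> result_shift

which is exactly the computation carried out per entry in UnpackResultImpl below.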
+ +#ifndef GEMMLOWP_INTERNAL_UNPACK_H_ +#define GEMMLOWP_INTERNAL_UNPACK_H_ + +#include "internal/allocator.h" +#include "internal/block_params.h" +#include "internal/pack.h" + +#ifdef GEMMLOWP_NEON +#include "internal/unpack_neon.h" +#endif + +namespace gemmlowp { + +class PackedResultInt32 { + Allocator* allocator_; + Allocator::Handle matrix_handle_; + const BlockParams& block_params_; + + public: + PackedResultInt32(Allocator* _allocator, const BlockParams& _block_params) + : allocator_(_allocator), block_params_(_block_params) { + matrix_handle_ = allocator_->Reserve<std::int32_t>(block_params_.l2_rows * + block_params_.l2_cols); + } + + ~PackedResultInt32() {} + + MatrixMap<std::int32_t, MapOrder::ColMajor> Map() { + return MatrixMap<std::int32_t, MapOrder::ColMajor>( + allocator_->GetPointer<std::int32_t>(matrix_handle_), + block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); + } + + MatrixMap<const std::int32_t, MapOrder::ColMajor> Map() const { + return MatrixMap<const std::int32_t, MapOrder::ColMajor>( + allocator_->GetPointer<const std::int32_t>(matrix_handle_), + block_params_.l2_rows, block_params_.l2_cols, block_params_.l2_rows); + } +}; + +template <typename ResultBlockType, typename PackedResult, + typename KernelLhsFormat, typename KernelRhsFormat> +void UnpackResultImpl(ResultBlockType* dst, const PackedResult& src, + const PackedSideBlock<KernelLhsFormat>& packed_lhs, + const PackedSideBlock<KernelRhsFormat>& packed_rhs, + int depth, std::int32_t result_offset, + std::int32_t result_mult_int, std::int32_t result_shift) { + std::int32_t rank0update = packed_lhs.rank_one_update_multiplier() * + packed_rhs.rank_one_update_multiplier() * depth; + // No top-level blocking in the depth dimension at the moment. + // Too much loss of precision. + for (int c = 0; c < dst->cols(); c++) { + for (int r = 0; r < dst->rows(); r++) { + std::int32_t q = *src.data(r, c); + q += packed_lhs.rank_one_update()[r] + packed_rhs.rank_one_update()[c] + + rank0update; + q = ((q + result_offset) * result_mult_int + (1 << (result_shift - 1))) >> + result_shift; + (*dst)(r, c) = q > 255 ? 255 : q < 0 ? 0 : q; + } + } +} + +template <typename ResultBlockType, typename PackedResult, + typename KernelLhsFormat, typename KernelRhsFormat> +void UnpackResult(ResultBlockType* dst, const PackedResult& src, + const PackedSideBlock<KernelLhsFormat>& packed_lhs, + const PackedSideBlock<KernelRhsFormat>& packed_rhs, int depth, + std::int32_t result_offset, std::int32_t result_mult_int, + std::int32_t result_shift) { + ScopedProfilingLabel label("unpack"); +#ifdef GEMMLOWP_NEON + UnpackResultImplNEON(dst, src.Map(), packed_lhs, packed_rhs, depth, + result_offset, result_mult_int, result_shift); +#else + UnpackResultImpl(dst, src.Map(), packed_lhs, packed_rhs, depth, result_offset, + result_mult_int, result_shift); +#endif +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_UNPACK_H_ diff --git a/internal/unpack_neon.h b/internal/unpack_neon.h new file mode 100644 index 0000000..17bfe78 --- /dev/null +++ b/internal/unpack_neon.h @@ -0,0 +1,170 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// unpack_neon.h: optimized NEON specializations of the templates in unpack.h. + +#ifndef GEMMLOWP_INTERNAL_UNPACK_NEON_H_ +#define GEMMLOWP_INTERNAL_UNPACK_NEON_H_ + +#include "internal/unpack.h" + +#include <arm_neon.h> + +namespace gemmlowp { + +template <typename ResultBlockType, typename PackedResult, + typename KernelLhsFormat, typename KernelRhsFormat> +void UnpackResultImplNEON(ResultBlockType* dst, const PackedResult& src, + const PackedSideBlock<KernelLhsFormat>& packed_lhs, + const PackedSideBlock<KernelRhsFormat>& packed_rhs, + int depth, std::int32_t result_offset, + std::int32_t result_mult_int, + std::int32_t result_shift) { + ScopedProfilingLabel label("optimized kernel"); + + std::int32_t rank0update = packed_lhs.rank_one_update_multiplier() * + packed_rhs.rank_one_update_multiplier() * depth; + std::int32_t preshift_offset = 1 << (result_shift - 1); + int32x4_t shift_reg = vdupq_n_s32(-result_shift); + for (int c = 0; c < dst->cols(); c++) { + std::uint8_t* dst_ptr = dst->data(0, c); + const std::int32_t* src_ptr = src.data(0, c); + const std::int32_t* rank_one_update_ptr = packed_lhs.rank_one_update(); + std::int32_t rank1update = packed_rhs.rank_one_update()[c]; + std::int32_t constant_offset = rank1update + rank0update + result_offset; + + int dst_rows_aligned4 = RoundDown<4>(dst->rows()); + int dst_rows_aligned16 = RoundDown<16>(dst->rows()); + + if (dst_rows_aligned16) { + std::uint8_t* dst_end_ptr = dst_ptr + dst_rows_aligned16; + + asm volatile( + "vdup.32 q12, %[constant_offset]\n" + "vdup.32 q13, %[preshift_offset]\n" + "neg r3, %[result_shift]\n" + "vdup.32 q14, r3\n" + "vdup.32 q15, %[result_mult_int]\n" + + "loop_UnpackResultImplNEON_%=:\n" + + // Load a 16x1 block of the packed result matrix + // (so 16 contiguous entries in one column). + "vld1.32 {d0, d1, d2, d3}, [%[src_ptr]]!\n" + "vld1.32 {d4, d5, d6, d7}, [%[src_ptr]]!\n" + + // Load entries the LHS rank one update vector. + "vld1.32 {d8, d9, d10, d11}, " + "[%[rank_one_update_ptr]:256]!\n" + "vld1.32 {d12, d13, d14, d15}, " + "[%[rank_one_update_ptr]:256]!\n" + + // Apply the LHS rank one update. + "vadd.s32 q0, q0, q4\n" + "vadd.s32 q1, q1, q5\n" + "vadd.s32 q2, q2, q6\n" + "vadd.s32 q3, q3, q7\n" + + // Add the constant offset + // (which includes the RHS rank one update, see above). + "vadd.s32 q0, q0, q12\n" + "vadd.s32 q1, q1, q12\n" + "vadd.s32 q2, q2, q12\n" + "vadd.s32 q3, q3, q12\n" + + // Multiply by the result multiplier + "vmul.s32 q0, q0, q15\n" + "vmul.s32 q1, q1, q15\n" + "vmul.s32 q2, q2, q15\n" + "vmul.s32 q3, q3, q15\n" + + // Add the pre-shift offset (so that the shift is rounding) + "vadd.s32 q0, q0, q13\n" + "vadd.s32 q1, q1, q13\n" + "vadd.s32 q2, q2, q13\n" + "vadd.s32 q3, q3, q13\n" + + // Shift right (shift left by negative offset). + "vshl.s32 q0, q0, q14\n" + "vshl.s32 q1, q1, q14\n" + "vshl.s32 q2, q2, q14\n" + "vshl.s32 q3, q3, q14\n" + + // So far we had signed 32bit values; now we cast them down + // to unsigned 8bit, saturating. 
+ "vqmovn.s32 d8, q0\n" + "vqmovn.s32 d9, q1\n" + "vqmovn.s32 d10, q2\n" + "vqmovn.s32 d11, q3\n" + "vqmovun.s16 d0, q4\n" + "vqmovun.s16 d1, q5\n" + + // Store result into the destination matrix. + "vst1.8 {d0, d1}, [%[dst_ptr]]!\n" + + // End of the loop. + "cmp %[dst_ptr], %[dst_end_ptr]\n" + "bne loop_UnpackResultImplNEON_%=\n" + + : // outputs + [dst_ptr] "+r"(dst_ptr), [src_ptr] "+r"(src_ptr), + [rank_one_update_ptr] "+r"(rank_one_update_ptr) + : // inputs + [dst_end_ptr] "r"(dst_end_ptr), + [constant_offset] "r"(constant_offset), + [result_mult_int] "r"(result_mult_int), + [preshift_offset] "r"(preshift_offset), + [result_shift] "r"(result_shift) + : // clobbers + "cc", "memory", "r3", + // note: someone on internet says that quad registers are + // unsupported in the clobber list! + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10", + "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", + "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", + "d31"); + } + + // We have finished handling groups of 16 entries at once; now + // try to handle 4 entries at once. + for (int r = dst_rows_aligned16; r < dst_rows_aligned4; r += 4) { + int32x4_t q = vld1q_s32(src_ptr); + q = vaddq_s32(q, vld1q_s32(rank_one_update_ptr)); + q = vaddq_s32(q, vdupq_n_s32(constant_offset)); + q = vmulq_n_s32(q, result_mult_int); + q = vaddq_s32(q, vdupq_n_s32(preshift_offset)); + q = vshlq_s32(q, shift_reg); + int16x4_t q_16 = vqmovn_s32(q); + uint8x8_t q_8 = vqmovun_s16(vcombine_s16(q_16, q_16)); + vst1_lane_u32(reinterpret_cast<std::uint32_t*>(dst_ptr), + vreinterpret_u32_u8(q_8), 0); + dst_ptr += 4; + src_ptr += 4; + rank_one_update_ptr += 4; + } + // We have finished handling 4 entries at once; now handle + // remaining entries one by one. + for (int r = dst_rows_aligned4; r < dst->rows(); r++) { + std::int32_t q = src(r, c); + q += packed_lhs.rank_one_update()[r] + rank1update + rank0update; + q = ((q + result_offset) * result_mult_int + (1 << (result_shift - 1))) >> + result_shift; + (*dst)(r, c) = q > 255 ? 255 : q < 0 ? 0 : q; + } + } +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_INTERNAL_UNPACK_NEON_H_ diff --git a/profiling/instrumentation.h b/profiling/instrumentation.h new file mode 100644 index 0000000..b1592c8 --- /dev/null +++ b/profiling/instrumentation.h @@ -0,0 +1,217 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// instrumentation.h: contains the definitions needed to +// instrument code for profiling: +// ScopedProfilingLabel, RegisterCurrentThreadForProfiling. +// +// profiler.h is only needed to drive the profiler: +// StartProfiling, FinishProfiling. +// +// See the usage example in profiler.h. 
+ +#ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_ +#define GEMMLOWP_PROFILING_INSTRUMENTATION_H_ + +#include <pthread.h> +#include <cstdint> +#include <cassert> +#include <cstdlib> +#include <algorithm> + +#ifdef GEMMLOWP_PROFILING +#include <set> +#include <cstdio> +#include <cstring> +#endif + +// We should always use C++11 thread_local; unfortunately that +// isn't fully supported on Apple yet. +#ifdef __APPLE__ +#define GEMMLOWP_THREAD_LOCAL static __thread +#else +#define GEMMLOWP_THREAD_LOCAL thread_local +#endif + +namespace gemmlowp { + +inline void ReleaseBuildAssertion(bool condition, const char* msg) { + if (!condition) { + fprintf(stderr, "gemmlowp error: %s\n", msg); + abort(); + } +} + +// To be used as template parameter for GlobalLock. +// GlobalLock<ProfilerLockId> is the profiler global lock: +// registering threads, starting profiling, finishing profiling, and +// the profiler itself as it samples threads, all need to lock it. +struct ProfilerLockId; + +// A very plain global lock. Templated in LockId so we can have multiple +// locks, one for each LockId type. +template <typename LockId> +class GlobalLock { + static pthread_mutex_t* Mutex() { + static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; + return &m; + } + + public: + static void Lock() { pthread_mutex_lock(Mutex()); } + static void Unlock() { pthread_mutex_unlock(Mutex()); } +}; + +// A very simple RAII helper to lock and unlock a GlobalLock +template <typename LockId> +struct AutoGlobalLock { + AutoGlobalLock() { GlobalLock<LockId>::Lock(); } + ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); } +}; + +// MemoryBarrier is purely a compile-time thing; it tells two things +// to the compiler: +// 1) It prevents reordering code across it +// (thanks to the 'volatile' after 'asm') +// 2) It requires the compiler to assume that any value previously +// read from memory, may have changed. Thus it offers an alternative +// to using 'volatile' variables. +inline void MemoryBarrier() { asm volatile("" ::: "memory"); } + +// Profiling definitions. Two paths: when profiling is enabled, +// and when profiling is disabled. +#ifdef GEMMLOWP_PROFILING +// This code path is when profiling is enabled. + +// A pseudo-call-stack. Contrary to a real call-stack, this only +// contains pointers to literal strings that were manually entered +// in the instrumented code (see ScopedProfilingLabel). 
+struct ProfilingStack { + static const std::size_t kMaxSize = 15; + typedef const char* LabelsArrayType[kMaxSize]; + LabelsArrayType labels; + std::size_t size; + + ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); } + + void Push(const char* label) { + MemoryBarrier(); + ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow"); + labels[size] = label; + MemoryBarrier(); + size++; + MemoryBarrier(); + } + + void Pop() { + MemoryBarrier(); + ReleaseBuildAssertion(size > 0, "ProfilingStack underflow"); + size--; + MemoryBarrier(); + } + + void UpdateTop(const char* new_label) { + MemoryBarrier(); + assert(size); + labels[size - 1] = new_label; + MemoryBarrier(); + } + + ProfilingStack& operator=(const ProfilingStack& other) { + memcpy(this, &other, sizeof(ProfilingStack)); + return *this; + } + + bool operator==(const ProfilingStack& other) const { + return !memcmp(this, &other, sizeof(ProfilingStack)); + } +}; + +static_assert( + !(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)), + "ProfilingStack should have power-of-two size to fit in cache lines"); + +struct ThreadInfo; + +// The global set of threads being profiled. +inline std::set<ThreadInfo*>& ThreadsUnderProfiling() { + static std::set<ThreadInfo*> v; + return v; +} + +struct ThreadInfo { + pthread_key_t key; // used only to get a callback at thread exit. + ProfilingStack stack; + + ThreadInfo() { + pthread_key_create(&key, ThreadExitCallback); + pthread_setspecific(key, this); + } + + static void ThreadExitCallback(void* ptr) { + AutoGlobalLock<ProfilerLockId> lock; + ThreadInfo* self = static_cast<ThreadInfo*>(ptr); + ThreadsUnderProfiling().erase(self); + pthread_key_delete(self->key); + } +}; + +inline ThreadInfo& ThreadLocalThreadInfo() { + GEMMLOWP_THREAD_LOCAL ThreadInfo i; + return i; +} + +// ScopedProfilingLabel is how one instruments code for profiling +// with this profiler. Construct local ScopedProfilingLabel variables, +// passing a literal string describing the local code. Profile +// samples will then be annotated with this label, while it is in scope +// (whence the name --- also known as RAII). +// See the example in profiler.h. +class ScopedProfilingLabel { + ProfilingStack* profiling_stack_; + + public: + explicit ScopedProfilingLabel(const char* label) + : profiling_stack_(&ThreadLocalThreadInfo().stack) { + profiling_stack_->Push(label); + } + + ~ScopedProfilingLabel() { profiling_stack_->Pop(); } + + void Update(const char* new_label) { profiling_stack_->UpdateTop(new_label); } +}; + +// To be called once on each thread to be profiled. +inline void RegisterCurrentThreadForProfiling() { + AutoGlobalLock<ProfilerLockId> lock; + ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo()); +} + +#else // not GEMMLOWP_PROFILING +// This code path is when profiling is disabled. + +// This empty definition of ScopedProfilingLabel ensures that +// it has zero runtime overhead when profiling is disabled. +struct ScopedProfilingLabel { + explicit ScopedProfilingLabel(const char*) {} + void Update(const char*) {} +}; + +inline void RegisterCurrentThreadForProfiling() {} + +#endif + +} // end namespace gemmlowp + +#endif // GEMMLOWP_PROFILING_INSTRUMENTATION_H_ diff --git a/profiling/profiler.h b/profiling/profiler.h new file mode 100644 index 0000000..9ea7a9f --- /dev/null +++ b/profiling/profiler.h @@ -0,0 +1,373 @@ +// Copyright 2014 Google Inc. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// profiler.h: a simple sampling profiler that's always just one #include away!
+//
+// Overview
+// ========
+//
+// This profiler only samples a pseudo-stack, not the actual call stack.
+// The code to be profiled needs to be instrumented with
+// pseudo-stack "labels", see ScopedProfilingLabel.
+// Using pseudo-stacks allows this profiler to be very simple, low-overhead,
+// portable, and independent of compilation details such as function inlining
+// and frame pointers. The granularity of instrumentation can be freely chosen,
+// and it is possible to get some annotate-like detail, i.e. detail within one
+// function without splitting it into multiple functions.
+//
+// This profiler should remain small and simple; its key feature is to fit in
+// a single header file so that there should never be a reason to refrain
+// from profiling. More complex and feature-rich alternatives are
+// readily available; for example, https://github.com/bgirard/GeckoProfiler
+// offers a strict superset of this profiler's functionality, including
+// intertwining pseudo-stacks with real call stacks, more annotation options,
+// and advanced visualization.
+//
+// Usage
+// =====
+//
+// 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is
+// not enabled, the profiling instrumentation from instrumentation.h
+// (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
+// is still defined but does nothing. On the other hand,
+// when profiling is not enabled, it is an error to #include the
+// present file.
+//
+// 1. Each thread can opt in to profiling by calling
+// RegisterCurrentThreadForProfiling() defined in instrumentation.h.
+// This can be done at any time, before or during profiling.
+// No sample will be collected from a thread until
+// it has called RegisterCurrentThreadForProfiling().
+//
+// 2. Instrument your code to be profiled with ScopedProfilingLabel,
+// which is a RAII helper defined in instrumentation.h. The identifier
+// names (some_label, etc) do not matter; what will show up
+// in the profile is the string passed to the constructor, which
+// must be a literal string. See the full example below.
+//
+// Note: the overhead of ScopedProfilingLabel is zero when not
+// enabling profiling (when not defining GEMMLOWP_PROFILING).
+//
+// 3. Use the profiler.h interface to control profiling. There are two
+// functions: StartProfiling() and FinishProfiling(). They must be
+// called on the same thread. FinishProfiling() prints the profile
+// on stdout.
+// +// Full example +// ============ +/* + #define GEMMLOWP_PROFILING + #include "profiling/instrumentation.h" + using namespace gemmlowp; + + const int iters = 100000000; + volatile int i; + + void Bar() { + ScopedProfilingLabel label("Bar"); + for (i = 0; i < iters; i++) {} + } + + void Foo() { + ScopedProfilingLabel label("Foo"); + for (i = 0; i < iters; i++) {} + Bar(); + } + + void Init() { + RegisterCurrentThreadForProfiling(); + } + + #include "profiling/profiler.h" + + int main() { + Init(); + StartProfiling(); + Foo(); + FinishProfiling(); + } +* +* Output: +* + gemmlowp profile (1 threads, 304 samples) + 100.00% Foo + 51.32% other + 48.68% Bar + 0.00% other (outside of any label) +*/ +// +// Interpreting results +// ==================== +// +// Each node shows the absolute percentage, among all the samples, +// of the number of samples that recorded the given pseudo-stack. +// The percentages are *NOT* relative to the parent node. In addition +// to your own labels, you will also see 'other' nodes that collect +// the remainder of samples under the parent node that didn't fall into +// any of the labelled child nodes. Example: +// +// 20% Foo +// 12% Bar +// 6% Xyz +// 2% other +// +// This means that 20% of all labels were under Foo, of which 12%/20%==60% +// were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not +// under either Bar or Xyz. +// +// Typically, one wants to keep adding ScopedProfilingLabel's until +// the 'other' nodes show low percentages. +// +// Interpreting results with multiple threads +// ========================================== +// +// At each sample, each thread registered for profiling gets sampled once. +// So if there is one "main thread" spending its time in MainFunc() and +// 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the +// samples will be in WorkerFunc, so the profile will look like this: +// +// 80% WorkerFunc +// 20% MainFunc + +#ifndef GEMMLOWP_PROFILING_PROFILER_H_ +#define GEMMLOWP_PROFILING_PROFILER_H_ + +#ifndef GEMMLOWP_PROFILING +#error Profiling is not enabled! +#endif + +#include <vector> + +#include "profiling/instrumentation.h" + +namespace gemmlowp { + +// A tree view of a profile. 
+class ProfileTreeView { + struct Node { + std::vector<Node*> children; + const char* label; + std::size_t weight; + Node() : label(nullptr), weight(0) {} + ~Node() { + for (auto child : children) { + delete child; + } + } + }; + + static bool CompareNodes(Node* n1, Node* n2) { + return n1->weight > n2->weight; + } + + Node root_; + + void PrintNode(const Node* node, int level) const { + if (level) { + for (int i = 1; i < level; i++) { + printf(" "); + } + printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label); + } + for (auto child : node->children) { + PrintNode(child, level + 1); + } + } + + static void AddStackToNode(const ProfilingStack& stack, Node* node, + std::size_t level) { + node->weight++; + if (stack.size == level) { + return; + } + Node* child_to_add_to = nullptr; + for (auto child : node->children) { + if (child->label == stack.labels[level]) { + child_to_add_to = child; + break; + } + } + if (!child_to_add_to) { + child_to_add_to = new Node; + child_to_add_to->label = stack.labels[level]; + node->children.push_back(child_to_add_to); + } + AddStackToNode(stack, child_to_add_to, level + 1); + return; + } + + void AddStack(const ProfilingStack& stack) { + AddStackToNode(stack, &root_, 0); + } + + void AddOtherChildrenToNode(Node* node) { + std::size_t top_level_children_weight = 0; + for (auto c : node->children) { + AddOtherChildrenToNode(c); + top_level_children_weight += c->weight; + } + if (top_level_children_weight) { + Node* other_child = new Node; + other_child->label = + node == &root_ ? "other (outside of any label)" : "other"; + other_child->weight = node->weight - top_level_children_weight; + node->children.push_back(other_child); + } + } + + void AddOtherNodes() { AddOtherChildrenToNode(&root_); } + + void SortNode(Node* node) { + std::sort(node->children.begin(), node->children.end(), CompareNodes); + for (auto child : node->children) { + SortNode(child); + } + } + + void Sort() { SortNode(&root_); } + + public: + explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) { + for (auto stack : stacks) { + AddStack(stack); + } + AddOtherNodes(); + Sort(); + } + + void Print() const { + printf("\n"); + printf("gemmlowp profile (%d threads, %d samples)\n", + static_cast<int>(ThreadsUnderProfiling().size()), + static_cast<int>(root_.weight)); + PrintNode(&root_, 0); + printf("\n"); + } +}; + +// This function is the only place that determines our sampling frequency. +inline void WaitOneProfilerTick() { + static const int millisecond = 1000000; + +#if defined __arm__ || defined __aarch64__ + // Reduced sampling frequency on mobile devices helps limit time and memory + // overhead there. + static const int interval = 10 * millisecond; +#else + static const int interval = 1 * millisecond; +#endif + + timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = interval; + nanosleep(&ts, nullptr); +} + +// This is how we track whether we've already started profiling, +// to guard against misuse of the API. +inline bool& IsProfiling() { + static bool b; + return b; +} + +// This is how we tell the profiler thread to finish. +inline bool& ProfilerThreadShouldFinish() { + static bool b; + return b; +} + +// The profiler thread. See ProfilerThreadFunc. +inline pthread_t& ProfilerThread() { + static pthread_t t; + return t; +} + +// Records a stack from a running thread. +// The tricky part is that we're not interrupting the thread. 
+// This is OK because we're looking at a pseudo-stack of labels, +// not at the real thread stack, and if the pseudo-stack changes +// while we're recording it, we are OK with getting either the +// old or the new stack. Note that ProfilingStack::Pop +// only decrements the size, and doesn't null the popped label, +// so if we're concurrently recording it, it shouldn't change +// under our feet until another label is pushed, at which point +// we are OK with getting either this new label or the old one. +// In the end, the key atomicity property that we are relying on +// here is that pointers are changed atomically, and the labels +// are pointers (to literal strings). +inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) { + assert(!dst->size); + while (dst->size < thread->stack.size) { + dst->labels[dst->size] = thread->stack.labels[dst->size]; + dst->size++; + MemoryBarrier(); // thread->stack can change at any time + } +} + +// The profiler thread's entry point. +// Note that a separate thread is to be started each time we call +// StartProfiling(), and finishes when we call FinishProfiling(). +// So here we only need to handle the recording and reporting of +// a single profile. +inline void* ProfilerThreadFunc(void*) { + assert(ProfilerThread() == pthread_self()); + + // Since we only handle one profile per profiler thread, the + // profile data (the array of recorded stacks) can be a local variable here. + std::vector<ProfilingStack> stacks; + + while (!ProfilerThreadShouldFinish()) { + WaitOneProfilerTick(); + { + AutoGlobalLock<ProfilerLockId> lock; + for (auto t : ThreadsUnderProfiling()) { + ProfilingStack s; + RecordStack(t, &s); + stacks.push_back(s); + } + } + } + + // Profiling is finished and we now report the results. + ProfileTreeView(stacks).Print(); + + return nullptr; +} + +// Starts recording samples. +inline void StartProfiling() { + AutoGlobalLock<ProfilerLockId> lock; + ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!"); + IsProfiling() = true; + ProfilerThreadShouldFinish() = false; + pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr); +} + +// Stops recording samples, and prints a profile tree-view on stdout. +inline void FinishProfiling() { + { + AutoGlobalLock<ProfilerLockId> lock; + ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!"); + // The ProfilerThreadShouldFinish() mechanism here is really naive and bad, + // as the scary comments below should make clear. + // Should we use a condition variable? + ProfilerThreadShouldFinish() = true; + } // must release the lock here to avoid deadlock with profiler thread. + pthread_join(ProfilerThread(), nullptr); + IsProfiling() = false; // yikes, this should be guarded by the lock! +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_PROFILING_PROFILER_H_ diff --git a/public/gemmlowp.h b/public/gemmlowp.h new file mode 100644 index 0000000..7b0f35e --- /dev/null +++ b/public/gemmlowp.h @@ -0,0 +1,52 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// gemmlowp.h: the main public interface header of gemmlowp. + +#ifndef GEMMLOWP_PUBLIC_GEMMLOWP_H_ +#define GEMMLOWP_PUBLIC_GEMMLOWP_H_ + +#include "public/map.h" +#include "internal/multi_thread_gemm.h" +#include "internal/kernel_default.h" + +namespace gemmlowp { + +class GemmContext : public MultiThreadGemmContext {}; + +// Computes a general matrix product ("GEMM"). +// The meaning of the offsets, result_mult_int and result_shift +// parameters is the same as in the standard EightBitIntGemm interface +// (which is also implemented in the eight_bit_int_gemm directory). +template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder, + MapOrder ResultOrder> +void Gemm(GemmContext* context, const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + if (rhs.cols() > DefaultKernelForGEMM::Format::kCols / 2) { + MultiThreadGemm<DefaultKernelForGEMM::Format>( + context, DefaultKernelForGEMM(), lhs, rhs, result, lhs_offset, + rhs_offset, result_offset, result_mult_int, result_shift); + } else { + MultiThreadGemm<DefaultKernelForGEMV::Format>( + context, DefaultKernelForGEMV(), lhs, rhs, result, lhs_offset, + rhs_offset, result_offset, result_mult_int, result_shift); + } +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_PUBLIC_GEMMLOWP_H_ diff --git a/public/map.h b/public/map.h new file mode 100644 index 0000000..ba92c08 --- /dev/null +++ b/public/map.h @@ -0,0 +1,77 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// map.h: a minimalist view-existing-buffer-as-a-matrix class, +// which is how gemmlowp interfaces with external matrix data. + +#ifndef GEMMLOWP_PUBLIC_MAP_H_ +#define GEMMLOWP_PUBLIC_MAP_H_ + +#include "internal/common.h" + +namespace gemmlowp { + +// The two storage orders allowed to map buffers as matrices: ColMajor +// means column-major, RowMajor means row-major. +enum class MapOrder { ColMajor, RowMajor }; + +// A MatrixMap is a view of an existing buffer as a matrix. It does not own +// the buffer. +template <typename tScalar, MapOrder tOrder> +class MatrixMap { + public: + typedef tScalar Scalar; + static const MapOrder kOrder = tOrder; + + protected: + Scalar* data_; // not owned. + int rows_, cols_, stride_; + + public: + MatrixMap(Scalar* data, int rows, int cols, int stride) + : data_(data), rows_(rows), cols_(cols), stride_(stride) {} + + MatrixMap(const MatrixMap& other) + : data_(other.data_), + rows_(other.rows_), + cols_(other.cols_), + stride_(other.stride_) {} + + int rows() const { return rows_; } + int cols() const { return cols_; } + int stride() const { return stride_; } + int rows_stride() const { return kOrder == MapOrder::ColMajor ? 
1 : stride_; } + int cols_stride() const { return kOrder == MapOrder::RowMajor ? 1 : stride_; } + Scalar* data() const { return data_; } + Scalar* data(int row, int col) const { + return data_ + row * rows_stride() + col * cols_stride(); + } + Scalar operator()(int row, int col) const { return *data(row, col); } + Scalar& operator()(int row, int col) { return *data(row, col); } + + MatrixMap block(int start_row, int start_col, int block_rows, + int block_cols) const { + assert(start_row >= 0); + assert(start_row + block_rows <= rows_); + assert(start_col >= 0); + assert(start_col + block_cols <= cols_); + + return MatrixMap(data(start_row, start_col), block_rows, block_cols, + stride_); + } +}; + +} // namespace gemmlowp + +#endif // GEMMLOWP_PUBLIC_MAP_H_ diff --git a/scripts/prepare-device-for-benchmarking.sh b/scripts/prepare-device-for-benchmarking.sh new file mode 100755 index 0000000..16ac2eb --- /dev/null +++ b/scripts/prepare-device-for-benchmarking.sh @@ -0,0 +1,138 @@ +# Copyright 2014 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Puts device in a special state giving optimal benchmark results. +# Not very realistic wrt real-world conditions. We are more interested +# in performance on devices in default state. + +#!/bin/bash + +echo "disabling mpdecision..." +adb root +adb remount +adb shell "mv /system/bin/mpdecision /system/bin/mpdecision-dontfind" > /dev/null + +echo "rebooting device..." +adb reboot +adb wait-for-device + +echo "restarting adbd as root..." +isroot=0 +while [ $isroot -eq 0 ] +do + adb root > /dev/null + if [ $? -eq 0 ] + then + isroot=1 + fi + echo " retrying in 1 s..." + sleep 1 +done + +echo "querying ro.hardware..." +hardware="`adb shell getprop ro.hardware`" +while [ "$hardware" == "" ] +do + echo "retrying in 1 s..." + sleep 1 + hardware="`adb shell getprop ro.hardware`" +done + +echo "got ro.hardware=$hardware" + +shouldstopui=0 +if [[ "$#" =~ .*--stop-ui.* ]] +then + shouldstopui=1 +else + if [[ "$hardware" =~ sprout.* ]] + then + echo "detected Android One (sprout). Will default to leaving the UI on." + else + echo "Will default to stopping the UI." + shouldstopui=1 + fi +fi + +if [ $shouldstopui -ne 0 ] +then + echo "stopping the UI..." + isshellstopped=0 + while [ $isshellstopped -eq 0 ] + do + adb shell stop > /dev/null + if [ $? -eq 0 ] + then + isshellstopped=1 + fi + echo " retrying in 1 s..." + sleep 1 + done +fi + +waitsec=10 +echo "sleeping $waitsec s before changing CPU settings, to work around a race with Android startup..." +sleep $waitsec + +echo "bringing all cores online..." +for cpu in `seq 0 3` +do + file="/sys/devices/system/cpu/cpu$cpu/online" + if [ -e $file ] + then + echo " cpu $cpu" + echo "echo 1 > $file; exit" | adb shell > /dev/null + if [ $? -ne 0 ] + then + echo "WARNING: failed to bring cpu $cpu online ($file)" + fi + fi +done + +echo "setting performance governor..." 
+for cpu in `seq 0 3` +do + file="/sys/devices/system/cpu/cpu$cpu/cpufreq/scaling_governor" + if [ -e $file ] + then + echo " cpu $cpu" + adb shell "echo performance > $file" > /dev/null + if [ $? -ne 0 ] + then + echo "WARNING: failed to set cpufreq governor ($file)" + fi + fi +done + +cpuloadlowsecs=0 +echo "waiting for CPU load to settle down..." +while [ $cpuloadlowsecs -lt 5 ] +do + cpuload="`adb shell top -n 1 -d 1 -s cpu | awk '{sum += $3} END {print sum}'`" + if [ "$cpuload" -lt "2" ] + then + cpuloadlowsecs=$((cpuloadlowsecs+1)) + echo "CPU load has been low for $cpuloadlowsecs s..." + else + cpuloadlowsecs=0 + echo "CPU load isn't low enough ($cpuload %)..." + fi + sleep 1 +done + +if [ $shouldstopui -eq 0 ] +then + echo "OK, the device might be ready now, but the UI is still running," + echo "so take a look at the screen to check if it's not doing something special." +fi diff --git a/scripts/restore-device-normal-state.sh b/scripts/restore-device-normal-state.sh new file mode 100755 index 0000000..583a24d --- /dev/null +++ b/scripts/restore-device-normal-state.sh @@ -0,0 +1,42 @@ +# Copyright 2014 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Un-does the effect of prepare-device-for-benchmarking.sh + +#!/bin/bash + +echo "restoring mpdecision..." +adb root +adb remount +adb shell "mv /system/bin/mpdecision-dontfind /system/bin/mpdecision" > /dev/null + +echo "rebooting device..." +adb reboot +adb wait-for-device + +cpuloadlowsecs=0 +echo "waiting for CPU load to settle down..." +while [ $cpuloadlowsecs -lt 5 ] +do + cpuload="`adb shell top -n 1 -d 1 -s cpu | awk '{sum += $3} END {print sum}'`" + if [ "$cpuload" -lt "2" ] + then + cpuloadlowsecs=$((cpuloadlowsecs+1)) + echo "CPU load has been low for $cpuloadlowsecs s..." + else + cpuloadlowsecs=0 + echo "CPU load isn't low enough ($cpuload %)..." + fi + sleep 1 +done diff --git a/scripts/test-android.sh b/scripts/test-android.sh new file mode 100755 index 0000000..b5d83d7 --- /dev/null +++ b/scripts/test-android.sh @@ -0,0 +1,83 @@ +# Copyright 2014 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#!/bin/bash + +if [ -z "$CXX" ] +then + echo "please set the CXX environment variable to point to your native Android toolchain C++ compiler" + exit 1 +fi + +default_cflags="-O3" + +if [ "$#" -eq 0 ] +then + echo "Usage: $0 files... 
[cflags...]" + echo "All command-line parameters are passed along to the C++ compiler, so they can \ +be either source files, or compiler flags." + echo "Default cflags: $default_cflags" + echo "Relies on the CXX environment variable to point to an Android C++ toolchain compiler." + exit 1 +fi + +EXE=gemmlowp-android-binary + +$CXX \ + --std=c++11 \ + -Wall -Wextra -pedantic \ + -fPIE -pie -mfpu=neon -mfloat-abi=softfp \ + -lstdc++ -latomic \ + -I . -I .. \ + -o $EXE \ + -Wno-unused-variable -Wno-unused-parameter \ + $default_cflags \ + $* + +if [ $? != 0 ]; then + echo "build failed" + exit 1 +fi + +adb root + +if [ $? != 0 ]; then + echo "$0: adb root failed" + exit 1 +fi + +adb shell mkdir -p /data/local/tmp + +if [ $? != 0 ]; then + echo "$0: adb shell failed to mkdir /data/local/tmp" + exit 1 +fi + +adb push $EXE /data/local/tmp + +if [ $? != 0 ]; then + echo "$0: adb push failed to write to /data/local/tmp" + exit 1 +fi + +adb shell "echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor" + +adb shell "/data/local/tmp/$EXE" | tee "log-$EXE" + +if [ $? != 0 ]; then + echo "$0: adb shell failed to run binary on device" + exit 1 +fi + +adb shell "echo ondemand > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor" diff --git a/test/benchmark.cc b/test/benchmark.cc new file mode 100644 index 0000000..d59164a --- /dev/null +++ b/test/benchmark.cc @@ -0,0 +1,210 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include <unistd.h> +#ifdef __APPLE__ +#include <sys/time.h> +#endif + +#include <iostream> +#include <ctime> +#include <cstdint> +#include <vector> +#include <map> +#include <cstdlib> + +#include "test/test.h" +#include "public/gemmlowp.h" + +namespace gemmlowp { + +double time() { +#ifdef __APPLE__ + timeval t; + gettimeofday(&t, nullptr); + return t.tv_sec + 1e-6 * t.tv_usec; +#else + timespec t; + clock_gettime(CLOCK_REALTIME, &t); + return t.tv_sec + 1e-9 * t.tv_nsec; +#endif +} + +const double min_accurate_duration = 1e-1; +const std::size_t min_working_set_size = 16 * 1024 * 1024; + +template <typename Kernel, typename LhsType, typename RhsType, + typename ResultType> +double gflops_for_gemm_size(GemmContext* context, int rows, int depth, + int cols) { + typedef std::uint8_t Scalar; + + // set up the matrix pool + + const std::size_t combined_three_matrices_sizes = + sizeof(Scalar) * (rows * depth + depth * cols + rows * cols); + + const std::size_t matrix_pool_size = + 1 + min_working_set_size / combined_three_matrices_sizes; + + std::vector<LhsType> lhs(matrix_pool_size); + std::vector<RhsType> rhs(matrix_pool_size); + std::vector<ResultType> result(matrix_pool_size); + + lhs[0].Resize(rows, depth); + MakeConstant(&lhs[0], 128); + rhs[0].Resize(depth, cols); + MakeConstant(&rhs[0], 128); + result[0].Resize(rows, cols); + MakeZero(&result[0]); + + for (std::size_t i = 1; i < matrix_pool_size; i++) { + lhs[i] = lhs[0]; + rhs[i] = rhs[0]; + result[i] = result[0]; + } + + const int depth_shift = static_cast<int>( + std::ceil(0.5 * std::log(static_cast<float>(depth)) / std::log(2.0f))); + + // main benchmark loop + + int iters_at_a_time = 1; + float time_per_iter = 0.0f; + std::size_t matrix_index = 0; + + while (true) { + double starttime = time(); + for (int i = 0; i < iters_at_a_time; i++) { + Gemm(context, lhs[matrix_index].const_map(), + rhs[matrix_index].const_map(), &result[matrix_index].map(), -75, -91, + 74980, 123, 18 + depth_shift); + + matrix_index++; + if (matrix_index == matrix_pool_size) { + matrix_index = 0; + } + } + double endtime = time(); + + const float timing = static_cast<float>(endtime - starttime); + + if (timing >= min_accurate_duration) { + time_per_iter = timing / iters_at_a_time; + break; + } + + iters_at_a_time *= 2; + } + + return 2e-9 * rows * depth * cols / time_per_iter; +} + +void benchmark(GemmContext* context) { +#ifdef GEMMLOWP_TEST_KERNEL + typedef gemmlowp::GEMMLOWP_TEST_KERNEL KernelForGEMM; + typedef gemmlowp::GEMMLOWP_TEST_KERNEL KernelForGEMV; +#else + typedef gemmlowp::DefaultKernelForGEMM KernelForGEMM; + typedef gemmlowp::DefaultKernelForGEMV KernelForGEMV; +#endif + + std::map<std::tuple<int, int, int>, std::vector<double>> benchmark_results; + + std::vector<std::tuple<int, int, int>> benchmark_sizes; + benchmark_sizes.emplace_back(10, 10, 10); + benchmark_sizes.emplace_back(20, 20, 20); + benchmark_sizes.emplace_back(30, 30, 30); + benchmark_sizes.emplace_back(40, 40, 40); + benchmark_sizes.emplace_back(50, 50, 50); + benchmark_sizes.emplace_back(60, 60, 60); + benchmark_sizes.emplace_back(64, 256, 147); + benchmark_sizes.emplace_back(100, 100, 1); + benchmark_sizes.emplace_back(100, 100, 100); + benchmark_sizes.emplace_back(100, 1000, 100); + benchmark_sizes.emplace_back(1000, 1000, 1); + benchmark_sizes.emplace_back(1000, 1000, 10); + benchmark_sizes.emplace_back(1000, 1000, 100); + benchmark_sizes.emplace_back(1000, 1000, 1000); + + const int repeat = 2; + + typedef Matrix<std::uint8_t, MapOrder::RowMajor> LhsType; + typedef 
Matrix<std::uint8_t, MapOrder::ColMajor> RhsType; + typedef Matrix<std::uint8_t, MapOrder::ColMajor> ResultType; + +#ifdef GEMMLOWP_TEST_PROFILE + gemmlowp::RegisterCurrentThreadForProfiling(); + gemmlowp::StartProfiling(); +#endif + + // We don't record the first repetition, it's just warm-up. + for (int r = 0; r < repeat + 1; r++) { + std::cout << "repetition " << r + 1 << "/" << repeat + 1 << "...\r" + << std::flush; + for (auto s : benchmark_sizes) { + double gflops = 0; + int rows = std::get<0>(s); + int depth = std::get<1>(s); + int cols = std::get<2>(s); + if (cols > KernelForGEMM::Format::kCols / 2) { + gflops = + gflops_for_gemm_size<KernelForGEMM, LhsType, RhsType, ResultType>( + context, rows, depth, cols); + } else { + gflops = + gflops_for_gemm_size<KernelForGEMV, LhsType, RhsType, ResultType>( + context, rows, depth, cols); + } + if (r > 0) { + benchmark_results[s].emplace_back(gflops); + } + } + } + + std::cout << " \r" + << std::flush; + +#ifdef GEMMLOWP_TEST_PROFILE + gemmlowp::FinishProfiling(); +#endif + + std::cout.precision(4); + + for (auto b : benchmark_results) { + sort(b.second.begin(), b.second.end()); + std::cout << std::get<0>(b.first) << "x" << std::get<1>(b.first) << "x" + << std::get<2>(b.first) << " : " << b.second.back() << " GFlops/s" + << std::endl; + } + std::cout << std::endl; +} + +} // end namespace gemmlowp + +int main() { + { + gemmlowp::GemmContext context; + std::cout << "Benchmarking default mode (typically multi-threaded)..." + << std::endl; + gemmlowp::benchmark(&context); + } + + { + gemmlowp::GemmContext context; + context.set_max_num_threads(1); + std::cout << "Benchmarking single-threaded mode..." << std::endl; + gemmlowp::benchmark(&context); + } +} diff --git a/test/test.cc b/test/test.cc new file mode 100644 index 0000000..5574640 --- /dev/null +++ b/test/test.cc @@ -0,0 +1,526 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test/test.h" + +#include <unistd.h> +#include <iostream> +#include <ctime> +#include <cstdint> +#include <vector> +#include <cstdlib> +#include <string> + +#include "public/gemmlowp.h" +#include "internal/kernel_reference.h" +#include "eight_bit_int_gemm/eight_bit_int_gemm.h" + +namespace gemmlowp { + +struct ReferenceEightBitIntGemmContext { + ReferenceEightBitIntGemmContext() + : saturated_0_values(0), saturated_255_values(0) {} + + int saturated_0_values, saturated_255_values; +}; + +void ReferenceEightBitIntGemm(ReferenceEightBitIntGemmContext* context, int m, + int n, int k, const uint8_t* a, int32_t a_offset, + int lda, const uint8_t* b, int32_t b_offset, + int ldb, uint8_t* c, int32_t c_offset, + int32_t c_mult_int, int32_t c_shift, int ldc) { + context->saturated_0_values = 0; + context->saturated_255_values = 0; + int i, j, l; + + for (j = 0; j < n; j++) { + for (i = 0; i < m; i++) { + int32_t total = 0; + for (l = 0; l < k; l++) { + const int a_index = ((i * lda) + l); + const uint8_t a_as_byte = a[a_index]; + const int32_t a_as_int = ((static_cast<std::int32_t>(a_as_byte)) + a_offset); + const int b_index = ((j * ldb) + l); + const uint8_t b_as_byte = b[b_index]; + const int32_t b_as_int = ((static_cast<std::int32_t>(b_as_byte)) + b_offset); + const int32_t mult_as_int = (a_as_int * b_as_int); + total += mult_as_int; + } + const int c_index = ((ldc * i) + j); + int32_t output = + ((((total + c_offset) * c_mult_int) + (1 << (c_shift - 1))) >> + c_shift); + if (output >= 255) { + output = 255; + context->saturated_255_values++; + } + if (output <= 0) { + output = 0; + context->saturated_0_values++; + } + c[c_index] = static_cast<std::uint8_t>(output); + } + } +} + +// *GemmWrapper's allow to wrap various Gemm functions in a uniform +// interface, so we can use the same testing code to test all of them + +template <typename Kernel, typename Scalar, MapOrder LhsOrder, + MapOrder RhsOrder, MapOrder ResultOrder> +struct SingleThreadGemmWrapper { + static const int kLhsBitDepth = Kernel::kLhsBitDepth; + static const int kRhsBitDepth = Kernel::kRhsBitDepth; + + static const char* Name() { + static char buf[256]; + snprintf(buf, sizeof(buf), + "SingleThreadGemm, Kernel: %s", Kernel().Name()); + return buf; + } + + typedef SingleThreadGemmContext Context; + + static void Gemm(Context* context, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + SingleThreadGemm<typename Kernel::Format, Scalar, LhsOrder, RhsOrder, + ResultOrder>(context, Kernel(), lhs, rhs, result, + lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); + } +}; + +template <typename Kernel, typename Scalar, MapOrder LhsOrder, + MapOrder RhsOrder, MapOrder ResultOrder> +struct MultiThreadGemmWrapper { + static const int kLhsBitDepth = Kernel::kLhsBitDepth; + static const int kRhsBitDepth = Kernel::kRhsBitDepth; + + static const char* Name() { + static char buf[256]; + snprintf(buf, sizeof(buf), + "MultiThreadGemm, Kernel: %s", Kernel().Name()); + return buf; + } + + typedef MultiThreadGemmContext Context; + + static void Gemm(Context* context, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + MultiThreadGemm<typename 
Kernel::Format, Scalar, LhsOrder, RhsOrder, + ResultOrder>(context, Kernel(), lhs, rhs, result, + lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); + } +}; + +template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder, + MapOrder ResultOrder> +struct PublicGemmWrapper { + static const int kLhsBitDepth = 8; + static const int kRhsBitDepth = 8; + + static const char* Name() { return "public Gemm"; } + + typedef GemmContext Context; + + static void Gemm(Context* context, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + gemmlowp::Gemm<uint8_t, LhsOrder, RhsOrder, ResultOrder>( + context, lhs, rhs, result, lhs_offset, rhs_offset, result_offset, + result_mult_int, result_shift); + } +}; + +template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder, + MapOrder ResultOrder> +struct EightBitIntGemmWrapper { + static const int kLhsBitDepth = 8; + static const int kRhsBitDepth = 8; + + static const char* Name() { return "EightBitIntGemm"; } + + typedef void Context; + + static void Gemm(Context*, const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + eight_bit_int_gemm::EightBitIntGemm( + rhs.cols(), lhs.rows(), lhs.cols(), rhs.data(), rhs_offset, + rhs.stride(), lhs.data(), lhs_offset, lhs.stride(), result->data(), + result_offset, result_mult_int, result_shift, result->stride()); + } +}; + +template <typename Scalar, MapOrder LhsOrder, MapOrder RhsOrder, + MapOrder ResultOrder> +struct ReferenceEightBitIntGemmWrapper { + static const int kLhsBitDepth = 8; + static const int kRhsBitDepth = 8; + + static const char* Name() { return "ReferenceEightBitIntGemm"; } + + typedef ReferenceEightBitIntGemmContext Context; + + static void Gemm(Context* context, + const MatrixMap<const Scalar, LhsOrder>& lhs, + const MatrixMap<const Scalar, RhsOrder>& rhs, + MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift) { + ReferenceEightBitIntGemm( + context, rhs.cols(), lhs.rows(), lhs.cols(), rhs.data(), rhs_offset, + rhs.stride(), lhs.data(), lhs_offset, lhs.stride(), result->data(), + result_offset, result_mult_int, result_shift, result->stride()); + } +}; + +// Our approach to choosing result_shift values for testing, is bisection. +// This function takes an interval, [result_shift_min .. result_shift_max]. +// If too much saturation occurred in either direction, it bisects accordingly, +// recursing until the interval contains only one value. +// The primary reason why we prefer this over computing optimal shift values, +// is that we actually want to exercise some saturation, as there is nontrivial +// code handling that in gemmlowp. +// Secondarily, this is faster than computing optimal shifts, since in 90% of +// cases the first-tried shift value 16 turns out to be good enough. 
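+// A worked illustration of that bisection (the numbers are only an example,
+// not from a real run): test_gemm() below starts the recursion with the
+// interval [0 .. 32], so the first shift tried is (0 + 32) / 2 = 16. If the
+// reference result then saturates to 255 too often (more than 1/16 of the
+// result entries), the outputs were too large and a bigger shift is needed,
+// so the interval narrows to [16 .. 32] and the next try is 24. If instead
+// too many outputs saturate to 0, the interval narrows to [0 .. 16] and the
+// next try is 8. The recursion stops once saturation is within bounds or the
+// interval has collapsed to a single value.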
+template <typename GemmWrapper, typename LhsType, typename RhsType, + typename ResultType> +void test_gemm_impl(typename GemmWrapper::Context* context, const LhsType& lhs, + const RhsType& rhs, ResultType* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int, + int result_shift_min, int result_shift_max) { + const int rows = lhs.rows(); + const int cols = rhs.cols(); + Check(lhs.cols() == rhs.rows()); + const int depth = lhs.cols(); + + const int result_shift = (result_shift_min + result_shift_max) / 2; + + GemmWrapper::Gemm(context, lhs.const_map(), rhs.const_map(), &result->map(), + lhs_offset, rhs_offset, result_offset, result_mult_int, + result_shift); + + typedef typename ResultType::Scalar Scalar; + static const MapOrder kLhsOrder = LhsType::kOrder; + static const MapOrder kRhsOrder = RhsType::kOrder; + static const MapOrder kResultOrder = ResultType::kOrder; + ResultType ref_result(rows, cols); + ReferenceEightBitIntGemmContext reference_context; + ReferenceEightBitIntGemmWrapper<Scalar, kLhsOrder, kRhsOrder, + kResultOrder>::Gemm(&reference_context, + lhs.const_map(), + rhs.const_map(), + &ref_result.map(), + lhs_offset, rhs_offset, + result_offset, + result_mult_int, + result_shift); + + const bool good = *result == ref_result; + printf("%s: %dx%dx%d, %s, offsets %d/%d/%d, mult %d, shift %d\n", + good ? "PASS" : "FAIL", rows, depth, cols, GemmWrapper::Name(), + lhs_offset, rhs_offset, result_offset, result_mult_int, result_shift); + + if (!good) { + int maxdiff = 0; + int countdiff = 0; + for (int c = 0; c < result->cols(); c++) { + for (int r = 0; r < result->rows(); r++) { + int a = (*result)(r, c); + int b = ref_result(r, c); + if (a != b) { + countdiff++; + maxdiff = std::max(maxdiff, std::abs(a - b)); + } + } + } + printf("max difference: %d\n", maxdiff); + printf("number of different places: %d\n", countdiff); + int bad_coeffs_printed = 0; + for (int c = 0; c < result->cols() && bad_coeffs_printed < 20; c++) { + for (int r = 0; r < result->rows() && bad_coeffs_printed < 20; r++) { + if (ref_result(r, c) != (*result)(r, c)) { + printf("bad coeff: at (%d, %d), expected %d, got %d\n", r, c, + ref_result(r, c), (*result)(r, c)); + bad_coeffs_printed++; + } + } + } + } + + Check(good); + + if (result_shift_min != result_shift_max) { + const int max_allowed_saturated_values = result->size() / 16; + + int new_result_shift_min = result_shift_min; + int new_result_shift_max = result_shift_max; + bool retry = false; + + if (reference_context.saturated_0_values > max_allowed_saturated_values) { + new_result_shift_max = (result_shift_min + result_shift_max) / 2; + retry = true; + } + + if (reference_context.saturated_255_values > max_allowed_saturated_values) { + new_result_shift_min = (result_shift_min + result_shift_max) / 2; + retry = true; + } + + if (retry) { + test_gemm_impl<GemmWrapper>(context, lhs, rhs, result, lhs_offset, + rhs_offset, result_offset, result_mult_int, + new_result_shift_min, new_result_shift_max); + } + } +} + +template <typename GemmWrapper, typename LhsType, typename RhsType, + typename ResultType> +void test_gemm(typename GemmWrapper::Context* context, const LhsType& lhs, + const RhsType& rhs, ResultType* result, int lhs_offset, + int rhs_offset, int result_offset, int result_mult_int) { + test_gemm_impl<GemmWrapper>(context, lhs, rhs, result, lhs_offset, rhs_offset, + result_offset, result_mult_int, 0, 32); +} +enum class WhatParamsToTest { + AllCombos, + OnlyGenericCase, +}; + +template <typename GemmWrapper> +void 
test_gemm(typename GemmWrapper::Context* context, int rows, int depth, + int cols, + WhatParamsToTest what_to_test = WhatParamsToTest::AllCombos) { + typedef std::uint8_t Scalar; + typedef Matrix<Scalar, MapOrder::RowMajor> LhsType; + LhsType lhs(rows, depth); + MakeRandom(&lhs, GemmWrapper::kLhsBitDepth); + typedef Matrix<Scalar, MapOrder::ColMajor> RhsType; + RhsType rhs(depth, cols); + MakeRandom(&rhs, GemmWrapper::kRhsBitDepth); + typedef Matrix<Scalar, MapOrder::ColMajor> ResultType; + ResultType result(rows, cols); + MakeZero(&result); + + if (what_to_test == WhatParamsToTest::AllCombos) { + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 0, 0, 1); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 10, 0, 0, 1); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 10, 0, 1); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 0, 10, 1); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 0, 0, 0, 10); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 10, 10, 10, 10); + test_gemm<GemmWrapper>(context, lhs, rhs, &result, 256, 1, 17, 4); + } + test_gemm<GemmWrapper>(context, lhs, rhs, &result, -75, -91, 74980, 123); +} + +template <typename Kernel> +void test_gemm_kernel(MultiThreadGemmContext* context) { + typedef MultiThreadGemmWrapper<Kernel, uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, + MapOrder::ColMajor> GemmWrapper; + test_gemm<GemmWrapper>(context, 1, 1, 1, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 2, 2, 2, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 3, 3, 3, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 4, 4, 4, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 5, 5, 5, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 9, 11, 13, WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 50, 50, 50, WhatParamsToTest::AllCombos); + test_gemm<GemmWrapper>(context, 500, 500, 500, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 100, 5000, 100, + WhatParamsToTest::OnlyGenericCase); +} + +template <typename GemmWrapper> +void test_gemm(typename GemmWrapper::Context* context) { + test_gemm<GemmWrapper>(context, 1, 1, 1); + test_gemm<GemmWrapper>(context, 2, 1, 1); + test_gemm<GemmWrapper>(context, 1, 2, 1); + test_gemm<GemmWrapper>(context, 1, 1, 2); + test_gemm<GemmWrapper>(context, 2, 2, 2); + test_gemm<GemmWrapper>(context, 3, 3, 3); + test_gemm<GemmWrapper>(context, 4, 4, 4); + test_gemm<GemmWrapper>(context, 5, 5, 5); + test_gemm<GemmWrapper>(context, 6, 6, 6); + test_gemm<GemmWrapper>(context, 3, 5, 7); + test_gemm<GemmWrapper>(context, 7, 3, 5); + test_gemm<GemmWrapper>(context, 5, 7, 3); + test_gemm<GemmWrapper>(context, 8, 8, 8); + + test_gemm<GemmWrapper>(context, 16, 16, 16); + test_gemm<GemmWrapper>(context, 32, 32, 32); + test_gemm<GemmWrapper>(context, 64, 64, 64); + test_gemm<GemmWrapper>(context, 128, 128, 128); + + test_gemm<GemmWrapper>(context, 17, 24, 31); + test_gemm<GemmWrapper>(context, 37, 55, 73); + test_gemm<GemmWrapper>(context, 57, 87, 117); + test_gemm<GemmWrapper>(context, 93, 83, 73); + test_gemm<GemmWrapper>(context, 109, 89, 99); + test_gemm<GemmWrapper>(context, 78, 101, 82); + + test_gemm<GemmWrapper>(context, 512, 512, 512, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1024, 1024, 1024, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 567, 2345, 123, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 100, 
5000, 100, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1, 1, 1000, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1000, 1, 1, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1, 1000, 1, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1, 1000, 1000, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1000, 1, 1000, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 1000, 1000, 1, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 777, 3456, 1, + WhatParamsToTest::OnlyGenericCase); + test_gemm<GemmWrapper>(context, 4567, 555, 1, + WhatParamsToTest::OnlyGenericCase); +} + +template <typename GemmWrapper> +void test_gemv(typename GemmWrapper::Context* context) { + test_gemm<GemmWrapper>(context, 2, 2, 1); + test_gemm<GemmWrapper>(context, 3, 3, 1); + test_gemm<GemmWrapper>(context, 4, 4, 1); + test_gemm<GemmWrapper>(context, 5, 5, 1); + test_gemm<GemmWrapper>(context, 6, 6, 1); + test_gemm<GemmWrapper>(context, 3, 5, 1); + test_gemm<GemmWrapper>(context, 7, 3, 1); + test_gemm<GemmWrapper>(context, 5, 7, 1); + test_gemm<GemmWrapper>(context, 8, 8, 1); + test_gemm<GemmWrapper>(context, 32, 32, 1); + test_gemm<GemmWrapper>(context, 128, 128, 1); + test_gemm<GemmWrapper>(context, 321, 123, 1); +} + +void test() { +#ifdef GEMMLOWP_TEST_PROFILE + RegisterCurrentThreadForProfiling(); + StartProfiling(); +#endif + + GemmContext context; + + // Test the internal GEMM interfaces + test_gemm< + SingleThreadGemmWrapper<DefaultKernelForGEMM, uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>( + &context); + + test_gemm< + MultiThreadGemmWrapper<DefaultKernelForGEMM, uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>(&context); + + // Test the public GEMM interfaces + test_gemm<PublicGemmWrapper<uint8_t, MapOrder::RowMajor, MapOrder::ColMajor, + MapOrder::ColMajor>>(&context); + + test_gemm<EightBitIntGemmWrapper<uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>( + &context); + + // Test GEMV cases (internal interfaces) + test_gemv< + SingleThreadGemmWrapper<DefaultKernelForGEMV, uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>( + &context); + + test_gemv< + MultiThreadGemmWrapper<DefaultKernelForGEMV, uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>(&context); + + // Test GEMV cases (public interfaces) + test_gemv<PublicGemmWrapper<uint8_t, MapOrder::RowMajor, MapOrder::ColMajor, + MapOrder::ColMajor>>(&context); + + test_gemv<EightBitIntGemmWrapper<uint8_t, MapOrder::RowMajor, + MapOrder::ColMajor, MapOrder::ColMajor>>( + &context); + + // Test specific kernels with various different formats, + // to exercises corner cases especially in the packing code. 
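+// As a reading aid for the instantiations below (an informal gloss; the
+// kernel format headers under internal/ are authoritative):
+// CellFormat<W, D> can be read as a cell W wide and D deep, and
+// KernelSideFormat<Cell, N> as a kernel side made of N such cells. So, for
+// instance,
+//   KernelFormat<KernelSideFormat<CellFormat<3, 4>, 2>,
+//                KernelSideFormat<CellFormat<5, 4>, 3>>
+// pairs a 2 * 3 = 6-wide LHS side with a 3 * 5 = 15-wide RHS side, both
+// walking the depth dimension 4 levels at a time. Note that the two sides of
+// every format below share the same cell depth.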
+ test_gemm_kernel< + ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<1, 1>, 1>, + KernelSideFormat<CellFormat<1, 1>, 1>>>>( + &context); + + test_gemm_kernel< + ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<3, 4>, 2>, + KernelSideFormat<CellFormat<5, 4>, 3>>>>( + &context); + + test_gemm_kernel< + ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<5, 3>, 3>, + KernelSideFormat<CellFormat<4, 3>, 2>>>>( + &context); + + test_gemm_kernel< + ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 3>, 3>, + KernelSideFormat<CellFormat<4, 3>, 1>>>>( + &context); + + test_gemm_kernel< + ReferenceKernel<KernelFormat<KernelSideFormat<CellFormat<4, 3>, 3>, + KernelSideFormat<CellFormat<2, 3>, 2>>>>( + &context); + +// Test all our optimized kernels, even if they are not used +// at the moment, as they might be handy later and so it's +// useful to keep them functional for now. +#ifdef GEMMLOWP_NEON + test_gemm_kernel<gemmlowp::NEONKernel12x4Depth2>(&context); + test_gemm_kernel<gemmlowp::NEONKernel20x1Depth4>(&context); + test_gemm_kernel<gemmlowp::NEONKernel8x1Depth4>(&context); +#endif + +#ifdef GEMMLOWP_TEST_PROFILE + FinishProfiling(); +#endif + + std::cerr << "All tests passed." << std::endl; + + // We have been testing the eight_bit_int_gemm, so we should free its + // persistent + // resources now to avoid having leak-checking tools report leaks. + eight_bit_int_gemm::FreePersistentResources(); +} + +} // end namespace gemmlowp + +int main() { gemmlowp::test(); } diff --git a/test/test.h b/test/test.h new file mode 100644 index 0000000..f630674 --- /dev/null +++ b/test/test.h @@ -0,0 +1,118 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// test.h: shared testing helpers. + +#ifndef GEMMLOWP_TEST_TEST_H_ +#define GEMMLOWP_TEST_TEST_H_ + +#ifdef GEMMLOWP_TEST_PROFILE +#define GEMMLOWP_PROFILING +#include "profiling/profiler.h" +#endif + +#include <iostream> +#include <cstdlib> +#include <vector> + +#include "public/map.h" + +namespace gemmlowp { + +inline int Random() { + // Use ugly old rand() since this doesn't need to be high quality. + return rand(); +} + +inline void Check(bool b) { ReleaseBuildAssertion(b, "test failed"); } + +// gemmlowp itself doesn't have a Matrix class, only a MatrixMap class, +// since it only maps existing data. In tests though, we need to +// create our own matrices. 
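+// Typical usage of this class, as exercised by test.cc (shown purely as an
+// illustration of the pattern; rows/depth/cols stand for whatever sizes a
+// test picks):
+//   Matrix<std::uint8_t, MapOrder::RowMajor> lhs(rows, depth);
+//   MakeRandom(&lhs, 8);    // fill with random 8-bit values
+//   Matrix<std::uint8_t, MapOrder::ColMajor> result(rows, cols);
+//   MakeZero(&result);
+//   ... Gemm(context, lhs.const_map(), rhs.const_map(), &result.map(), ...);
+// i.e. Matrix owns its storage (a std::vector) while map()/const_map()
+// expose the MatrixMap views that the gemmlowp entry points take.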
+template <typename tScalar, MapOrder tOrder> +class Matrix : public MatrixMap<tScalar, tOrder> { + public: + typedef MatrixMap<tScalar, tOrder> Map; + typedef MatrixMap<const tScalar, tOrder> ConstMap; + typedef typename Map::Scalar Scalar; + using Map::kOrder; + using Map::rows_; + using Map::cols_; + using Map::stride_; + using Map::data_; + + public: + Matrix() : Map(nullptr, 0, 0, 0) {} + + Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); } + + Matrix(const Matrix& other) { *this = other; } + + Matrix& operator=(const Matrix& other) { + Resize(other.rows_, other.cols_); + memcpy(data_, other.data_, size() * sizeof(Scalar)); + return *this; + } + + friend bool operator==(const Matrix& a, const Matrix& b) { + return a.rows_ == b.rows_ && a.cols_ == b.cols_ && + !memcmp(a.data_, b.data_, a.size()); + } + + void Resize(int rows, int cols) { + rows_ = rows; + cols_ = cols; + stride_ = kOrder == MapOrder::ColMajor ? rows : cols; + storage.resize(size()); + data_ = storage.data(); + } + + int size() const { return rows_ * cols_; } + + Map& map() { return *static_cast<Map*>(this); } + + ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); } + + protected: + std::vector<Scalar> storage; +}; + +template <typename MatrixType> +void MakeRandom(MatrixType* m, int bits) { + typedef typename MatrixType::Scalar Scalar; + const Scalar mask = (1 << bits) - 1; + for (int c = 0; c < m->cols(); c++) { + for (int r = 0; r < m->rows(); r++) { + (*m)(r, c) = Random() & mask; + } + } +} + +template <typename MatrixType> +void MakeConstant(MatrixType* m, typename MatrixType::Scalar val) { + for (int c = 0; c < m->cols(); c++) { + for (int r = 0; r < m->rows(); r++) { + (*m)(r, c) = val; + } + } +} + +template <typename MatrixType> +void MakeZero(MatrixType* m) { + MakeConstant(m, 0); +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_TEST_TEST_H_ diff --git a/test/test_allocator.cc b/test/test_allocator.cc new file mode 100644 index 0000000..9e76b79 --- /dev/null +++ b/test/test_allocator.cc @@ -0,0 +1,54 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test/test.h" +#include "internal/allocator.h" + +namespace gemmlowp { + +void test_allocator(Allocator* a, int max_array_size) { + const std::size_t int32_array_size = Random() % max_array_size; + auto handle_to_int32_array = a->Reserve<std::int32_t>(int32_array_size); + const std::size_t int8_array_size = Random() % max_array_size; + auto handle_to_int8_array = a->Reserve<std::int8_t>(int8_array_size); + a->Commit(); + std::int32_t* int32_array = + a->GetPointer<std::int32_t>(handle_to_int32_array); + std::int8_t* int8_array = a->GetPointer<std::int8_t>(handle_to_int8_array); + Check(int32_array == a->GetPointer<std::int32_t>(handle_to_int32_array)); + Check(int8_array == a->GetPointer<std::int8_t>(handle_to_int8_array)); + Check( + !(reinterpret_cast<std::uintptr_t>(int32_array) % Allocator::kAlignment)); + Check( + !(reinterpret_cast<std::uintptr_t>(int8_array) % Allocator::kAlignment)); + Check(reinterpret_cast<std::uintptr_t>(int8_array) >= + reinterpret_cast<std::uintptr_t>(int32_array + int32_array_size)); + memset(int32_array, 0, sizeof(*int32_array) * int32_array_size); + memset(int8_array, 0, sizeof(*int8_array) * int8_array_size); + a->Decommit(); +} + +void test_allocator() { + Allocator allocator; + + // Test allocating increasingly large sizes on the same allocator, + // starting with size 0. + for (int i = 1; i < 1000; i += 10) { + test_allocator(&allocator, i); + } +} + +} // namespace gemmlowp + +int main() { gemmlowp::test_allocator(); } diff --git a/test/test_blocking_counter.cc b/test/test_blocking_counter.cc new file mode 100644 index 0000000..1fa3c9d --- /dev/null +++ b/test/test_blocking_counter.cc @@ -0,0 +1,102 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "test/test.h" + +#include <pthread.h> +#include <vector> + +#include "internal/multi_thread_gemm.h" + +namespace gemmlowp { + +class Thread { + public: + Thread(BlockingCounter* blocking_counter, int number_of_times_to_decrement) + : blocking_counter_(blocking_counter), + number_of_times_to_decrement_(number_of_times_to_decrement), + made_the_last_decrement_(false) { + pthread_create(&thread_, nullptr, ThreadFunc, this); + } + + ~Thread() { Join(); } + + bool Join() const { + pthread_join(thread_, nullptr); + return made_the_last_decrement_; + } + + private: + Thread(const Thread& other) = delete; + + void ThreadFunc() { + for (int i = 0; i < number_of_times_to_decrement_; i++) { + Check(!made_the_last_decrement_); + made_the_last_decrement_ = blocking_counter_->DecrementCount(); + } + } + + static void* ThreadFunc(void* ptr) { + static_cast<Thread*>(ptr)->ThreadFunc(); + return nullptr; + } + + BlockingCounter* const blocking_counter_; + const int number_of_times_to_decrement_; + pthread_t thread_; + bool made_the_last_decrement_; +}; + +void test_blocking_counter(BlockingCounter* blocking_counter, int num_threads, + int num_decrements_per_thread, + int num_decrements_to_wait_for) { + std::vector<Thread*> threads; + blocking_counter->Reset(num_decrements_to_wait_for); + for (int i = 0; i < num_threads; i++) { + threads.push_back(new Thread(blocking_counter, num_decrements_per_thread)); + } + blocking_counter->Wait(); + + int num_threads_that_made_the_last_decrement = 0; + for (int i = 0; i < num_threads; i++) { + if (threads[i]->Join()) { + num_threads_that_made_the_last_decrement++; + } + delete threads[i]; + } + Check(num_threads_that_made_the_last_decrement == 1); +} + +void test_blocking_counter() { + BlockingCounter* blocking_counter = new BlockingCounter; + + // repeating the entire test sequence ensures that we test + // non-monotonic changes. + for (int repeat = 1; repeat <= 2; repeat++) { + for (int num_threads = 1; num_threads <= 16; num_threads++) { + for (int num_decrements_per_thread = 1; + num_decrements_per_thread <= 64 * 1024; + num_decrements_per_thread *= 4) { + test_blocking_counter(blocking_counter, num_threads, + num_decrements_per_thread, + num_threads * num_decrements_per_thread); + } + } + } + delete blocking_counter; +} + +} // end namespace gemmlowp + +int main() { gemmlowp::test_blocking_counter(); } diff --git a/test/test_math_helpers.cc b/test/test_math_helpers.cc new file mode 100644 index 0000000..c1482f7 --- /dev/null +++ b/test/test_math_helpers.cc @@ -0,0 +1,134 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "test/test.h" + +#include <limits> + +#include "internal/common.h" + +namespace gemmlowp { + +// Our math helpers don't intend to be reliable all the way to the +// limit of representable range, wrt overflow. +// We don't care for 2G sized matrices. +// This test stops at half of the representable range. 
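+// Concretely (an illustrative instance, assuming the usual numeric_limits
+// values): for the std::int8_t instantiations in test_math_helpers() below,
+// std::numeric_limits<std::int8_t>::max() is 127, so ValueRangeCutoff()
+// returns 63. Keeping the random inputs below that bound keeps intermediate
+// expressions in the checks, such as x + y in the CeilQuotient checks or
+// 2 * x in the power-of-two checks, representable, so the tests exercise the
+// helpers rather than overflow.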
+template <typename Integer>
+Integer ValueRangeCutoff() {
+  return std::numeric_limits<Integer>::max() / 2;
+}
+
+int RandomNonnegativeFarAwayFromOverflow() { return Random() % (1 << 24); }
+
+template <int Modulus>
+void test_round_up_down(int x) {
+  Check(x >= RoundDown<Modulus>(x));
+  Check(x < RoundDown<Modulus>(x) + Modulus);
+  Check(RoundDown<Modulus>(x) % Modulus == 0);
+
+  Check(x <= RoundUp<Modulus>(x));
+  Check(x > RoundUp<Modulus>(x) - Modulus);
+  Check(RoundUp<Modulus>(x) % Modulus == 0);
+}
+
+template <int Modulus>
+void test_round_up_down() {
+  for (int i = 0; i < 100; i++) {
+    test_round_up_down<Modulus>(i);
+    const int N = ValueRangeCutoff<int>();
+    test_round_up_down<Modulus>(Random() % N);
+  }
+}
+
+template <typename Integer>
+void test_ceil_quotient(Integer x, Integer y) {
+  Check(CeilQuotient(x, y) * y >= x);
+  Check(CeilQuotient(x, y) * y < x + y);
+}
+
+template <typename Integer>
+void test_ceil_quotient() {
+  const Integer N = ValueRangeCutoff<Integer>();
+  const Integer K = std::min(N, Integer(100));
+  for (Integer x = 0; x < K; x++) {
+    for (Integer y = 1; y < K; y++) {
+      test_ceil_quotient(x, y);
+      test_ceil_quotient(x, Integer(1 + (Random() % (N - 1))));
+      test_ceil_quotient(Integer(Random() % N), y);
+      test_ceil_quotient(Integer(Random() % N),
+                         Integer(1 + (Random() % (N - 1))));
+    }
+  }
+}
+
+template <typename Integer>
+void test_round_up_to_next_power_of_two(Integer x) {
+  Check(RoundUpToPowerOfTwo(RoundUpToPowerOfTwo(x)) == RoundUpToPowerOfTwo(x));
+  Check(RoundUpToPowerOfTwo(x) >= x);
+  Check(x == 0 || RoundUpToPowerOfTwo(x) < 2 * x);
+  Check((RoundUpToPowerOfTwo(x) & (RoundUpToPowerOfTwo(x) - 1)) == 0);
+}
+
+template <typename Integer>
+void test_round_up_to_next_power_of_two() {
+  const Integer N = ValueRangeCutoff<Integer>();
+  const Integer K = std::min(N, Integer(100));
+  for (Integer x = 0; x < K; x++) {
+    test_round_up_to_next_power_of_two(x);
+    test_round_up_to_next_power_of_two(Random() % N);
+  }
+}
+
+void test_math_helpers() {
+  test_round_up_down<1>();
+  test_round_up_down<2>();
+  test_round_up_down<3>();
+  test_round_up_down<4>();
+  test_round_up_down<5>();
+  test_round_up_down<6>();
+  test_round_up_down<7>();
+  test_round_up_down<8>();
+  test_round_up_down<9>();
+  test_round_up_down<10>();
+  test_round_up_down<11>();
+  test_round_up_down<12>();
+  test_round_up_down<13>();
+  test_round_up_down<14>();
+  test_round_up_down<15>();
+  test_round_up_down<16>();
+
+  test_round_up_down<50>();
+  test_round_up_down<51>();
+
+  test_round_up_down<500>();
+  test_round_up_down<501>();
+
+  test_ceil_quotient<std::int8_t>();
+  test_ceil_quotient<std::uint8_t>();
+  test_ceil_quotient<std::int16_t>();
+  test_ceil_quotient<std::uint16_t>();
+  test_ceil_quotient<std::int32_t>();
+  test_ceil_quotient<std::uint32_t>();
+
+  test_round_up_to_next_power_of_two<std::int8_t>();
+  test_round_up_to_next_power_of_two<std::uint8_t>();
+  test_round_up_to_next_power_of_two<std::int16_t>();
+  test_round_up_to_next_power_of_two<std::uint16_t>();
+  test_round_up_to_next_power_of_two<std::int32_t>();
+  test_round_up_to_next_power_of_two<std::uint32_t>();
+}
+
+}  // end namespace gemmlowp
+
+int main() { gemmlowp::test_math_helpers(); }
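For orientation, here is a minimal sketch (not part of the imported sources) of how the public interface that test.cc's PublicGemmWrapper exercises might be called directly. The matrix sizes, fill values, and quantization parameters (offsets, multiplier, shift) are arbitrary placeholders.

#include <cstdint>
#include <vector>

#include "public/gemmlowp.h"

int main() {
  // Placeholder problem size (hypothetical values).
  const int rows = 64, depth = 128, cols = 32;

  // The caller owns the 8-bit storage; gemmlowp only maps it.
  std::vector<std::uint8_t> lhs_data(rows * depth, 1);
  std::vector<std::uint8_t> rhs_data(depth * cols, 1);
  std::vector<std::uint8_t> result_data(rows * cols, 0);

  // Row-major LHS: stride = number of columns (depth).
  // Column-major RHS / result: stride = number of rows.
  gemmlowp::MatrixMap<const std::uint8_t, gemmlowp::MapOrder::RowMajor> lhs(
      lhs_data.data(), rows, depth, depth);
  gemmlowp::MatrixMap<const std::uint8_t, gemmlowp::MapOrder::ColMajor> rhs(
      rhs_data.data(), depth, cols, depth);
  gemmlowp::MatrixMap<std::uint8_t, gemmlowp::MapOrder::ColMajor> result(
      result_data.data(), rows, cols, rows);

  gemmlowp::GemmContext context;

  // Placeholder quantization parameters, analogous to the ones the tests pass.
  gemmlowp::Gemm<std::uint8_t, gemmlowp::MapOrder::RowMajor,
                 gemmlowp::MapOrder::ColMajor, gemmlowp::MapOrder::ColMajor>(
      &context, lhs, rhs, &result,
      -128 /* lhs_offset */, -128 /* rhs_offset */, 32768 /* result_offset */,
      1 /* result_mult_int */, 8 /* result_shift */);

  return 0;
}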