diff options
author | Benoit Jacob <benoitjacob@google.com> | 2020-05-06 13:27:04 -0700 |
---|---|---|
committer | Copybara-Service <copybara-worker@google.com> | 2020-05-06 13:27:24 -0700 |
commit | 210c986be9454628fb5a17442380533bf5f39fa8 (patch) | |
tree | 0832d95990cc684ea917ccc5b3b177baaf0d7373 | |
parent | 1b313682ef8b8fc8ed08719c610d1c3503b016bf (diff) | |
download | ruy-210c986be9454628fb5a17442380533bf5f39fa8.tar.gz |
Move SystemAligned{Alloc,Free} functions to their own library as they are used independently of Allocator.
Merge detail::AlignedAllocator into Allocator, I didn't see a need to keep separate classes.
Move Allocator method implementations to allocator.cc.
Remove ToVoidPtr, it only had a single use. It was also possible to do without a reinterpret_cast, as the language guarantees sizeof(char)==1 and pointer casts to/from void* don't need reinterpret_cast.
Trim dependencies of :prepacked_cache.
PiperOrigin-RevId: 310216377
-rw-r--r-- | ruy/BUILD | 18 | ||||
-rw-r--r-- | ruy/allocator.cc | 57 | ||||
-rw-r--r-- | ruy/allocator.h | 155 | ||||
-rw-r--r-- | ruy/prepacked_cache.cc | 2 | ||||
-rw-r--r-- | ruy/system_aligned_alloc.cc | 51 | ||||
-rw-r--r-- | ruy/system_aligned_alloc.h | 53 |
6 files changed, 189 insertions, 147 deletions
@@ -121,6 +121,17 @@ cc_library( ) cc_library( + name = "system_aligned_alloc", + srcs = [ + "system_aligned_alloc.cc", + ], + hdrs = [ + "system_aligned_alloc.h", + ], + copts = ruy_copts_base(), +) + +cc_library( name = "prepacked_cache", srcs = [ "prepacked_cache.cc", @@ -130,11 +141,8 @@ cc_library( ], copts = ruy_copts_base(), deps = [ - ":allocator", ":mat", - ":opt_set", - ":platform", - ":time", + ":system_aligned_alloc", "//ruy/profiler:instrumentation", ], ) @@ -182,8 +190,8 @@ cc_library( ], copts = ruy_copts_base(), deps = [ - ":check_macros", ":size_util", + ":system_aligned_alloc", ], ) diff --git a/ruy/allocator.cc b/ruy/allocator.cc index d8fb738..7fdf73a 100644 --- a/ruy/allocator.cc +++ b/ruy/allocator.cc @@ -15,37 +15,44 @@ limitations under the License. #include "ruy/allocator.h" -#include <cstdint> -#include <cstdlib> - -#ifdef _WIN32 -#include <malloc.h> -#endif +#include "ruy/system_aligned_alloc.h" namespace ruy { -namespace detail { - -void *SystemAlignedAlloc(std::ptrdiff_t num_bytes) { -#ifdef _WIN32 - return _aligned_malloc(num_bytes, kMinimumBlockAlignment); -#else - void *ptr; - if (posix_memalign(&ptr, kMinimumBlockAlignment, num_bytes)) { - return nullptr; - } - return ptr; -#endif +Allocator::~Allocator() { + FreeAll(); + detail::SystemAlignedFree(ptr_); } -void SystemAlignedFree(void *ptr) { -#ifdef _WIN32 - _aligned_free(ptr); -#else - free(ptr); -#endif +void* Allocator::AllocateSlow(std::ptrdiff_t num_bytes) { + void* p = detail::SystemAlignedAlloc(num_bytes); + fallback_blocks_total_size_ += num_bytes; + fallback_blocks_.push_back(p); + return p; } -} // namespace detail +void Allocator::FreeAll() { + current_ = 0; + if (fallback_blocks_.empty()) { + return; + } + + // No rounding-up of the size means linear instead of logarithmic + // bound on the number of allocation in some worst-case calling patterns. 
+ // This is considered worth it because minimizing memory usage is important + // and actual calling patterns in applications that we care about still + // reach the no-further-allocations steady state in a small finite number + // of iterations. + std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_; + detail::SystemAlignedFree(ptr_); + ptr_ = detail::SystemAlignedAlloc(new_size); + size_ = new_size; + + for (void* p : fallback_blocks_) { + detail::SystemAlignedFree(p); + } + fallback_blocks_.clear(); + fallback_blocks_total_size_ = 0; +} } // namespace ruy diff --git a/ruy/allocator.h b/ruy/allocator.h index 2df0a22..20cc7c3 100644 --- a/ruy/allocator.h +++ b/ruy/allocator.h @@ -21,50 +21,18 @@ limitations under the License. #include <memory> #include <vector> -#include "ruy/check_macros.h" #include "ruy/size_util.h" +#include "ruy/system_aligned_alloc.h" namespace ruy { -namespace detail { - -inline void* VoidPtrAdd(void* p, std::ptrdiff_t offset) { - RUY_DCHECK(p); - std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(p) + offset; - return reinterpret_cast<void*>(addr); -} - -// Minimum alignment for blocks. -// -// Considerations: -// - This needs to be at least the alignment of any usual data type. -// - It's useful that this is at least the size of a cache line to limit -// possible cache side effects (if only on performance behavior). -// - It's useful that this is at least the size of SIMD registers, as -// some SIMD instruction sets have at least performance behavior -// differences (e.g. NEON) or even different requirements (e.g. SSE) -// based on that. -// - It's useful that this is at least the size of an "exclusive reservation -// granule" on ARM, meaning that if we use this Allocator to allocate -// an atomic variable, there will be no side effects from other things -// contending for exclusive/atomic memory accesses to it. 
While the -// ARM reference manual mentions that this granule size may be as large -// as 2048 bytes, in practice we observe it to be 64 bytes. It can -// be queried cheaply, at runtime, from userspace, if needed. -static constexpr std::ptrdiff_t kMinimumBlockAlignment = 64; - -// Primitive allocation functions obtaining aligned memory from the -// operating system. -void* SystemAlignedAlloc(std::ptrdiff_t num_bytes); -void SystemAlignedFree(void* ptr); - // Specialized allocator designed to converge to a steady-state where all // allocations are bump-ptr allocations from an already-allocated buffer. // // To support these constraints, this allocator only supports two // operations. -// - AllocateAlignedBytes: allocates a pointer to storage of a specified -// size, which must be aligned to kMinimumBlockAlignment. +// - AllocateBytes/Allocate<Pointer>: allocates a pointer to storage of a +// specified size, which will be aligned to kMinimumBlockAlignment. // - FreeAll: frees all previous allocations (but retains the internal // buffer to minimize future calls into the system allocator). // @@ -75,79 +43,59 @@ void SystemAlignedFree(void* ptr); // SystemAlignedAlloc/SystemAlignedFree. // // All operations happen on aligned blocks for simplicity. -class AlignedAllocator { +// +// Theory of operation: +// +// - ptr_, current_, and size_ implement a basic bump-ptr allocator. +// +// - in AllocateBytes, the fast path is just a bump-ptr +// allocation. If our bump-ptr allocator doesn't have enough space for an +// allocation, then we allocate a block from the system allocator to +// service the allocation request. We save that block in fallback_blocks_ +// and track the total size of the fallback blocks in +// fallback_blocks_total_size_. +// +// - in FreeAll, the fast path just resets the bump-ptr allocator. 
If +// there are any fallback blocks, we free them and reallocate the +// bump-ptr allocator's buffer so that the next sequence of allocations +// will hopefully not need any fallback blocks. +class Allocator final { public: - void operator=(const AlignedAllocator&) = delete; - ~AlignedAllocator() { - FreeAll(); - SystemAlignedFree(ptr_); - } + ~Allocator(); - void* AllocateAlignedBytes(std::ptrdiff_t num_bytes) { - RUY_DCHECK_GT(num_bytes, 0); - RUY_DCHECK((num_bytes & (kMinimumBlockAlignment - 1)) == 0); - if (void* p = AllocateFast(num_bytes)) { + void* AllocateBytes(std::ptrdiff_t num_bytes) { + if (num_bytes == 0) { + return nullptr; + } + const std::ptrdiff_t rounded_num_bytes = + round_up_pot(num_bytes, detail::kMinimumBlockAlignment); + if (void* p = AllocateFast(rounded_num_bytes)) { return p; } - return AllocateSlow(num_bytes); + return AllocateSlow(rounded_num_bytes); } - void FreeAll() { - current_ = 0; - if (fallback_blocks_.empty()) { - return; - } - - // No rounding-up of the size means linear instead of logarithmic - // bound on the number of allocation in some worst-case calling patterns. - // This is considered worth it because minimizing memory usage is important - // and actual calling patterns in applications that we care about still - // reach the no-further-allocations steady state in a small finite number - // of iterations. 
- std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_; - SystemAlignedFree(ptr_); - ptr_ = SystemAlignedAlloc(new_size); - size_ = new_size; - - for (void* p : fallback_blocks_) { - SystemAlignedFree(p); - } - fallback_blocks_.clear(); - fallback_blocks_total_size_ = 0; + template <typename Pointer> + void Allocate(std::ptrdiff_t count, Pointer* out) { + using T = typename std::pointer_traits<Pointer>::element_type; + *out = static_cast<T*>(AllocateBytes(count * sizeof(T))); } + void FreeAll(); + private: + void operator=(const Allocator&) = delete; + void* AllocateSlow(std::ptrdiff_t num_bytes); + void* AllocateFast(std::ptrdiff_t num_bytes) { if (current_ + num_bytes > size_) { return nullptr; } - void* ret = VoidPtrAdd(ptr_, current_); + void* ret = static_cast<char*>(ptr_) + current_; current_ += num_bytes; return ret; } - void* AllocateSlow(std::ptrdiff_t num_bytes) { - void* p = SystemAlignedAlloc(num_bytes); - fallback_blocks_total_size_ += num_bytes; - fallback_blocks_.push_back(p); - return p; - } - - // Theory of operation: - // - // - ptr_, current_, and size_ implement a basic bump-ptr allocator. - // - // - in AllocateAlignedBytes, the fast path is just a bump-ptr - // allocation. If our bump-ptr allocator doesn't have enough space for an - // allocation, then we allocate a block from the system allocator to - // service the allocation request. We save that block in fallback_blocks_ - // and track the total size of the fallback blocks in - // fallback_blocks_total_size_. - // - // - in FreeAll, the fast path just resets the bump-ptr allocator. If - // there are any fallback blocks, we free them and reallocate the - // bump-ptr allocator's buffer so that the next sequence of allocations - // will hopefully not need any fallback blocks. 
void* ptr_ = nullptr; std::ptrdiff_t current_ = 0; std::ptrdiff_t size_ = 0; @@ -155,31 +103,6 @@ class AlignedAllocator { std::ptrdiff_t fallback_blocks_total_size_ = 0; }; -} // namespace detail - -// The main Allocator class, with a convenient interface for allocating a -// typed buffer. -class Allocator { - public: - void* AllocateBytes(std::ptrdiff_t num_bytes) { - if (num_bytes == 0) { - return nullptr; - } - return aligned.AllocateAlignedBytes( - round_up_pot(num_bytes, detail::kMinimumBlockAlignment)); - } - template <typename Pointer> - void Allocate(std::ptrdiff_t count, Pointer* out) { - using T = typename std::pointer_traits<Pointer>::element_type; - *out = static_cast<T*>(AllocateBytes(count * sizeof(T))); - } - - void FreeAll() { aligned.FreeAll(); } - - private: - detail::AlignedAllocator aligned; -}; - } // namespace ruy #endif // RUY_RUY_ALLOCATOR_H_ diff --git a/ruy/prepacked_cache.cc b/ruy/prepacked_cache.cc index 025ba7f..ee891cb 100644 --- a/ruy/prepacked_cache.cc +++ b/ruy/prepacked_cache.cc @@ -15,9 +15,9 @@ limitations under the License. #include "ruy/prepacked_cache.h" -#include "ruy/allocator.h" #include "ruy/mat.h" #include "ruy/profiler/instrumentation.h" +#include "ruy/system_aligned_alloc.h" namespace ruy { diff --git a/ruy/system_aligned_alloc.cc b/ruy/system_aligned_alloc.cc new file mode 100644 index 0000000..7c86691 --- /dev/null +++ b/ruy/system_aligned_alloc.cc @@ -0,0 +1,51 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "ruy/system_aligned_alloc.h" + +#include <cstddef> +#include <cstdlib> + +#ifdef _WIN32 +#include <malloc.h> +#endif + +namespace ruy { + +namespace detail { + +void *SystemAlignedAlloc(std::ptrdiff_t num_bytes) { +#ifdef _WIN32 + return _aligned_malloc(num_bytes, kMinimumBlockAlignment); +#else + void *ptr; + if (posix_memalign(&ptr, kMinimumBlockAlignment, num_bytes)) { + return nullptr; + } + return ptr; +#endif +} + +void SystemAlignedFree(void *ptr) { +#ifdef _WIN32 + _aligned_free(ptr); +#else + free(ptr); +#endif +} + +} // namespace detail + +} // namespace ruy diff --git a/ruy/system_aligned_alloc.h b/ruy/system_aligned_alloc.h new file mode 100644 index 0000000..5604b5c --- /dev/null +++ b/ruy/system_aligned_alloc.h @@ -0,0 +1,53 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef RUY_RUY_SYSTEM_ALIGNED_ALLOC_H_ +#define RUY_RUY_SYSTEM_ALIGNED_ALLOC_H_ + +#include <cstddef> + +namespace ruy { + +namespace detail { + +// Minimum alignment for blocks. +// +// Considerations: +// - This needs to be at least the alignment of any usual data type. 
+// - It's useful that this is at least the size of a cache line to limit +// possible cache side effects (if only on performance behavior). +// - It's useful that this is at least the size of SIMD registers, as +// some SIMD instruction sets have at least performance behavior +// differences (e.g. NEON) or even different requirements (e.g. SSE) +// based on that. +// - It's useful that this is at least the size of an "exclusive reservation +// granule" on ARM, meaning that if we use this Allocator to allocate +// an atomic variable, there will be no side effects from other things +// contending for exclusive/atomic memory accesses to it. While the +// ARM reference manual mentions that this granule size may be as large +// as 2048 bytes, in practice we observe it to be 64 bytes. It can +// be queried cheaply, at runtime, from userspace, if needed. +static constexpr std::ptrdiff_t kMinimumBlockAlignment = 64; + +// Primitive allocation functions obtaining aligned memory from the +// operating system. +void* SystemAlignedAlloc(std::ptrdiff_t num_bytes); +void SystemAlignedFree(void* ptr); + +} // namespace detail + +} // namespace ruy + +#endif // RUY_RUY_SYSTEM_ALIGNED_ALLOC_H_ |