diff options
author | Benoit Jacob <benoitjacob@google.com> | 2015-06-25 15:50:59 -0400 |
---|---|---|
committer | Benoit Jacob <benoitjacob@google.com> | 2015-06-25 15:53:04 -0400 |
commit | 75c4ec0ba4dd86e4f763a54e01002ff29f1d57ae (patch) | |
tree | c8e35a06c7d959e6ad0a90b4929305055919e3f8 /profiling | |
download | gemmlowp-75c4ec0ba4dd86e4f763a54e01002ff29f1d57ae.tar.gz |
initial import
Diffstat (limited to 'profiling')
-rw-r--r-- | profiling/instrumentation.h | 217 | ||||
-rw-r--r-- | profiling/profiler.h | 373 |
2 files changed, 590 insertions, 0 deletions
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// instrumentation.h: contains the definitions needed to
// instrument code for profiling:
//   ScopedProfilingLabel, RegisterCurrentThreadForProfiling.
//
// profiler.h is only needed to drive the profiler:
//   StartProfiling, FinishProfiling.
//
// See the usage example in profiler.h.

#ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_
#define GEMMLOWP_PROFILING_INSTRUMENTATION_H_

#include <pthread.h>

#include <algorithm>
#include <cassert>
#include <cstddef>  // std::size_t is used throughout this header.
#include <cstdint>
// NOTE: <cstdio> must be included unconditionally: ReleaseBuildAssertion
// below calls fprintf even when GEMMLOWP_PROFILING is not defined.
// (It was previously only included inside the GEMMLOWP_PROFILING block,
// so non-profiling builds relied on a transitive declaration of fprintf.)
#include <cstdio>
#include <cstdlib>

#ifdef GEMMLOWP_PROFILING
#include <cstring>
#include <set>
#endif

// We should always use C++11 thread_local; unfortunately that
// isn't fully supported on Apple yet.
#ifdef __APPLE__
#define GEMMLOWP_THREAD_LOCAL static __thread
#else
#define GEMMLOWP_THREAD_LOCAL thread_local
#endif

namespace gemmlowp {

// An assertion that stays active in release builds: prints `msg` to stderr
// and aborts the process when `condition` is false.
inline void ReleaseBuildAssertion(bool condition, const char* msg) {
  if (!condition) {
    fprintf(stderr, "gemmlowp error: %s\n", msg);
    abort();
  }
}

// To be used as template parameter for GlobalLock.
// GlobalLock<ProfilerLockId> is the profiler global lock:
// registering threads, starting profiling, finishing profiling, and
// the profiler itself as it samples threads, all need to lock it.
struct ProfilerLockId;

// A very plain global lock. Templated in LockId so we can have multiple
// locks, one for each LockId type.
template <typename LockId>
class GlobalLock {
  // The single process-wide mutex for this LockId. A function-local static
  // gives us one lazily-initialized instance per template instantiation.
  static pthread_mutex_t* Mutex() {
    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    return &m;
  }

 public:
  static void Lock() { pthread_mutex_lock(Mutex()); }
  static void Unlock() { pthread_mutex_unlock(Mutex()); }
};

// A very simple RAII helper to lock and unlock a GlobalLock
template <typename LockId>
struct AutoGlobalLock {
  AutoGlobalLock() { GlobalLock<LockId>::Lock(); }
  ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); }
};

// MemoryBarrier is purely a compile-time thing; it tells two things
// to the compiler:
//   1) It prevents reordering code across it
//     (thanks to the 'volatile' after 'asm')
//   2) It requires the compiler to assume that any value previously
//     read from memory, may have changed. Thus it offers an alternative
//     to using 'volatile' variables.
inline void MemoryBarrier() { asm volatile("" ::: "memory"); }

// Profiling definitions. Two paths: when profiling is enabled,
// and when profiling is disabled.
#ifdef GEMMLOWP_PROFILING
// This code path is when profiling is enabled.

// A pseudo-call-stack. Contrary to a real call-stack, this only
// contains pointers to literal strings that were manually entered
// in the instrumented code (see ScopedProfilingLabel).
struct ProfilingStack {
  static const std::size_t kMaxSize = 15;
  typedef const char* LabelsArrayType[kMaxSize];
  LabelsArrayType labels;
  std::size_t size;

  // Zero-initializes the whole struct. ProfilingStack is deliberately a
  // trivially-copyable POD (see operator= / operator== below) so that the
  // profiler thread can snapshot it cheaply.
  ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); }

  // Pushes `label` (must be a literal string) onto the pseudo-stack.
  // The barriers ensure the label is stored before the size is bumped,
  // so a concurrent reader never sees a size covering an unset slot.
  void Push(const char* label) {
    MemoryBarrier();
    ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow");
    labels[size] = label;
    MemoryBarrier();
    size++;
    MemoryBarrier();
  }

  // Pops the top label. Note: only the size is decremented; the popped
  // label pointer is intentionally left in place (see the concurrency
  // discussion in profiler.h, RecordStack).
  void Pop() {
    MemoryBarrier();
    ReleaseBuildAssertion(size > 0, "ProfilingStack underflow");
    size--;
    MemoryBarrier();
  }

  // Replaces the top label without changing the stack depth.
  void UpdateTop(const char* new_label) {
    MemoryBarrier();
    assert(size);
    labels[size - 1] = new_label;
    MemoryBarrier();
  }

  ProfilingStack& operator=(const ProfilingStack& other) {
    memcpy(this, &other, sizeof(ProfilingStack));
    return *this;
  }

  bool operator==(const ProfilingStack& other) const {
    return !memcmp(this, &other, sizeof(ProfilingStack));
  }
};

static_assert(
    !(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)),
    "ProfilingStack should have power-of-two size to fit in cache lines");

struct ThreadInfo;

// The global set of threads being profiled.
inline std::set<ThreadInfo*>& ThreadsUnderProfiling() {
  static std::set<ThreadInfo*> v;
  return v;
}

// Per-thread profiling state: the thread's pseudo-stack, plus a pthread
// key used solely to get a destructor callback at thread exit so the
// thread can unregister itself.
struct ThreadInfo {
  pthread_key_t key;  // used only to get a callback at thread exit.
  ProfilingStack stack;

  ThreadInfo() {
    pthread_key_create(&key, ThreadExitCallback);
    pthread_setspecific(key, this);
  }

  // Invoked by pthreads when the owning thread exits; removes the thread
  // from the set of profiled threads under the profiler global lock.
  static void ThreadExitCallback(void* ptr) {
    AutoGlobalLock<ProfilerLockId> lock;
    ThreadInfo* self = static_cast<ThreadInfo*>(ptr);
    ThreadsUnderProfiling().erase(self);
    pthread_key_delete(self->key);
  }
};

// Returns the calling thread's ThreadInfo singleton.
inline ThreadInfo& ThreadLocalThreadInfo() {
  GEMMLOWP_THREAD_LOCAL ThreadInfo i;
  return i;
}

// ScopedProfilingLabel is how one instruments code for profiling
// with this profiler. Construct local ScopedProfilingLabel variables,
// passing a literal string describing the local code. Profile
// samples will then be annotated with this label, while it is in scope
// (whence the name --- also known as RAII).
// See the example in profiler.h.
class ScopedProfilingLabel {
  ProfilingStack* profiling_stack_;

 public:
  explicit ScopedProfilingLabel(const char* label)
      : profiling_stack_(&ThreadLocalThreadInfo().stack) {
    profiling_stack_->Push(label);
  }

  ~ScopedProfilingLabel() { profiling_stack_->Pop(); }

  // Replaces this scope's label, e.g. to annotate successive phases
  // of one function without opening nested scopes.
  void Update(const char* new_label) { profiling_stack_->UpdateTop(new_label); }
};

// To be called once on each thread to be profiled.
inline void RegisterCurrentThreadForProfiling() {
  AutoGlobalLock<ProfilerLockId> lock;
  ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo());
}

#else  // not GEMMLOWP_PROFILING
// This code path is when profiling is disabled.

// This empty definition of ScopedProfilingLabel ensures that
// it has zero runtime overhead when profiling is disabled.
struct ScopedProfilingLabel {
  explicit ScopedProfilingLabel(const char*) {}
  void Update(const char*) {}
};

inline void RegisterCurrentThreadForProfiling() {}

#endif

}  // end namespace gemmlowp

#endif  // GEMMLOWP_PROFILING_INSTRUMENTATION_H_

// ---------------------------------------------------------------------------
// profiling/profiler.h (new file in this commit)
// ---------------------------------------------------------------------------
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. + +// profiler.h: a simple sampling profiler that's always just one #include away! +// +// Overview +// ======== +// +// This profiler only samples a pseudo-stack, not the actual call stack. +// The code to be profiled needs to be instrumented with +// pseudo-stack "labels", see ScopedProfilingLabel. +// Using pseudo-stacks allows this profiler to be very simple, low-overhead, +// portable, and independent of compilation details such as function inlining +// and frame pointers. The granularity of instrumentation can be freely chosen, +// and it is possible to get some annotate-like detail, i.e. detail within one +// function without splitting it into multiple functions. +// +// This profiler should remain small and simple; its key feature is to fit in +// a single header file so that there should never be a reason to refrain +// from profiling. More complex and feature-rich alternatives are +// readily available. This one offers a strict superset of its +// functionality: https://github.com/bgirard/GeckoProfiler, including +// intertwining pseudostacks with real call stacks, more annotation options, +// and advanced visualization. +// +// Usage +// ===== +// +// 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is +// not enabled, profiling instrumentation from common.h +// (ScopedProfilingLabel, RegisterCurrentThreadForProfiling) +// is still defined but does nothing. On the other hand, +// when profiling is not enabled, it is an error to #include the +// present file. +// +// 1. Each thread can opt in to profiling by calling +// RegisterCurrentThreadForProfiling() defined in common.h. +// This can be done at any time, before or during profiling. +// No sample will be collected from a thread until +// it has called RegisterCurrentThreadForProfiling(). +// +// 2. 
Instrument your code to be profiled with ScopedProfilingLabel, +// which is a RAII helper defined in common.h. The identifier +// names (some_label, etc) do not matter; what will show up +// in the profile is the string passed to the constructor, which +// must be a literal string. See the full example below. +// +// Note: the overhead of ScopedProfilingLabel is zero when not +// enabling profiling (when not defining GEMMLOWP_PROFILING). +// +// 3. Use the profiler.h interface to control profiling. There are two +// functions: StartProfiling() and FinishProfiling(). They must be +// called on the same thread. FinishProfiling() prints the profile +// on stdout. +// +// Full example +// ============ +/* + #define GEMMLOWP_PROFILING + #include "profiling/instrumentation.h" + using namespace gemmlowp; + + const int iters = 100000000; + volatile int i; + + void Bar() { + ScopedProfilingLabel label("Bar"); + for (i = 0; i < iters; i++) {} + } + + void Foo() { + ScopedProfilingLabel label("Foo"); + for (i = 0; i < iters; i++) {} + Bar(); + } + + void Init() { + RegisterCurrentThreadForProfiling(); + } + + #include "profiling/profiler.h" + + int main() { + Init(); + StartProfiling(); + Foo(); + FinishProfiling(); + } +* +* Output: +* + gemmlowp profile (1 threads, 304 samples) + 100.00% Foo + 51.32% other + 48.68% Bar + 0.00% other (outside of any label) +*/ +// +// Interpreting results +// ==================== +// +// Each node shows the absolute percentage, among all the samples, +// of the number of samples that recorded the given pseudo-stack. +// The percentages are *NOT* relative to the parent node. In addition +// to your own labels, you will also see 'other' nodes that collect +// the remainder of samples under the parent node that didn't fall into +// any of the labelled child nodes. 
Example: +// +// 20% Foo +// 12% Bar +// 6% Xyz +// 2% other +// +// This means that 20% of all labels were under Foo, of which 12%/20%==60% +// were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not +// under either Bar or Xyz. +// +// Typically, one wants to keep adding ScopedProfilingLabel's until +// the 'other' nodes show low percentages. +// +// Interpreting results with multiple threads +// ========================================== +// +// At each sample, each thread registered for profiling gets sampled once. +// So if there is one "main thread" spending its time in MainFunc() and +// 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the +// samples will be in WorkerFunc, so the profile will look like this: +// +// 80% WorkerFunc +// 20% MainFunc + +#ifndef GEMMLOWP_PROFILING_PROFILER_H_ +#define GEMMLOWP_PROFILING_PROFILER_H_ + +#ifndef GEMMLOWP_PROFILING +#error Profiling is not enabled! +#endif + +#include <vector> + +#include "profiling/instrumentation.h" + +namespace gemmlowp { + +// A tree view of a profile. 
+class ProfileTreeView { + struct Node { + std::vector<Node*> children; + const char* label; + std::size_t weight; + Node() : label(nullptr), weight(0) {} + ~Node() { + for (auto child : children) { + delete child; + } + } + }; + + static bool CompareNodes(Node* n1, Node* n2) { + return n1->weight > n2->weight; + } + + Node root_; + + void PrintNode(const Node* node, int level) const { + if (level) { + for (int i = 1; i < level; i++) { + printf(" "); + } + printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label); + } + for (auto child : node->children) { + PrintNode(child, level + 1); + } + } + + static void AddStackToNode(const ProfilingStack& stack, Node* node, + std::size_t level) { + node->weight++; + if (stack.size == level) { + return; + } + Node* child_to_add_to = nullptr; + for (auto child : node->children) { + if (child->label == stack.labels[level]) { + child_to_add_to = child; + break; + } + } + if (!child_to_add_to) { + child_to_add_to = new Node; + child_to_add_to->label = stack.labels[level]; + node->children.push_back(child_to_add_to); + } + AddStackToNode(stack, child_to_add_to, level + 1); + return; + } + + void AddStack(const ProfilingStack& stack) { + AddStackToNode(stack, &root_, 0); + } + + void AddOtherChildrenToNode(Node* node) { + std::size_t top_level_children_weight = 0; + for (auto c : node->children) { + AddOtherChildrenToNode(c); + top_level_children_weight += c->weight; + } + if (top_level_children_weight) { + Node* other_child = new Node; + other_child->label = + node == &root_ ? 
"other (outside of any label)" : "other"; + other_child->weight = node->weight - top_level_children_weight; + node->children.push_back(other_child); + } + } + + void AddOtherNodes() { AddOtherChildrenToNode(&root_); } + + void SortNode(Node* node) { + std::sort(node->children.begin(), node->children.end(), CompareNodes); + for (auto child : node->children) { + SortNode(child); + } + } + + void Sort() { SortNode(&root_); } + + public: + explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) { + for (auto stack : stacks) { + AddStack(stack); + } + AddOtherNodes(); + Sort(); + } + + void Print() const { + printf("\n"); + printf("gemmlowp profile (%d threads, %d samples)\n", + static_cast<int>(ThreadsUnderProfiling().size()), + static_cast<int>(root_.weight)); + PrintNode(&root_, 0); + printf("\n"); + } +}; + +// This function is the only place that determines our sampling frequency. +inline void WaitOneProfilerTick() { + static const int millisecond = 1000000; + +#if defined __arm__ || defined __aarch64__ + // Reduced sampling frequency on mobile devices helps limit time and memory + // overhead there. + static const int interval = 10 * millisecond; +#else + static const int interval = 1 * millisecond; +#endif + + timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = interval; + nanosleep(&ts, nullptr); +} + +// This is how we track whether we've already started profiling, +// to guard against misuse of the API. +inline bool& IsProfiling() { + static bool b; + return b; +} + +// This is how we tell the profiler thread to finish. +inline bool& ProfilerThreadShouldFinish() { + static bool b; + return b; +} + +// The profiler thread. See ProfilerThreadFunc. +inline pthread_t& ProfilerThread() { + static pthread_t t; + return t; +} + +// Records a stack from a running thread. +// The tricky part is that we're not interrupting the thread. 
+// This is OK because we're looking at a pseudo-stack of labels, +// not at the real thread stack, and if the pseudo-stack changes +// while we're recording it, we are OK with getting either the +// old or the new stack. Note that ProfilingStack::Pop +// only decrements the size, and doesn't null the popped label, +// so if we're concurrently recording it, it shouldn't change +// under our feet until another label is pushed, at which point +// we are OK with getting either this new label or the old one. +// In the end, the key atomicity property that we are relying on +// here is that pointers are changed atomically, and the labels +// are pointers (to literal strings). +inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) { + assert(!dst->size); + while (dst->size < thread->stack.size) { + dst->labels[dst->size] = thread->stack.labels[dst->size]; + dst->size++; + MemoryBarrier(); // thread->stack can change at any time + } +} + +// The profiler thread's entry point. +// Note that a separate thread is to be started each time we call +// StartProfiling(), and finishes when we call FinishProfiling(). +// So here we only need to handle the recording and reporting of +// a single profile. +inline void* ProfilerThreadFunc(void*) { + assert(ProfilerThread() == pthread_self()); + + // Since we only handle one profile per profiler thread, the + // profile data (the array of recorded stacks) can be a local variable here. + std::vector<ProfilingStack> stacks; + + while (!ProfilerThreadShouldFinish()) { + WaitOneProfilerTick(); + { + AutoGlobalLock<ProfilerLockId> lock; + for (auto t : ThreadsUnderProfiling()) { + ProfilingStack s; + RecordStack(t, &s); + stacks.push_back(s); + } + } + } + + // Profiling is finished and we now report the results. + ProfileTreeView(stacks).Print(); + + return nullptr; +} + +// Starts recording samples. 
+inline void StartProfiling() { + AutoGlobalLock<ProfilerLockId> lock; + ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!"); + IsProfiling() = true; + ProfilerThreadShouldFinish() = false; + pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr); +} + +// Stops recording samples, and prints a profile tree-view on stdout. +inline void FinishProfiling() { + { + AutoGlobalLock<ProfilerLockId> lock; + ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!"); + // The ProfilerThreadShouldFinish() mechanism here is really naive and bad, + // as the scary comments below should make clear. + // Should we use a condition variable? + ProfilerThreadShouldFinish() = true; + } // must release the lock here to avoid deadlock with profiler thread. + pthread_join(ProfilerThread(), nullptr); + IsProfiling() = false; // yikes, this should be guarded by the lock! +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_PROFILING_PROFILER_H_ |