diff options
Diffstat (limited to 'profiling/profiler.h')
-rw-r--r-- | profiling/profiler.h | 373 |
1 files changed, 373 insertions, 0 deletions
diff --git a/profiling/profiler.h b/profiling/profiler.h new file mode 100644 index 0000000..9ea7a9f --- /dev/null +++ b/profiling/profiler.h @@ -0,0 +1,373 @@ +// Copyright 2014 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// profiler.h: a simple sampling profiler that's always just one #include away! +// +// Overview +// ======== +// +// This profiler only samples a pseudo-stack, not the actual call stack. +// The code to be profiled needs to be instrumented with +// pseudo-stack "labels", see ScopedProfilingLabel. +// Using pseudo-stacks allows this profiler to be very simple, low-overhead, +// portable, and independent of compilation details such as function inlining +// and frame pointers. The granularity of instrumentation can be freely chosen, +// and it is possible to get some annotate-like detail, i.e. detail within one +// function without splitting it into multiple functions. +// +// This profiler should remain small and simple; its key feature is to fit in +// a single header file so that there should never be a reason to refrain +// from profiling. More complex and feature-rich alternatives are +// readily available. This one offers a strict superset of its +// functionality: https://github.com/bgirard/GeckoProfiler, including +// intertwining pseudostacks with real call stacks, more annotation options, +// and advanced visualization. +// +// Usage +// ===== +// +// 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is +// not enabled, profiling instrumentation from common.h +// (ScopedProfilingLabel, RegisterCurrentThreadForProfiling) +// is still defined but does nothing. On the other hand, +// when profiling is not enabled, it is an error to #include the +// present file. +// +// 1. Each thread can opt in to profiling by calling +// RegisterCurrentThreadForProfiling() defined in common.h. +// This can be done at any time, before or during profiling. +// No sample will be collected from a thread until +// it has called RegisterCurrentThreadForProfiling(). +// +// 2. Instrument your code to be profiled with ScopedProfilingLabel, +// which is a RAII helper defined in common.h. The identifier +// names (some_label, etc) do not matter; what will show up +// in the profile is the string passed to the constructor, which +// must be a literal string. See the full example below. +// +// Note: the overhead of ScopedProfilingLabel is zero when not +// enabling profiling (when not defining GEMMLOWP_PROFILING). +// +// 3. Use the profiler.h interface to control profiling. There are two +// functions: StartProfiling() and FinishProfiling(). They must be +// called on the same thread. FinishProfiling() prints the profile +// on stdout. +// +// Full example +// ============ +/* + #define GEMMLOWP_PROFILING + #include "profiling/instrumentation.h" + using namespace gemmlowp; + + const int iters = 100000000; + volatile int i; + + void Bar() { + ScopedProfilingLabel label("Bar"); + for (i = 0; i < iters; i++) {} + } + + void Foo() { + ScopedProfilingLabel label("Foo"); + for (i = 0; i < iters; i++) {} + Bar(); + } + + void Init() { + RegisterCurrentThreadForProfiling(); + } + + #include "profiling/profiler.h" + + int main() { + Init(); + StartProfiling(); + Foo(); + FinishProfiling(); + } +* +* Output: +* + gemmlowp profile (1 threads, 304 samples) + 100.00% Foo + 51.32% other + 48.68% Bar + 0.00% other (outside of any label) +*/ +// +// Interpreting results +// ==================== +// +// Each node shows the absolute percentage, among all the samples, +// of the number of samples that recorded the given pseudo-stack. +// The percentages are *NOT* relative to the parent node. In addition +// to your own labels, you will also see 'other' nodes that collect +// the remainder of samples under the parent node that didn't fall into +// any of the labelled child nodes. Example: +// +// 20% Foo +// 12% Bar +// 6% Xyz +// 2% other +// +// This means that 20% of all labels were under Foo, of which 12%/20%==60% +// were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not +// under either Bar or Xyz. +// +// Typically, one wants to keep adding ScopedProfilingLabel's until +// the 'other' nodes show low percentages. +// +// Interpreting results with multiple threads +// ========================================== +// +// At each sample, each thread registered for profiling gets sampled once. +// So if there is one "main thread" spending its time in MainFunc() and +// 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the +// samples will be in WorkerFunc, so the profile will look like this: +// +// 80% WorkerFunc +// 20% MainFunc + +#ifndef GEMMLOWP_PROFILING_PROFILER_H_ +#define GEMMLOWP_PROFILING_PROFILER_H_ + +#ifndef GEMMLOWP_PROFILING +#error Profiling is not enabled! +#endif + +#include <vector> + +#include "profiling/instrumentation.h" + +namespace gemmlowp { + +// A tree view of a profile. +class ProfileTreeView { + struct Node { + std::vector<Node*> children; + const char* label; + std::size_t weight; + Node() : label(nullptr), weight(0) {} + ~Node() { + for (auto child : children) { + delete child; + } + } + }; + + static bool CompareNodes(Node* n1, Node* n2) { + return n1->weight > n2->weight; + } + + Node root_; + + void PrintNode(const Node* node, int level) const { + if (level) { + for (int i = 1; i < level; i++) { + printf(" "); + } + printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label); + } + for (auto child : node->children) { + PrintNode(child, level + 1); + } + } + + static void AddStackToNode(const ProfilingStack& stack, Node* node, + std::size_t level) { + node->weight++; + if (stack.size == level) { + return; + } + Node* child_to_add_to = nullptr; + for (auto child : node->children) { + if (child->label == stack.labels[level]) { + child_to_add_to = child; + break; + } + } + if (!child_to_add_to) { + child_to_add_to = new Node; + child_to_add_to->label = stack.labels[level]; + node->children.push_back(child_to_add_to); + } + AddStackToNode(stack, child_to_add_to, level + 1); + return; + } + + void AddStack(const ProfilingStack& stack) { + AddStackToNode(stack, &root_, 0); + } + + void AddOtherChildrenToNode(Node* node) { + std::size_t top_level_children_weight = 0; + for (auto c : node->children) { + AddOtherChildrenToNode(c); + top_level_children_weight += c->weight; + } + if (top_level_children_weight) { + Node* other_child = new Node; + other_child->label = + node == &root_ ? "other (outside of any label)" : "other"; + other_child->weight = node->weight - top_level_children_weight; + node->children.push_back(other_child); + } + } + + void AddOtherNodes() { AddOtherChildrenToNode(&root_); } + + void SortNode(Node* node) { + std::sort(node->children.begin(), node->children.end(), CompareNodes); + for (auto child : node->children) { + SortNode(child); + } + } + + void Sort() { SortNode(&root_); } + + public: + explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) { + for (auto stack : stacks) { + AddStack(stack); + } + AddOtherNodes(); + Sort(); + } + + void Print() const { + printf("\n"); + printf("gemmlowp profile (%d threads, %d samples)\n", + static_cast<int>(ThreadsUnderProfiling().size()), + static_cast<int>(root_.weight)); + PrintNode(&root_, 0); + printf("\n"); + } +}; + +// This function is the only place that determines our sampling frequency. +inline void WaitOneProfilerTick() { + static const int millisecond = 1000000; + +#if defined __arm__ || defined __aarch64__ + // Reduced sampling frequency on mobile devices helps limit time and memory + // overhead there. + static const int interval = 10 * millisecond; +#else + static const int interval = 1 * millisecond; +#endif + + timespec ts; + ts.tv_sec = 0; + ts.tv_nsec = interval; + nanosleep(&ts, nullptr); +} + +// This is how we track whether we've already started profiling, +// to guard against misuse of the API. +inline bool& IsProfiling() { + static bool b; + return b; +} + +// This is how we tell the profiler thread to finish. +inline bool& ProfilerThreadShouldFinish() { + static bool b; + return b; +} + +// The profiler thread. See ProfilerThreadFunc. +inline pthread_t& ProfilerThread() { + static pthread_t t; + return t; +} + +// Records a stack from a running thread. +// The tricky part is that we're not interrupting the thread. +// This is OK because we're looking at a pseudo-stack of labels, +// not at the real thread stack, and if the pseudo-stack changes +// while we're recording it, we are OK with getting either the +// old or the new stack. Note that ProfilingStack::Pop +// only decrements the size, and doesn't null the popped label, +// so if we're concurrently recording it, it shouldn't change +// under our feet until another label is pushed, at which point +// we are OK with getting either this new label or the old one. +// In the end, the key atomicity property that we are relying on +// here is that pointers are changed atomically, and the labels +// are pointers (to literal strings). +inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) { + assert(!dst->size); + while (dst->size < thread->stack.size) { + dst->labels[dst->size] = thread->stack.labels[dst->size]; + dst->size++; + MemoryBarrier(); // thread->stack can change at any time + } +} + +// The profiler thread's entry point. +// Note that a separate thread is to be started each time we call +// StartProfiling(), and finishes when we call FinishProfiling(). +// So here we only need to handle the recording and reporting of +// a single profile. +inline void* ProfilerThreadFunc(void*) { + assert(ProfilerThread() == pthread_self()); + + // Since we only handle one profile per profiler thread, the + // profile data (the array of recorded stacks) can be a local variable here. + std::vector<ProfilingStack> stacks; + + while (!ProfilerThreadShouldFinish()) { + WaitOneProfilerTick(); + { + AutoGlobalLock<ProfilerLockId> lock; + for (auto t : ThreadsUnderProfiling()) { + ProfilingStack s; + RecordStack(t, &s); + stacks.push_back(s); + } + } + } + + // Profiling is finished and we now report the results. + ProfileTreeView(stacks).Print(); + + return nullptr; +} + +// Starts recording samples. +inline void StartProfiling() { + AutoGlobalLock<ProfilerLockId> lock; + ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!"); + IsProfiling() = true; + ProfilerThreadShouldFinish() = false; + pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr); +} + +// Stops recording samples, and prints a profile tree-view on stdout. +inline void FinishProfiling() { + { + AutoGlobalLock<ProfilerLockId> lock; + ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!"); + // The ProfilerThreadShouldFinish() mechanism here is really naive and bad, + // as the scary comments below should make clear. + // Should we use a condition variable? + ProfilerThreadShouldFinish() = true; + } // must release the lock here to avoid deadlock with profiler thread. + pthread_join(ProfilerThread(), nullptr); + IsProfiling() = false; // yikes, this should be guarded by the lock! +} + +} // namespace gemmlowp + +#endif // GEMMLOWP_PROFILING_PROFILER_H_ |