diff options
author | Miao Wang <miaowang@google.com> | 2017-12-12 14:22:24 -0800 |
---|---|---|
committer | Miao Wang <miaowang@google.com> | 2017-12-12 16:14:38 -0800 |
commit | 1963df9ac4a0424674e72ef5da522b5d830605fd (patch) | |
tree | efd8fbbe69f13c4057f2cc5a5b1f7852fd57a2ab /profiling | |
parent | cbcfdf963151219ca77f54657defabde8d845bac (diff) | |
download | gemmlowp-1963df9ac4a0424674e72ef5da522b5d830605fd.tar.gz |
Rebase gemmlowp to 6a2a908temp_72223856
Bug: 70573221
Test: mm
Test: mm and Pixel2 boot
Test: NeuralNetworksTest pass
Change-Id: I8fac98811e9a276d3ff8054167dc45225c04147e
Diffstat (limited to 'profiling')
-rw-r--r-- | profiling/instrumentation.h | 115 | ||||
-rw-r--r-- | profiling/profiler.h | 12 | ||||
-rw-r--r-- | profiling/pthread_everywhere.h | 88 |
3 files changed, 149 insertions, 66 deletions
diff --git a/profiling/instrumentation.h b/profiling/instrumentation.h index 51b6525..539076a 100644 --- a/profiling/instrumentation.h +++ b/profiling/instrumentation.h @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -52,15 +52,6 @@ using ::uintptr_t; #include <set> #endif -// We should always use C++11 thread_local; unfortunately that -// isn't fully supported on Apple yet. -#ifdef __APPLE__ -#define GEMMLOWP_THREAD_LOCAL static __thread -#define GEMMLOWP_USING_OLD_THREAD_LOCAL -#else -#define GEMMLOWP_THREAD_LOCAL thread_local -#endif - namespace gemmlowp { inline void ReleaseBuildAssertion(bool condition, const char* msg) { @@ -70,41 +61,42 @@ inline void ReleaseBuildAssertion(bool condition, const char* msg) { } } -// To be used as template parameter for GlobalLock. -// GlobalLock<ProfilerLockId> is the profiler global lock: -// registering threads, starting profiling, finishing profiling, and -// the profiler itself as it samples threads, all need to lock it. -struct ProfilerLockId; - -// A very plain global lock. Templated in LockId so we can have multiple -// locks, one for each LockId type. 
-template <typename LockId> -class GlobalLock { - static pthread_mutex_t* Mutex() { - static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; +class Mutex { + public: + Mutex(const Mutex&) = delete; + Mutex& operator=(const Mutex&) = delete; + + Mutex() { pthread_mutex_init(&m, NULL); } + ~Mutex() { pthread_mutex_destroy(&m); } + + void Lock() { pthread_mutex_lock(&m); } + void Unlock() { pthread_mutex_unlock(&m); } + + private: + pthread_mutex_t m; +}; + +class GlobalMutexes { + public: + static Mutex* Profiler() { + static Mutex m; return &m; } - public: - static void Lock() { pthread_mutex_lock(Mutex()); } - static void Unlock() { pthread_mutex_unlock(Mutex()); } + static Mutex* EightBitIntGemm() { + static Mutex m; + return &m; + } }; -// A very simple RAII helper to lock and unlock a GlobalLock -template <typename LockId> -struct AutoGlobalLock { - AutoGlobalLock() { GlobalLock<LockId>::Lock(); } - ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); } -}; +// A very simple RAII helper to lock and unlock a Mutex +struct ScopedLock { + ScopedLock(Mutex* m) : _m(m) { _m->Lock(); } + ~ScopedLock() { _m->Unlock(); } -// MemoryBarrier is purely a compile-time thing; it tells two things -// to the compiler: -// 1) It prevents reordering code across it -// (thanks to the 'volatile' after 'asm') -// 2) It requires the compiler to assume that any value previously -// read from memory, may have changed. Thus it offers an alternative -// to using 'volatile' variables. -inline void MemoryBarrier() { asm volatile("" ::: "memory"); } + private: + Mutex* _m; +}; // Profiling definitions. Two paths: when profiling is enabled, // and when profiling is disabled. @@ -115,34 +107,31 @@ inline void MemoryBarrier() { asm volatile("" ::: "memory"); } // contains pointers to literal strings that were manually entered // in the instrumented code (see ScopedProfilingLabel). 
struct ProfilingStack { - static const std::size_t kMaxSize = 15; + static const std::size_t kMaxSize = 14; typedef const char* LabelsArrayType[kMaxSize]; LabelsArrayType labels; std::size_t size; + Mutex* lock; ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); } void Push(const char* label) { - MemoryBarrier(); + ScopedLock sl(lock); ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow"); labels[size] = label; - MemoryBarrier(); size++; - MemoryBarrier(); } void Pop() { - MemoryBarrier(); + ScopedLock sl(lock); ReleaseBuildAssertion(size > 0, "ProfilingStack underflow"); size--; - MemoryBarrier(); } void UpdateTop(const char* new_label) { - MemoryBarrier(); + ScopedLock sl(lock); assert(size); labels[size - 1] = new_label; - MemoryBarrier(); } ProfilingStack& operator=(const ProfilingStack& other) { @@ -174,29 +163,35 @@ struct ThreadInfo { ThreadInfo() { pthread_key_create(&key, ThreadExitCallback); pthread_setspecific(key, this); + stack.lock = new Mutex(); } static void ThreadExitCallback(void* ptr) { - AutoGlobalLock<ProfilerLockId> lock; + ScopedLock sl(GlobalMutexes::Profiler()); ThreadInfo* self = static_cast<ThreadInfo*>(ptr); ThreadsUnderProfiling().erase(self); pthread_key_delete(self->key); + delete self->stack.lock; } }; inline ThreadInfo& ThreadLocalThreadInfo() { -#ifdef GEMMLOWP_USING_OLD_THREAD_LOCAL - // We're leaking this ThreadInfo structure, because Apple doesn't support - // non-trivial constructors or destructors for their __thread type modifier. 
- GEMMLOWP_THREAD_LOCAL ThreadInfo* i = nullptr; - if (i == nullptr) { - i = new ThreadInfo(); + static pthread_key_t key; + static auto DeleteThreadInfo = [](void* threadInfoPtr) { + ThreadInfo* threadInfo = static_cast<ThreadInfo*>(threadInfoPtr); + if (threadInfo) { + delete threadInfo; + } + }; + + static int key_result = pthread_key_create(&key, DeleteThreadInfo); + + ThreadInfo* threadInfo = static_cast<ThreadInfo*>(pthread_getspecific(key)); + if (!threadInfo) { + threadInfo = new ThreadInfo(); + pthread_setspecific(key, threadInfo); } - return *i; -#else - GEMMLOWP_THREAD_LOCAL ThreadInfo i; - return i; -#endif + return *threadInfo; } // ScopedProfilingLabel is how one instruments code for profiling @@ -221,7 +216,7 @@ class ScopedProfilingLabel { // To be called once on each thread to be profiled. inline void RegisterCurrentThreadForProfiling() { - AutoGlobalLock<ProfilerLockId> lock; + ScopedLock sl(GlobalMutexes::Profiler()); ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo()); } diff --git a/profiling/profiler.h b/profiling/profiler.h index a18c036..018da57 100644 --- a/profiling/profiler.h +++ b/profiling/profiler.h @@ -1,4 +1,4 @@ -// Copyright 2015 Google Inc. All Rights Reserved. +// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -306,12 +306,12 @@ inline pthread_t& ProfilerThread() { // In the end, the key atomicity property that we are relying on // here is that pointers are changed atomically, and the labels // are pointers (to literal strings). 
-inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) { +inline void RecordStack(ThreadInfo* thread, ProfilingStack* dst) { + ScopedLock sl(thread->stack.lock); assert(!dst->size); while (dst->size < thread->stack.size) { dst->labels[dst->size] = thread->stack.labels[dst->size]; dst->size++; - MemoryBarrier(); // thread->stack can change at any time } } @@ -330,7 +330,7 @@ inline void* ProfilerThreadFunc(void*) { while (!ProfilerThreadShouldFinish()) { WaitOneProfilerTick(); { - AutoGlobalLock<ProfilerLockId> lock; + ScopedLock sl(GlobalMutexes::Profiler()); for (auto t : ThreadsUnderProfiling()) { ProfilingStack s; RecordStack(t, &s); @@ -347,7 +347,7 @@ inline void* ProfilerThreadFunc(void*) { // Starts recording samples. inline void StartProfiling() { - AutoGlobalLock<ProfilerLockId> lock; + ScopedLock sl(GlobalMutexes::Profiler()); ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!"); IsProfiling() = true; ProfilerThreadShouldFinish() = false; @@ -357,7 +357,7 @@ inline void StartProfiling() { // Stops recording samples, and prints a profile tree-view on stdout. inline void FinishProfiling() { { - AutoGlobalLock<ProfilerLockId> lock; + ScopedLock sl(GlobalMutexes::Profiler()); ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!"); // The ProfilerThreadShouldFinish() mechanism here is really naive and bad, // as the scary comments below should make clear. diff --git a/profiling/pthread_everywhere.h b/profiling/pthread_everywhere.h new file mode 100644 index 0000000..7e12d66 --- /dev/null +++ b/profiling/pthread_everywhere.h @@ -0,0 +1,88 @@ +// Copyright 2017 The Gemmlowp Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// pthread_everywhere.h: Either includes <pthread.h> or implements a +// subset of pthread functionality on top of C++11 <thread> for portability. + +#ifndef GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ +#define GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ + +#include "pthread_everywhere.h" + +#ifndef _WIN32 +#define GEMMLOWP_USE_PTHREAD +#endif + +#if defined GEMMLOWP_USE_PTHREAD +#include <pthread.h> +#else +// Implement a small subset of pthread on top of C++11 threads. +// The function signatures differ from true pthread functions in two ways: +// - True pthread functions return int error codes, ours return void. +// Rationale: the c++11 <thread> equivalent functions return void +// and use exceptions to report errors; we don't want to deal with +// exceptions in this code, so we couldn't meaningfully return errors +// in the polyfill. Also, the gemmlowp code using these pthread functions +// never checks their return values anyway. +// - True pthread *_create/*_init functions take pointers to 'attribute' +// structs; ours take nullptr_t. That is because gemmlowp always passes +// nullptr at the moment, so any support we would code for non-null +// attribs would be unused. 
+#include <thread> +#include <mutex> +#include <condition_variable> +#include <cstddef> +namespace gemmlowp { +using pthread_t = std::thread*; +using pthread_mutex_t = std::mutex*; +using pthread_cond_t = std::condition_variable*; +inline void pthread_create(pthread_t* thread, std::nullptr_t, + void *(*start_routine) (void *), void *arg) { + *thread = new std::thread(start_routine, arg); +} +inline void pthread_join(pthread_t thread, std::nullptr_t) { + thread->join(); +} +inline void pthread_mutex_init(pthread_mutex_t *mutex, std::nullptr_t) { + *mutex = new std::mutex; +} +inline void pthread_mutex_lock(pthread_mutex_t* mutex) { + (*mutex)->lock(); +} +inline void pthread_mutex_unlock(pthread_mutex_t* mutex) { + (*mutex)->unlock(); +} +inline void pthread_mutex_destroy(pthread_mutex_t *mutex) { + delete *mutex; +} +inline void pthread_cond_init(pthread_cond_t *cond, std::nullptr_t) { + *cond = new std::condition_variable; +} +inline void pthread_cond_signal(pthread_cond_t* cond) { + (*cond)->notify_one(); +} +inline void pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { + std::unique_lock<std::mutex> lock(**mutex, std::adopt_lock); + (*cond)->wait(lock); + // detach lock from mutex so when we leave this context + // the lock is not released + lock.release(); +} +inline void pthread_cond_destroy(pthread_cond_t *cond) { + delete *cond; +} +} // end namespace gemmlowp +#endif + +#endif // GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_
\ No newline at end of file |