diff options
author     Mika Raento <mikie@google.com>    2018-05-03 14:43:24 +0100
committer  Mika Raento <mikie@google.com>    2018-05-11 19:16:30 +0100
commit     6a2a11dc1edaa61647c35650d7dbdd8893d6ce8b (patch)
tree       c2d531415db8d25c1da7573bad009472181cf6fe
parent     40ac90985a927ff77d234c4d4fc33ab0a9a85648 (diff)
download   ml-6a2a11dc1edaa61647c35650d7dbdd8893d6ce8b.tar.gz
Reduce OpenMP thread busywait time (200 to 1 ms)
Reduce the time OpenMP threads used by Eigen busywait for new work after
completing a piece. This is 200ms by default, which meant that we tried
to continuously run 8 threads for 200ms after completing operations
using Eigen - starving both our other work and the rest of the system.
Bug: 79159165
Test: mm
Test: cherry-picked to systrace branch and look at traces
Test: NeuralNetworksTest_static
Change-Id: I3f21aea6d96c785dd7c47493a739b76876420f7d
-rw-r--r--  nn/common/CpuExecutor.cpp               | 31
-rw-r--r--  nn/common/include/CpuExecutor.h         | 38
-rw-r--r--  nn/runtime/test/Android.bp              |  1
-rw-r--r--  nn/runtime/test/TestOpenmpSettings.cpp  | 90
4 files changed, 160 insertions, 0 deletions
diff --git a/nn/common/CpuExecutor.cpp b/nn/common/CpuExecutor.cpp index 283d30720..0c6219308 100644 --- a/nn/common/CpuExecutor.cpp +++ b/nn/common/CpuExecutor.cpp @@ -21,6 +21,8 @@ #include "NeuralNetworks.h" #include "Operations.h" +#include "Eigen/Core" +#include <omp.h> #include <sys/mman.h> namespace android { @@ -196,6 +198,8 @@ int CpuExecutor::run(const V1_1::Model& model, const Request& request, VLOG(CPUEXE) << "CpuExecutor::run() with request(" << SHOW_IF_DEBUG(toString(request)) << ")"; + ScopedOpenmpSettings openMpSettings; + mModel = &model; mRequest = &request; // TODO check if mRequest is needed initializeRunTimeInfo(modelPoolInfos, requestPoolInfos); @@ -1530,5 +1534,32 @@ int CpuExecutor::executeOperation(const Operation& operation) { return ANEURALNETWORKS_NO_ERROR; } +ScopedOpenmpSettings::ScopedOpenmpSettings() { + mBlocktimeInitial = kmp_get_blocktime(); + kmp_set_blocktime(1); // ms + +#if NNAPI_LIMIT_CPU_THREADS + // Code not yet enabled. Choosing the number of threads to be based on + // benchmarking. See longer comment by the class declaration. 
+ mMaxThreadsInitial = Eigen::nbThreads(); + const int nProcs = omp_get_num_procs(); + int threads = nProcs; + if (nProcs >= 8) { + threads = nProcs - 4; + } else if (nProcs >= 4) { + threads = nProcs - 2; + } + Eigen::setNbThreads(threads); +#endif +} + +ScopedOpenmpSettings::~ScopedOpenmpSettings() { + kmp_set_blocktime(mBlocktimeInitial); +#if NNAPI_LIMIT_CPU_THREADS + Eigen::setNbThreads(mMaxThreadsInitial); +#endif +} + + } // namespace nn } // namespace android diff --git a/nn/common/include/CpuExecutor.h b/nn/common/include/CpuExecutor.h index c7a318bc3..64a46b65f 100644 --- a/nn/common/include/CpuExecutor.h +++ b/nn/common/include/CpuExecutor.h @@ -22,6 +22,7 @@ #include "Utils.h" #include <algorithm> +#include <android-base/macros.h> #include <vector> namespace android { @@ -136,6 +137,43 @@ private: std::vector<RunTimeOperandInfo> mOperands; }; +// Class for setting reasonable OpenMP threading settings. (OpenMP is used by +// the Eigen matrix library.) +// +// Currently sets a low blocktime: the time OpenMP threads busy-wait for more +// work before going to sleep. See b/79159165, https://reviews.llvm.org/D18577. +// The default is 200ms, we set to 1ms here. This should allow for the threads +// to not sleep before the next operation, but release CPU to other work +// quickly. +// +// The OpenMP settings are thread-local (applying only to worker threads formed +// from that thread), see https://software.intel.com/en-us/node/522688 and +// http://lists.llvm.org/pipermail/openmp-dev/2016-July/001432.html. This class +// ensures that within the scope in which an object is instantiated we use the +// right settings (scopes may be nested), as long as no other library changes +// them. (Note that in current NNAPI usage only one instance is used in the +// CpuExecutor thread). +// +// TODO(mikie): consider also setting the number of threads used. 
Using as many +// threads as there are cores results in more variable performance: if we don't +// get all cores for our threads, the latency is doubled as we wait for one core +// to do twice the amount of work. Reality is complicated though as not all +// cores are the same. Decision to be based on benchmarking against a +// representative set of workloads and devices. I'm keeping the code here for +// reference. +class ScopedOpenmpSettings { +public: + ScopedOpenmpSettings(); + ~ScopedOpenmpSettings(); + DISALLOW_COPY_AND_ASSIGN(ScopedOpenmpSettings); +private: + int mBlocktimeInitial; +#if NNAPI_LIMIT_CPU_THREADS + int mMaxThreadsInitial; +#endif +}; + + namespace { template <typename T> diff --git a/nn/runtime/test/Android.bp b/nn/runtime/test/Android.bp index deb240b06..4dc875a55 100644 --- a/nn/runtime/test/Android.bp +++ b/nn/runtime/test/Android.bp @@ -77,6 +77,7 @@ cc_test { // not exported from libneuralnetworks.so). "TestExecution.cpp", "TestMemoryInternal.cpp", + "TestOpenmpSettings.cpp", "TestPartitioning.cpp", "TestPartitioningRandom.cpp", ], diff --git a/nn/runtime/test/TestOpenmpSettings.cpp b/nn/runtime/test/TestOpenmpSettings.cpp new file mode 100644 index 000000000..549473b46 --- /dev/null +++ b/nn/runtime/test/TestOpenmpSettings.cpp @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2018 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "CpuExecutor.h" + +#include <algorithm> +#include <gtest/gtest.h> +#include <memory> +#include <omp.h> +#include <random> +#include <thread> +#include <unistd.h> +#include <vector> + +namespace { + +class OpenmpSettingsTest : public ::testing::Test { +protected: + virtual void SetUp() override { + const int blocktimeInitial = kmp_get_blocktime(); + ASSERT_EQ(blocktimeInitial, kOpenmpDefaultBlockTime); + } + virtual void TearDown() override { + const int blocktimeRestored = kmp_get_blocktime(); + ASSERT_EQ(blocktimeRestored, kOpenmpDefaultBlockTime); + } + static const int kOpenmpDefaultBlockTime; +}; + +const int OpenmpSettingsTest::kOpenmpDefaultBlockTime = 200; + +using ::android::nn::ScopedOpenmpSettings; + +TEST_F(OpenmpSettingsTest, Test1) { + ScopedOpenmpSettings s; + const int blocktimeSet = kmp_get_blocktime(); + ASSERT_EQ(blocktimeSet, 1); +} + +TEST_F(OpenmpSettingsTest, Test2) { + ScopedOpenmpSettings s1; + const int blocktimeSet1 = kmp_get_blocktime(); + ASSERT_EQ(blocktimeSet1, 1); + + ScopedOpenmpSettings s2; + const int blocktimeSet2 = kmp_get_blocktime(); + ASSERT_EQ(blocktimeSet2, 1); +} + +TEST_F(OpenmpSettingsTest, TestThreaded) { + // Threaded test to validate that each thread gets its own settings. + std::vector<std::thread> threads; + std::mt19937 randGen; + std::uniform_int_distribution<> rand(1, 20); + for (int i = 0; i < 10; i++) { + const int sleepFor = rand(randGen); + threads.push_back(std::thread([sleepFor]() { + const int blocktimeSet1 = kmp_get_blocktime(); + ASSERT_EQ(blocktimeSet1, kOpenmpDefaultBlockTime); + + ScopedOpenmpSettings s; + + const int blocktimeSet2 = kmp_get_blocktime(); + ASSERT_EQ(blocktimeSet2, 1); + + usleep(sleepFor); + + const int blocktimeSet3 = kmp_get_blocktime(); + ASSERT_EQ(blocktimeSet3, 1); + })); + } + std::for_each(threads.begin(), threads.end(), [](std::thread& t) { + t.join(); + }); +} + +} // end namespace |