author    Mika Raento <mikie@google.com>  2018-05-03 14:43:24 +0100
committer Mika Raento <mikie@google.com>  2018-05-11 19:16:30 +0100
commit    6a2a11dc1edaa61647c35650d7dbdd8893d6ce8b (patch)
tree      c2d531415db8d25c1da7573bad009472181cf6fe
parent    40ac90985a927ff77d234c4d4fc33ab0a9a85648 (diff)
download  ml-6a2a11dc1edaa61647c35650d7dbdd8893d6ce8b.tar.gz
Reduce OpenMP thread busywait time (200 to 1 ms)
Reduce the time OpenMP threads used by Eigen busywait for new work after
completing a piece. This is 200ms by default, which meant that we tried to
continuously run 8 threads for 200ms after completing operations using
Eigen - starving both our other work and the rest of the system.

Bug: 79159165
Test: mm
Test: cherry-picked to systrace branch and look at traces
Test: NeuralNetworksTest_static
Change-Id: I3f21aea6d96c785dd7c47493a739b76876420f7d
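For reference, the blocktime switch itself is a small call into the OpenMP
runtime. The standalone sketch below is not part of this change; it assumes an
LLVM/Intel OpenMP runtime (libomp), which exposes kmp_get_blocktime() and
kmp_set_blocktime() through <omp.h>:

    // Minimal sketch: lower the busy-wait window around a parallel region,
    // then restore it. Build with -fopenmp against libomp.
    #include <cstdio>
    #include <omp.h>

    int main() {
        const int previous = kmp_get_blocktime();  // typically 200 ms by default
        kmp_set_blocktime(1);                      // workers spin ~1 ms, then sleep

        #pragma omp parallel
        {
            // ... parallel work, e.g. an Eigen matrix product ...
        }

        kmp_set_blocktime(previous);               // restore the saved setting
        std::printf("blocktime is %d ms again\n", kmp_get_blocktime());
        return 0;
    }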
-rw-r--r--  nn/common/CpuExecutor.cpp              | 31
-rw-r--r--  nn/common/include/CpuExecutor.h        | 38
-rw-r--r--  nn/runtime/test/Android.bp             |  1
-rw-r--r--  nn/runtime/test/TestOpenmpSettings.cpp | 90
4 files changed, 160 insertions, 0 deletions
diff --git a/nn/common/CpuExecutor.cpp b/nn/common/CpuExecutor.cpp
index 283d30720..0c6219308 100644
--- a/nn/common/CpuExecutor.cpp
+++ b/nn/common/CpuExecutor.cpp
@@ -21,6 +21,8 @@
#include "NeuralNetworks.h"
#include "Operations.h"
+#include "Eigen/Core"
+#include <omp.h>
#include <sys/mman.h>
namespace android {
@@ -196,6 +198,8 @@ int CpuExecutor::run(const V1_1::Model& model, const Request& request,
VLOG(CPUEXE) << "CpuExecutor::run() with request("
<< SHOW_IF_DEBUG(toString(request)) << ")";
+ ScopedOpenmpSettings openMpSettings;
+
mModel = &model;
mRequest = &request; // TODO check if mRequest is needed
initializeRunTimeInfo(modelPoolInfos, requestPoolInfos);
@@ -1530,5 +1534,32 @@ int CpuExecutor::executeOperation(const Operation& operation) {
return ANEURALNETWORKS_NO_ERROR;
}
+ScopedOpenmpSettings::ScopedOpenmpSettings() {
+ mBlocktimeInitial = kmp_get_blocktime();
+ kmp_set_blocktime(1); // ms
+
+#if NNAPI_LIMIT_CPU_THREADS
+ // Code not yet enabled. The choice of thread count is to be based on
+ // benchmarking; see the longer comment by the class declaration.
+ mMaxThreadsInitial = Eigen::nbThreads();
+ const int nProcs = omp_get_num_procs();
+ int threads = nProcs;
+ if (nProcs >= 8) {
+ threads = nProcs - 4;
+ } else if (nProcs >= 4) {
+ threads = nProcs - 2;
+ }
+ Eigen::setNbThreads(threads);
+#endif
+}
+
+ScopedOpenmpSettings::~ScopedOpenmpSettings() {
+ kmp_set_blocktime(mBlocktimeInitial);
+#if NNAPI_LIMIT_CPU_THREADS
+ Eigen::setNbThreads(mMaxThreadsInitial);
+#endif
+}
+
+
} // namespace nn
} // namespace android
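The thread-limiting branch in the constructor above is compiled out
(NNAPI_LIMIT_CPU_THREADS is not defined). Restated as a standalone function
purely for illustration (the function name is hypothetical), the heuristic it
would apply is:

    // Illustration only: the disabled heuristic leaves a few cores free on
    // larger CPUs instead of handing every core to Eigen.
    int chooseEigenThreadCount(int nProcs) {
        if (nProcs >= 8) return nProcs - 4;  // e.g. 8 cores -> 4 threads
        if (nProcs >= 4) return nProcs - 2;  // e.g. 6 -> 4, 4 -> 2
        return nProcs;                       // small CPUs keep all cores
    }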
diff --git a/nn/common/include/CpuExecutor.h b/nn/common/include/CpuExecutor.h
index c7a318bc3..64a46b65f 100644
--- a/nn/common/include/CpuExecutor.h
+++ b/nn/common/include/CpuExecutor.h
@@ -22,6 +22,7 @@
#include "Utils.h"
#include <algorithm>
+#include <android-base/macros.h>
#include <vector>
namespace android {
@@ -136,6 +137,43 @@ private:
std::vector<RunTimeOperandInfo> mOperands;
};
+// Class for setting reasonable OpenMP threading settings. (OpenMP is used by
+// the Eigen matrix library.)
+//
+// Currently sets a low blocktime: the time OpenMP threads busy-wait for more
+// work before going to sleep. See b/79159165, https://reviews.llvm.org/D18577.
+// The default is 200ms; we set it to 1ms here. This should still keep the
+// threads from sleeping before the next operation arrives, while releasing
+// the CPU to other work quickly.
+//
+// The OpenMP settings are thread-local (applying only to worker threads formed
+// from that thread), see https://software.intel.com/en-us/node/522688 and
+// http://lists.llvm.org/pipermail/openmp-dev/2016-July/001432.html. This class
+// ensures that within the scope in which an object is instantiated we use the
+// right settings (scopes may be nested), as long as no other library changes
+// them. (Note that in current NNAPI usage only one instance is used in the
+// CpuExecutor thread).
+//
+// TODO(mikie): consider also setting the number of threads used. Using as many
+// threads as there are cores results in more variable performance: if we don't
+// get all cores for our threads, the latency is doubled as we wait for one core
+// to do twice the amount of work. Reality is complicated though as not all
+// cores are the same. Decision to be based on benchmarking against a
+// representative set of workloads and devices. I'm keeping the code here for
+// reference.
+class ScopedOpenmpSettings {
+public:
+ ScopedOpenmpSettings();
+ ~ScopedOpenmpSettings();
+ DISALLOW_COPY_AND_ASSIGN(ScopedOpenmpSettings);
+private:
+ int mBlocktimeInitial;
+#if NNAPI_LIMIT_CPU_THREADS
+ int mMaxThreadsInitial;
+#endif
+};
+
+
namespace {
template <typename T>
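A usage sketch for the class declared above (the enclosing function name is
hypothetical; in this change CpuExecutor::run uses a single instance the same
way): the constructor saves the calling thread's blocktime and sets it to 1 ms,
and the destructor restores the saved value, so nested scopes unwind correctly.

    #include "CpuExecutor.h"

    void runEigenWorkOnThisThread() {                      // hypothetical caller
        android::nn::ScopedOpenmpSettings openMpSettings;  // blocktime: 200 ms -> 1 ms
        {
            android::nn::ScopedOpenmpSettings nested;      // saves 1 ms, sets 1 ms
            // ... Eigen / OpenMP work ...
        }                                                  // restores 1 ms
        // ... more Eigen / OpenMP work, still at 1 ms ...
    }                                                      // restores the original 200 ms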
diff --git a/nn/runtime/test/Android.bp b/nn/runtime/test/Android.bp
index deb240b06..4dc875a55 100644
--- a/nn/runtime/test/Android.bp
+++ b/nn/runtime/test/Android.bp
@@ -77,6 +77,7 @@ cc_test {
// not exported from libneuralnetworks.so).
"TestExecution.cpp",
"TestMemoryInternal.cpp",
+ "TestOpenmpSettings.cpp",
"TestPartitioning.cpp",
"TestPartitioningRandom.cpp",
],
diff --git a/nn/runtime/test/TestOpenmpSettings.cpp b/nn/runtime/test/TestOpenmpSettings.cpp
new file mode 100644
index 000000000..549473b46
--- /dev/null
+++ b/nn/runtime/test/TestOpenmpSettings.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CpuExecutor.h"
+
+#include <algorithm>
+#include <gtest/gtest.h>
+#include <memory>
+#include <omp.h>
+#include <random>
+#include <thread>
+#include <unistd.h>
+#include <vector>
+
+namespace {
+
+class OpenmpSettingsTest : public ::testing::Test {
+protected:
+ virtual void SetUp() override {
+ const int blocktimeInitial = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeInitial, kOpenmpDefaultBlockTime);
+ }
+ virtual void TearDown() override {
+ const int blocktimeRestored = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeRestored, kOpenmpDefaultBlockTime);
+ }
+ static const int kOpenmpDefaultBlockTime;
+};
+
+const int OpenmpSettingsTest::kOpenmpDefaultBlockTime = 200;
+
+using ::android::nn::ScopedOpenmpSettings;
+
+TEST_F(OpenmpSettingsTest, Test1) {
+ ScopedOpenmpSettings s;
+ const int blocktimeSet = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet, 1);
+}
+
+TEST_F(OpenmpSettingsTest, Test2) {
+ ScopedOpenmpSettings s1;
+ const int blocktimeSet1 = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet1, 1);
+
+ ScopedOpenmpSettings s2;
+ const int blocktimeSet2 = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet2, 1);
+}
+
+TEST_F(OpenmpSettingsTest, TestThreaded) {
+ // Threaded test to validate that each thread gets its own settings.
+ std::vector<std::thread> threads;
+ std::mt19937 randGen;
+ std::uniform_int_distribution<> rand(1, 20);
+ for (int i = 0; i < 10; i++) {
+ const int sleepFor = rand(randGen);
+ threads.push_back(std::thread([sleepFor]() {
+ const int blocktimeSet1 = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet1, kOpenmpDefaultBlockTime);
+
+ ScopedOpenmpSettings s;
+
+ const int blocktimeSet2 = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet2, 1);
+
+ usleep(sleepFor);
+
+ const int blocktimeSet3 = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet3, 1);
+ }));
+ }
+ std::for_each(threads.begin(), threads.end(), [](std::thread& t) {
+ t.join();
+ });
+}
+
+} // end namespace