author    Mika Raento <mikie@google.com>  2018-05-03 14:43:24 +0100
committer Mika Raento <mikie@google.com>  2018-05-11 19:16:30 +0100
commit    6a2a11dc1edaa61647c35650d7dbdd8893d6ce8b (patch)
tree      c2d531415db8d25c1da7573bad009472181cf6fe
parent    40ac90985a927ff77d234c4d4fc33ab0a9a85648 (diff)
download  ml-6a2a11dc1edaa61647c35650d7dbdd8893d6ce8b.tar.gz
Reduce OpenMP thread busywait time (200 to 1 ms)
Reduce the time OpenMP threads used by Eigen busywait for new work after
completing a piece. This is 200ms by default, which meant that we tried to
continuously run 8 threads for 200ms after completing operations using
Eigen - starving both our other work and the rest of the system.

Bug: 79159165
Test: mm
Test: cherry-picked to systrace branch and look at traces
Test: NeuralNetworksTest_static
Change-Id: I3f21aea6d96c785dd7c47493a739b76876420f7d
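For reference, the blocktime switch itself is a small call into the OpenMP
runtime. The standalone sketch below is not part of this change; it assumes an
LLVM/Intel OpenMP runtime (libomp), which exposes kmp_get_blocktime() and
kmp_set_blocktime() through <omp.h>:

    // Minimal sketch: lower the busy-wait window around a parallel region,
    // then restore it. Build with -fopenmp against libomp.
    #include <cstdio>
    #include <omp.h>

    int main() {
        const int previous = kmp_get_blocktime();  // typically 200 ms by default
        kmp_set_blocktime(1);                      // workers spin ~1 ms, then sleep

        #pragma omp parallel
        {
            // ... parallel work, e.g. an Eigen matrix product ...
        }

        kmp_set_blocktime(previous);               // restore the saved setting
        std::printf("blocktime is %d ms again\n", kmp_get_blocktime());
        return 0;
    }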
-rw-r--r--  nn/common/CpuExecutor.cpp              | 31
-rw-r--r--  nn/common/include/CpuExecutor.h        | 38
-rw-r--r--  nn/runtime/test/Android.bp             |  1
-rw-r--r--  nn/runtime/test/TestOpenmpSettings.cpp | 90
4 files changed, 160 insertions, 0 deletions
diff --git a/nn/common/CpuExecutor.cpp b/nn/common/CpuExecutor.cpp
index 283d30720..0c6219308 100644
--- a/nn/common/CpuExecutor.cpp
+++ b/nn/common/CpuExecutor.cpp
@@ -21,6 +21,8 @@
#include "NeuralNetworks.h"
#include "Operations.h"
+#include "Eigen/Core"
+#include <omp.h>
#include <sys/mman.h>
namespace android {
@@ -196,6 +198,8 @@ int CpuExecutor::run(const V1_1::Model& model, const Request& request,
VLOG(CPUEXE) << "CpuExecutor::run() with request("
<< SHOW_IF_DEBUG(toString(request)) << ")";
+ ScopedOpenmpSettings openMpSettings;
+
mModel = &model;
mRequest = &request; // TODO check if mRequest is needed
initializeRunTimeInfo(modelPoolInfos, requestPoolInfos);
@@ -1530,5 +1534,32 @@ int CpuExecutor::executeOperation(const Operation& operation) {
return ANEURALNETWORKS_NO_ERROR;
}
+ScopedOpenmpSettings::ScopedOpenmpSettings() {
+ mBlocktimeInitial = kmp_get_blocktime();
+ kmp_set_blocktime(1); // ms
+
+#if NNAPI_LIMIT_CPU_THREADS
+ // Code not yet enabled. The choice of thread count is to be based on
+ // benchmarking; see the longer comment by the class declaration.
+ mMaxThreadsInitial = Eigen::nbThreads();
+ const int nProcs = omp_get_num_procs();
+ int threads = nProcs;
+ if (nProcs >= 8) {
+ threads = nProcs - 4;
+ } else if (nProcs >= 4) {
+ threads = nProcs - 2;
+ }
+ Eigen::setNbThreads(threads);
+#endif
+}
+
+ScopedOpenmpSettings::~ScopedOpenmpSettings() {
+ kmp_set_blocktime(mBlocktimeInitial);
+#if NNAPI_LIMIT_CPU_THREADS
+ Eigen::setNbThreads(mMaxThreadsInitial);
+#endif
+}
+
+
} // namespace nn
} // namespace android
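The thread-limiting branch in the constructor above is compiled out
(NNAPI_LIMIT_CPU_THREADS is not defined). Restated as a standalone function
purely for illustration (the function name is hypothetical), the heuristic it
would apply is:

    // Illustration only: the disabled heuristic leaves a few cores free on
    // larger CPUs instead of handing every core to Eigen.
    int chooseEigenThreadCount(int nProcs) {
        if (nProcs >= 8) return nProcs - 4;  // e.g. 8 cores -> 4 threads
        if (nProcs >= 4) return nProcs - 2;  // e.g. 6 -> 4, 4 -> 2
        return nProcs;                       // small CPUs keep all cores
    }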
diff --git a/nn/common/include/CpuExecutor.h b/nn/common/include/CpuExecutor.h
index c7a318bc3..64a46b65f 100644
--- a/nn/common/include/CpuExecutor.h
+++ b/nn/common/include/CpuExecutor.h
@@ -22,6 +22,7 @@
#include "Utils.h"
#include <algorithm>
+#include <android-base/macros.h>
#include <vector>
namespace android {
@@ -136,6 +137,43 @@ private:
std::vector<RunTimeOperandInfo> mOperands;
};
+// Class for setting reasonable OpenMP threading settings. (OpenMP is used by
+// the Eigen matrix library.)
+//
+// Currently sets a low blocktime: the time OpenMP threads busy-wait for more
+// work before going to sleep. See b/79159165, https://reviews.llvm.org/D18577.
+// The default is 200ms; we set it to 1ms here. This should still keep the
+// threads from sleeping before the next operation arrives, while releasing
+// the CPU to other work quickly.
+//
+// The OpenMP settings are thread-local (applying only to worker threads formed
+// from that thread), see https://software.intel.com/en-us/node/522688 and
+// http://lists.llvm.org/pipermail/openmp-dev/2016-July/001432.html. This class
+// ensures that within the scope in which an object is instantiated we use the
+// right settings (scopes may be nested), as long as no other library changes
+// them. (Note that in current NNAPI usage only one instance is used in the
+// CpuExecutor thread).
+//
+// TODO(mikie): consider also setting the number of threads used. Using as many
+// threads as there are cores results in more variable performance: if we don't
+// get all cores for our threads, the latency is doubled as we wait for one core
+// to do twice the amount of work. Reality is complicated though as not all
+// cores are the same. Decision to be based on benchmarking against a
+// representative set of workloads and devices. I'm keeping the code here for
+// reference.
+class ScopedOpenmpSettings {
+public:
+ ScopedOpenmpSettings();
+ ~ScopedOpenmpSettings();
+ DISALLOW_COPY_AND_ASSIGN(ScopedOpenmpSettings);
+private:
+ int mBlocktimeInitial;
+#if NNAPI_LIMIT_CPU_THREADS
+ int mMaxThreadsInitial;
+#endif
+};
+
+
namespace {
template <typename T>
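A usage sketch for the class declared above (the enclosing function name is
hypothetical; in this change CpuExecutor::run uses a single instance the same
way): the constructor saves the calling thread's blocktime and sets it to 1 ms,
and the destructor restores the saved value, so nested scopes unwind correctly.

    #include "CpuExecutor.h"

    void runEigenWorkOnThisThread() {                      // hypothetical caller
        android::nn::ScopedOpenmpSettings openMpSettings;  // blocktime: 200 ms -> 1 ms
        {
            android::nn::ScopedOpenmpSettings nested;      // saves 1 ms, sets 1 ms
            // ... Eigen / OpenMP work ...
        }                                                  // restores 1 ms
        // ... more Eigen / OpenMP work, still at 1 ms ...
    }                                                      // restores the original 200 ms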
diff --git a/nn/runtime/test/Android.bp b/nn/runtime/test/Android.bp
index deb240b06..4dc875a55 100644
--- a/nn/runtime/test/Android.bp
+++ b/nn/runtime/test/Android.bp
@@ -77,6 +77,7 @@ cc_test {
// not exported from libneuralnetworks.so).
"TestExecution.cpp",
"TestMemoryInternal.cpp",
+ "TestOpenmpSettings.cpp",
"TestPartitioning.cpp",
"TestPartitioningRandom.cpp",
],
diff --git a/nn/runtime/test/TestOpenmpSettings.cpp b/nn/runtime/test/TestOpenmpSettings.cpp
new file mode 100644
index 000000000..549473b46
--- /dev/null
+++ b/nn/runtime/test/TestOpenmpSettings.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2018 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CpuExecutor.h"
+
+#include <algorithm>
+#include <gtest/gtest.h>
+#include <memory>
+#include <omp.h>
+#include <random>
+#include <thread>
+#include <unistd.h>
+#include <vector>
+
+namespace {
+
+class OpenmpSettingsTest : public ::testing::Test {
+protected:
+ virtual void SetUp() override {
+ const int blocktimeInitial = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeInitial, kOpenmpDefaultBlockTime);
+ }
+ virtual void TearDown() override {
+ const int blocktimeRestored = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeRestored, kOpenmpDefaultBlockTime);
+ }
+ static const int kOpenmpDefaultBlockTime;
+};
+
+const int OpenmpSettingsTest::kOpenmpDefaultBlockTime = 200;
+
+using ::android::nn::ScopedOpenmpSettings;
+
+TEST_F(OpenmpSettingsTest, Test1) {
+ ScopedOpenmpSettings s;
+ const int blocktimeSet = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet, 1);
+}
+
+TEST_F(OpenmpSettingsTest, Test2) {
+ ScopedOpenmpSettings s1;
+ const int blocktimeSet1 = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet1, 1);
+
+ ScopedOpenmpSettings s2;
+ const int blocktimeSet2 = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet2, 1);
+}
+
+TEST_F(OpenmpSettingsTest, TestThreaded) {
+ // Threaded test to validate that each thread gets its own settings.
+ std::vector<std::thread> threads;
+ std::mt19937 randGen;
+ std::uniform_int_distribution<> rand(1, 20);
+ for (int i = 0; i < 10; i++) {
+ const int sleepFor = rand(randGen);
+ threads.push_back(std::thread([sleepFor]() {
+ const int blocktimeSet1 = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet1, kOpenmpDefaultBlockTime);
+
+ ScopedOpenmpSettings s;
+
+ const int blocktimeSet2 = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet2, 1);
+
+ usleep(sleepFor);
+
+ const int blocktimeSet3 = kmp_get_blocktime();
+ ASSERT_EQ(blocktimeSet3, 1);
+ }));
+ }
+ std::for_each(threads.begin(), threads.end(), [](std::thread& t) {
+ t.join();
+ });
+}
+
+} // end namespace