summaryrefslogtreecommitdiff
path: root/nn/driver
diff options
context:
space:
mode:
author: Michael Butler <butlermichael@google.com> 2019-06-24 10:36:20 -0700
committer: Michael Butler <butlermichael@google.com> 2019-10-28 13:16:09 -0700
commit: a7f867566450998ce6cc8447456f628c6764ec17 (patch)
tree: a8e32e59a98b3cd2e4e290bdefdd0511ff2bd234 /nn/driver
parent: 769088e2702973d0ffd5be70fcda2f11726b507d (diff)
download: ml-a7f867566450998ce6cc8447456f628c6764ec17.tar.gz
Improve performance of Burst executions
Prior to this CL, the Burst object operated in one of two modes: (1) "non-blocking" mode, where the Burst controller and server both constantly poll the FMQ to check whether data is available. This approach yields very low IPC latency, but can waste power because the CPU is constantly doing work. (2) "blocking" mode, where the Burst controller and server both wait on a futex whenever they attempt to read data from the FMQ. This approach saves power (the thread is idle), but results in higher IPC latency because the thread must be woken before it can continue to retrieve the data. This CL fuses the two approaches for better performance. Specifically, the FMQ consumer polls/spins for a period of time to see whether the data is available. If the data becomes available within this time, the consumer immediately retrieves it and continues processing. If the data does not become available within this time, the consumer waits on the futex until the data becomes available, in order to save power. As a result, Burst operates with very low IPC latency when the driver service executes fully within the polling time window, and with IPC latency similar to the synchronous execution path when the driver service has a longer run time. In this CL, the default polling time is 0us for the power-saving execution preference and 50us otherwise. The polling time is configurable with a system property, and can be set by running either of the following commands. For the burst controller: adb shell setprop debug.nn.burst-controller-polling-window <microseconds> -- or, for the sample driver's burst server: adb shell setprop debug.nn.sample-driver-burst-polling-window <microseconds> -- This change also adds missing includes that were flagged by the IWYU repohook.
Bug: 132073143 Test: mma Test: NeuralNetworksTest_static Test: VtsHalNeuralnetworksV1_*TargetTest Test: inspected logcat and ensured that both spinning- and futex-based waiting schemes were used for both the ExecutionBurstController and ExecutionBurstServer Change-Id: I120e0b24c7236105d75d93696dde8deddd0e3507 Merged-In: I120e0b24c7236105d75d93696dde8deddd0e3507 (cherry picked from commit c82044a1e298e1361fc07c0e6948e77ee2925f47)
Diffstat (limited to 'nn/driver')
-rw-r--r-- nn/driver/sample/SampleDriver.cpp 30
-rw-r--r-- nn/driver/sample/SampleDriver.h 6
2 files changed, 30 insertions, 6 deletions
diff --git a/nn/driver/sample/SampleDriver.cpp b/nn/driver/sample/SampleDriver.cpp
index 0448c2d79..50cb7729a 100644
--- a/nn/driver/sample/SampleDriver.cpp
+++ b/nn/driver/sample/SampleDriver.cpp
@@ -19,6 +19,7 @@
#include "SampleDriver.h"
#include <android-base/logging.h>
+#include <android-base/properties.h>
#include <hidl/LegacySupport.h>
#include <algorithm>
@@ -185,9 +186,9 @@ Return<ErrorStatus> prepareModelBase(const T_Model& model, const SampleDriver* d
}
// asynchronously prepare the model from a new, detached thread
- std::thread([model, driver, callback] {
+ std::thread([model, driver, preference, callback] {
sp<SamplePreparedModel> preparedModel =
- new SamplePreparedModel(convertToV1_3(model), driver);
+ new SamplePreparedModel(convertToV1_3(model), driver, preference);
if (!preparedModel->initialize()) {
notify(callback, ErrorStatus::INVALID_ARGUMENT, nullptr);
return;
@@ -472,6 +473,22 @@ class BurstExecutorWithCache : public ExecutionBurstServer::IBurstExecutorWithCa
std::map<int32_t, std::optional<RunTimePoolInfo>> mMemoryCache; // cached requestPoolInfos
};
+// This is the amount of time the ExecutionBurstServer should spend polling the
+// FMQ to see if it has data available before it should fall back to waiting on
+// the futex.
+static std::chrono::microseconds getPollingTimeWindow() {
+ constexpr int32_t defaultPollingTimeWindow = 50;
+#ifdef NN_DEBUGGABLE
+ constexpr int32_t minPollingTimeWindow = 0;
+ const int32_t selectedPollingTimeWindow =
+ base::GetIntProperty("debug.nn.sample-driver-burst-polling-window",
+ defaultPollingTimeWindow, minPollingTimeWindow);
+ return std::chrono::microseconds{selectedPollingTimeWindow};
+#else
+ return std::chrono::microseconds{defaultPollingTimeWindow};
+#endif // NN_DEBUGGABLE
+}
+
Return<void> SamplePreparedModel::configureExecutionBurst(
const sp<V1_2::IBurstCallback>& callback,
const MQDescriptorSync<V1_2::FmqRequestDatum>& requestChannel,
@@ -480,17 +497,22 @@ Return<void> SamplePreparedModel::configureExecutionBurst(
NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_EXECUTION,
"SampleDriver::configureExecutionBurst");
+ const bool preferPowerOverLatency = (kPreference == hal::ExecutionPreference::LOW_POWER);
+ const auto pollingTimeWindow =
+ (preferPowerOverLatency ? std::chrono::microseconds{0} : getPollingTimeWindow());
+
// Alternatively, the burst could be configured via:
// const sp<V1_2::IBurstContext> burst =
// ExecutionBurstServer::create(callback, requestChannel,
- // resultChannel, this);
+ // resultChannel, this,
+ // pollingTimeWindow);
//
// However, this alternative representation does not include a memory map
// caching optimization, and adds overhead.
const std::shared_ptr<BurstExecutorWithCache> executorWithCache =
std::make_shared<BurstExecutorWithCache>(mModel, mDriver, mPoolInfos);
const sp<V1_2::IBurstContext> burst = ExecutionBurstServer::create(
- callback, requestChannel, resultChannel, executorWithCache);
+ callback, requestChannel, resultChannel, executorWithCache, pollingTimeWindow);
if (burst == nullptr) {
cb(ErrorStatus::GENERAL_FAILURE, {});
diff --git a/nn/driver/sample/SampleDriver.h b/nn/driver/sample/SampleDriver.h
index a85dcd5ea..8788ed3a8 100644
--- a/nn/driver/sample/SampleDriver.h
+++ b/nn/driver/sample/SampleDriver.h
@@ -91,8 +91,9 @@ class SampleDriver : public hal::IDevice {
class SamplePreparedModel : public hal::IPreparedModel {
public:
- SamplePreparedModel(const hal::Model& model, const SampleDriver* driver)
- : mModel(model), mDriver(driver) {}
+ SamplePreparedModel(const hal::Model& model, const SampleDriver* driver,
+ hal::ExecutionPreference preference)
+ : mModel(model), mDriver(driver), kPreference(preference) {}
~SamplePreparedModel() override {}
bool initialize();
hal::Return<hal::ErrorStatus> execute(
@@ -113,6 +114,7 @@ class SamplePreparedModel : public hal::IPreparedModel {
hal::Model mModel;
const SampleDriver* mDriver;
std::vector<RunTimePoolInfo> mPoolInfos;
+ const hal::ExecutionPreference kPreference;
};
} // namespace sample_driver