diff options
author | Michael Butler <butlermichael@google.com> | 2019-06-24 10:36:20 -0700 |
---|---|---|
committer | Michael Butler <butlermichael@google.com> | 2019-10-28 13:16:09 -0700 |
commit | a7f867566450998ce6cc8447456f628c6764ec17 (patch) | |
tree | a8e32e59a98b3cd2e4e290bdefdd0511ff2bd234 /nn/driver | |
parent | 769088e2702973d0ffd5be70fcda2f11726b507d (diff) | |
download | ml-a7f867566450998ce6cc8447456f628c6764ec17.tar.gz |
Improve performance of Burst executions
Prior to this CL, the Burst object operated under one of two modes:
(1) "non-blocking" mode, where the Burst controller and server both
constantly poll whether data is available in the FMQ. This approach
is good because it results in very low IPC latency times, but has
the potential to waste power because the CPU is constantly doing
work.
(2) "blocking" mode, where the Burst controller and server both wait on
a futex whenever they attempt to read data from the FMQ. This
approach is good because it saves power (the thread is idle), but
results in higher IPC latency times because the thread must be
awoken before it can continue to retrieve the data.
This CL fuses the two approaches for better performance. Specifically,
the FMQ consumer will poll/spin for a period of time to see if the data
is available. If the data becomes available in this time, it
immediately retrieves it and continues processing. If the data does not
become available within this time, the consumer waits on the futex
until the data becomes available in order to save power.
This makes Burst operate with very low IPC latencies when the driver
service executes fully within the polling time window, and with IPC
latencies similar to the synchronous execution path when the driver
service has a longer run time.
In this CL, the default polling time is 0us for the power-saving execution
preference and 50us otherwise. The polling time is configurable through a
system property, which can be set by running either:
adb shell setprop debug.nn.burst-controller-polling-window <microseconds>
adb shell setprop debug.nn.sample-driver-burst-polling-window <microseconds>
This change also adds includes that the IWYU repohook reported as
missing.
Bug: 132073143
Test: mma
Test: NeuralNetworksTest_static
Test: VtsHalNeuralnetworksV1_*TargetTest
Test: inspected logcat and ensured that both spinning- and futex-based
waiting schemes were used for both the ExecutionBurstController and
ExecutionBurstServer
Change-Id: I120e0b24c7236105d75d93696dde8deddd0e3507
Merged-In: I120e0b24c7236105d75d93696dde8deddd0e3507
(cherry picked from commit c82044a1e298e1361fc07c0e6948e77ee2925f47)
Diffstat (limited to 'nn/driver')
-rw-r--r-- | nn/driver/sample/SampleDriver.cpp | 30 | ||||
-rw-r--r-- | nn/driver/sample/SampleDriver.h | 6 |
2 files changed, 30 insertions, 6 deletions
diff --git a/nn/driver/sample/SampleDriver.cpp b/nn/driver/sample/SampleDriver.cpp index 0448c2d79..50cb7729a 100644 --- a/nn/driver/sample/SampleDriver.cpp +++ b/nn/driver/sample/SampleDriver.cpp @@ -19,6 +19,7 @@ #include "SampleDriver.h" #include <android-base/logging.h> +#include <android-base/properties.h> #include <hidl/LegacySupport.h> #include <algorithm> @@ -185,9 +186,9 @@ Return<ErrorStatus> prepareModelBase(const T_Model& model, const SampleDriver* d } // asynchronously prepare the model from a new, detached thread - std::thread([model, driver, callback] { + std::thread([model, driver, preference, callback] { sp<SamplePreparedModel> preparedModel = - new SamplePreparedModel(convertToV1_3(model), driver); + new SamplePreparedModel(convertToV1_3(model), driver, preference); if (!preparedModel->initialize()) { notify(callback, ErrorStatus::INVALID_ARGUMENT, nullptr); return; @@ -472,6 +473,22 @@ class BurstExecutorWithCache : public ExecutionBurstServer::IBurstExecutorWithCa std::map<int32_t, std::optional<RunTimePoolInfo>> mMemoryCache; // cached requestPoolInfos }; +// This is the amount of time the ExecutionBurstServer should spend polling the +// FMQ to see if it has data available before it should fall back to waiting on +// the futex. 
+static std::chrono::microseconds getPollingTimeWindow() { + constexpr int32_t defaultPollingTimeWindow = 50; +#ifdef NN_DEBUGGABLE + constexpr int32_t minPollingTimeWindow = 0; + const int32_t selectedPollingTimeWindow = + base::GetIntProperty("debug.nn.sample-driver-burst-polling-window", + defaultPollingTimeWindow, minPollingTimeWindow); + return std::chrono::microseconds{selectedPollingTimeWindow}; +#else + return std::chrono::microseconds{defaultPollingTimeWindow}; +#endif // NN_DEBUGGABLE +} + Return<void> SamplePreparedModel::configureExecutionBurst( const sp<V1_2::IBurstCallback>& callback, const MQDescriptorSync<V1_2::FmqRequestDatum>& requestChannel, @@ -480,17 +497,22 @@ Return<void> SamplePreparedModel::configureExecutionBurst( NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_EXECUTION, "SampleDriver::configureExecutionBurst"); + const bool preferPowerOverLatency = (kPreference == hal::ExecutionPreference::LOW_POWER); + const auto pollingTimeWindow = + (preferPowerOverLatency ? std::chrono::microseconds{0} : getPollingTimeWindow()); + // Alternatively, the burst could be configured via: // const sp<V1_2::IBurstContext> burst = // ExecutionBurstServer::create(callback, requestChannel, - // resultChannel, this); + // resultChannel, this, + // pollingTimeWindow); // // However, this alternative representation does not include a memory map // caching optimization, and adds overhead. 
const std::shared_ptr<BurstExecutorWithCache> executorWithCache = std::make_shared<BurstExecutorWithCache>(mModel, mDriver, mPoolInfos); const sp<V1_2::IBurstContext> burst = ExecutionBurstServer::create( - callback, requestChannel, resultChannel, executorWithCache); + callback, requestChannel, resultChannel, executorWithCache, pollingTimeWindow); if (burst == nullptr) { cb(ErrorStatus::GENERAL_FAILURE, {}); diff --git a/nn/driver/sample/SampleDriver.h b/nn/driver/sample/SampleDriver.h index a85dcd5ea..8788ed3a8 100644 --- a/nn/driver/sample/SampleDriver.h +++ b/nn/driver/sample/SampleDriver.h @@ -91,8 +91,9 @@ class SampleDriver : public hal::IDevice { class SamplePreparedModel : public hal::IPreparedModel { public: - SamplePreparedModel(const hal::Model& model, const SampleDriver* driver) - : mModel(model), mDriver(driver) {} + SamplePreparedModel(const hal::Model& model, const SampleDriver* driver, + hal::ExecutionPreference preference) + : mModel(model), mDriver(driver), kPreference(preference) {} ~SamplePreparedModel() override {} bool initialize(); hal::Return<hal::ErrorStatus> execute( @@ -113,6 +114,7 @@ class SamplePreparedModel : public hal::IPreparedModel { hal::Model mModel; const SampleDriver* mDriver; std::vector<RunTimePoolInfo> mPoolInfos; + const hal::ExecutionPreference kPreference; }; } // namespace sample_driver |