author     Michael Butler <butlermichael@google.com>  2019-06-24 10:36:20 -0700
committer  Michael Butler <butlermichael@google.com>  2019-10-28 13:16:09 -0700
commit     a7f867566450998ce6cc8447456f628c6764ec17 (patch)
tree       a8e32e59a98b3cd2e4e290bdefdd0511ff2bd234
parent     769088e2702973d0ffd5be70fcda2f11726b507d (diff)
Improve performance of Burst executions
Prior to this CL, the Burst object operated under one of two modes:

(1) "non-blocking" mode, where the Burst controller and server both constantly poll for data in the FMQ. This results in very low IPC latency but can waste power, because the CPU is constantly doing work.

(2) "blocking" mode, where the Burst controller and server both wait on a futex whenever they attempt to read data from the FMQ. This saves power (the thread is idle) but results in higher IPC latency, because the thread must be awoken before it can retrieve the data.

This CL fuses the two approaches for better performance. Specifically, the FMQ consumer polls/spins for a limited period of time to see if data is available. If data becomes available within this window, the consumer immediately retrieves it and continues processing. If it does not, the consumer waits on the futex until data becomes available, saving power. As a result, Burst executions have very low IPC latencies when the driver service completes within the polling window, and IPC latencies similar to the synchronous execution path when the driver service runs longer.

In this CL, the default polling time is 0us for the power-saving (LOW_POWER) execution preference and 50us otherwise. The polling time is configurable with a system property and can be set by running either:

adb shell setprop debug.nn.burst-controller-polling-window <microseconds>
adb shell setprop debug.nn.sample-driver-burst-polling-window <microseconds>

This change also adds includes that were missing, as flagged by the IWYU repohook.

Bug: 132073143
Test: mma
Test: NeuralNetworksTest_static
Test: VtsHalNeuralnetworksV1_*TargetTest
Test: inspected logcat and ensured that both the spinning- and futex-based waiting schemes were used for both ExecutionBurstController and ExecutionBurstServer
Change-Id: I120e0b24c7236105d75d93696dde8deddd0e3507
Merged-In: I120e0b24c7236105d75d93696dde8deddd0e3507
(cherry picked from commit c82044a1e298e1361fc07c0e6948e77ee2925f47)
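The fused receive path described above can be illustrated with a small, self-contained C++ sketch. This is a simplified model only: FakeFmq below is a hypothetical stand-in (backed by a mutex and condition variable) for the futex-backed hardware::MessageQueue used by ExecutionBurstController and ExecutionBurstServer, and the real code reads whole serialized packets rather than single integers. A polling window of 0us skips the spin loop entirely, which is what the LOW_POWER preference selects.

#include <chrono>
#include <condition_variable>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <mutex>
#include <optional>

// Hypothetical stand-in for the futex-backed FMQ used by the burst classes.
class FakeFmq {
  public:
    // Producer side: enqueue a datum and wake any blocked reader. This plays
    // the role of writeBlocking(), which signals the futex/EventFlag.
    void writeBlocking(uint64_t datum) {
        {
            std::lock_guard<std::mutex> lock(mMutex);
            mData.push_back(datum);
        }
        mCv.notify_one();
    }

    // Non-blocking size query used while polling.
    size_t availableToRead() {
        std::lock_guard<std::mutex> lock(mMutex);
        return mData.size();
    }

    // Non-blocking read; empty if no data is available yet.
    std::optional<uint64_t> read() {
        std::lock_guard<std::mutex> lock(mMutex);
        if (mData.empty()) return std::nullopt;
        const uint64_t datum = mData.front();
        mData.pop_front();
        return datum;
    }

    // Blocking read; the thread sleeps until data arrives.
    uint64_t readBlocking() {
        std::unique_lock<std::mutex> lock(mMutex);
        mCv.wait(lock, [this] { return !mData.empty(); });
        const uint64_t datum = mData.front();
        mData.pop_front();
        return datum;
    }

  private:
    std::mutex mMutex;
    std::condition_variable mCv;
    std::deque<uint64_t> mData;
};

// Poll for up to pollingTimeWindow; if nothing arrives in that window, fall
// back to a blocking wait to save power. A window of 0us never polls.
uint64_t getDatum(FakeFmq* fmq, std::chrono::microseconds pollingTimeWindow) {
    const auto timeToStopPolling =
            std::chrono::high_resolution_clock::now() + pollingTimeWindow;
    while (std::chrono::high_resolution_clock::now() < timeToStopPolling) {
        if (fmq->availableToRead() > 0) {
            if (const auto datum = fmq->read()) {
                return *datum;  // data arrived within the polling window
            }
        }
    }
    return fmq->readBlocking();  // window expired (or was 0us): wait for a wakeup
}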
-rw-r--r--  nn/common/ExecutionBurstController.cpp          125
-rw-r--r--  nn/common/ExecutionBurstServer.cpp              137
-rw-r--r--  nn/common/include/ExecutionBurstController.h     68
-rw-r--r--  nn/common/include/ExecutionBurstServer.h         47
-rw-r--r--  nn/driver/sample/SampleDriver.cpp                30
-rw-r--r--  nn/driver/sample/SampleDriver.h                   6
-rw-r--r--  nn/runtime/CompilationBuilder.cpp                 8
-rw-r--r--  nn/runtime/ExecutionPlan.cpp                     12
-rw-r--r--  nn/runtime/ExecutionPlan.h                        2
-rw-r--r--  nn/runtime/Manager.cpp                            4
-rw-r--r--  nn/runtime/VersionedInterfaces.cpp               24
-rw-r--r--  nn/runtime/VersionedInterfaces.h                  8
-rw-r--r--  nn/runtime/test/TestIntrospectionControl.cpp      8
13 files changed, 307 insertions(+), 172 deletions(-)
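For reference, the window-selection logic added below in SampleDriver.cpp and VersionedInterfaces.cpp boils down to the following sketch. Assumptions: android::base::GetIntProperty from libbase reads the debug property (only on debuggable builds in the actual change), and the property name is passed in by the caller so one helper covers both the runtime and the sample driver; the helper names here are illustrative, not the exact ones in the diff.

#include <android-base/properties.h>

#include <chrono>
#include <cstdint>
#include <string>

// Read the polling window (in microseconds) from a debug system property,
// defaulting to 50us; unset or out-of-range values fall back to the default.
static std::chrono::microseconds getPollingTimeWindow(const std::string& property) {
    constexpr int32_t kDefaultPollingTimeWindow = 50;
    constexpr int32_t kMinPollingTimeWindow = 0;
    const int32_t selected = android::base::GetIntProperty(
            property, kDefaultPollingTimeWindow, kMinPollingTimeWindow);
    return std::chrono::microseconds{selected};
}

// LOW_POWER compilations never poll (0us); everything else polls for the
// configured window before falling back to the futex-based wait.
std::chrono::microseconds choosePollingTimeWindow(bool preferPowerOverLatency,
                                                  const std::string& property) {
    return preferPowerOverLatency ? std::chrono::microseconds{0}
                                  : getPollingTimeWindow(property);
}

A caller such as the sample driver would, for example, pass kPreference == ExecutionPreference::LOW_POWER and "debug.nn.sample-driver-burst-polling-window" before creating the ExecutionBurstServer, matching the configureExecutionBurst changes shown below.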
diff --git a/nn/common/ExecutionBurstController.cpp b/nn/common/ExecutionBurstController.cpp
index f3a771b11..4456ed10d 100644
--- a/nn/common/ExecutionBurstController.cpp
+++ b/nn/common/ExecutionBurstController.cpp
@@ -19,9 +19,16 @@
#include "ExecutionBurstController.h"
#include <android-base/logging.h>
+
+#include <algorithm>
#include <cstring>
#include <limits>
+#include <memory>
#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
#include "Tracing.h"
#include "Utils.h"
@@ -30,9 +37,8 @@ namespace {
using namespace hal;
-using hardware::MQDescriptorSync;
-using FmqRequestDescriptor = MQDescriptorSync<FmqRequestDatum>;
-using FmqResultDescriptor = MQDescriptorSync<FmqResultDatum>;
+using FmqRequestDescriptor = hardware::MQDescriptorSync<FmqRequestDatum>;
+using FmqResultDescriptor = hardware::MQDescriptorSync<FmqResultDatum>;
constexpr Timing kNoTiming = {std::numeric_limits<uint64_t>::max(),
std::numeric_limits<uint64_t>::max()};
@@ -221,22 +227,23 @@ std::optional<std::tuple<ErrorStatus, std::vector<OutputShape>, Timing>> deseria
}
std::pair<std::unique_ptr<ResultChannelReceiver>, const FmqResultDescriptor*>
-ResultChannelReceiver::create(size_t channelLength, bool blocking) {
+ResultChannelReceiver::create(size_t channelLength, std::chrono::microseconds pollingTimeWindow) {
std::unique_ptr<FmqResultChannel> fmqResultChannel =
- std::make_unique<FmqResultChannel>(channelLength, /*confEventFlag=*/blocking);
+ std::make_unique<FmqResultChannel>(channelLength, /*confEventFlag=*/true);
if (!fmqResultChannel->isValid()) {
LOG(ERROR) << "Unable to create ResultChannelReceiver";
return {nullptr, nullptr};
}
+
const FmqResultDescriptor* descriptor = fmqResultChannel->getDesc();
return std::make_pair(
- std::make_unique<ResultChannelReceiver>(std::move(fmqResultChannel), blocking),
+ std::make_unique<ResultChannelReceiver>(std::move(fmqResultChannel), pollingTimeWindow),
descriptor);
}
ResultChannelReceiver::ResultChannelReceiver(std::unique_ptr<FmqResultChannel> fmqResultChannel,
- bool blocking)
- : mFmqResultChannel(std::move(fmqResultChannel)), mBlocking(blocking) {}
+ std::chrono::microseconds pollingTimeWindow)
+ : mFmqResultChannel(std::move(fmqResultChannel)), kPollingTimeWindow(pollingTimeWindow) {}
std::optional<std::tuple<ErrorStatus, std::vector<OutputShape>, Timing>>
ResultChannelReceiver::getBlocking() {
@@ -254,16 +261,14 @@ void ResultChannelReceiver::invalidate() {
// force unblock
// ExecutionBurstController waits on a result packet after sending a
// request. If the driver containing ExecutionBurstServer crashes, the
- // controller will still be waiting on the futex (assuming mBlocking is
- // true). This force unblock wakes up any thread waiting on the futex.
- if (mBlocking) {
- // TODO: look for a different/better way to signal/notify the futex to
- // wake up any thread waiting on it
- FmqResultDatum datum;
- datum.packetInformation({/*.packetSize=*/0, /*.errorStatus=*/ErrorStatus::GENERAL_FAILURE,
- /*.numberOfOperands=*/0});
- mFmqResultChannel->writeBlocking(&datum, 1);
- }
+ // controller may be waiting on the futex. This force unblock wakes up any
+ // thread waiting on the futex.
+ // TODO: look for a different/better way to signal/notify the futex to
+ // wake up any thread waiting on it
+ FmqResultDatum datum;
+ datum.packetInformation({/*.packetSize=*/0, /*.errorStatus=*/ErrorStatus::GENERAL_FAILURE,
+ /*.numberOfOperands=*/0});
+ mFmqResultChannel->writeBlocking(&datum, 1);
}
std::optional<std::vector<FmqResultDatum>> ResultChannelReceiver::getPacketBlocking() {
@@ -273,17 +278,42 @@ std::optional<std::vector<FmqResultDatum>> ResultChannelReceiver::getPacketBlock
return std::nullopt;
}
- // wait for result packet and read first element of result packet
- FmqResultDatum datum;
- bool success = true;
- if (mBlocking) {
- success = mFmqResultChannel->readBlocking(&datum, 1);
- } else {
- while ((success = mValid.load(std::memory_order_relaxed)) &&
- !mFmqResultChannel->read(&datum, 1)) {
+ // First spend time polling if results are available in FMQ instead of
+ // waiting on the futex. Polling is more responsive (yielding lower
+ // latencies), but can take up more power, so only poll for a limited period
+ // of time.
+
+ auto& getCurrentTime = std::chrono::high_resolution_clock::now;
+ const auto timeToStopPolling = getCurrentTime() + kPollingTimeWindow;
+
+ while (getCurrentTime() < timeToStopPolling) {
+ // if class is being torn down, immediately return
+ if (!mValid.load(std::memory_order_relaxed)) {
+ return std::nullopt;
+ }
+
+ // Check if data is available. If it is, immediately retrieve it and
+ // return.
+ const size_t available = mFmqResultChannel->availableToRead();
+ if (available > 0) {
+ std::vector<FmqResultDatum> packet(available);
+ const bool success = mFmqResultChannel->read(packet.data(), available);
+ if (!success) {
+ LOG(ERROR) << "Error receiving packet";
+ return std::nullopt;
+ }
+ return std::make_optional(std::move(packet));
}
}
+ // If we get to this point, we either stopped polling because it was taking
+ // too long or polling was not allowed. Instead, perform a blocking call
+ // which uses a futex to save power.
+
+ // wait for result packet and read first element of result packet
+ FmqResultDatum datum;
+ bool success = mFmqResultChannel->readBlocking(&datum, 1);
+
// retrieve remaining elements
// NOTE: all of the data is already available at this point, so there's no
// need to do a blocking wait to wait for more data. This is known because
@@ -310,22 +340,21 @@ std::optional<std::vector<FmqResultDatum>> ResultChannelReceiver::getPacketBlock
}
std::pair<std::unique_ptr<RequestChannelSender>, const FmqRequestDescriptor*>
-RequestChannelSender::create(size_t channelLength, bool blocking) {
+RequestChannelSender::create(size_t channelLength) {
std::unique_ptr<FmqRequestChannel> fmqRequestChannel =
- std::make_unique<FmqRequestChannel>(channelLength, /*confEventFlag=*/blocking);
+ std::make_unique<FmqRequestChannel>(channelLength, /*confEventFlag=*/true);
if (!fmqRequestChannel->isValid()) {
LOG(ERROR) << "Unable to create RequestChannelSender";
return {nullptr, nullptr};
}
+
const FmqRequestDescriptor* descriptor = fmqRequestChannel->getDesc();
- return std::make_pair(
- std::make_unique<RequestChannelSender>(std::move(fmqRequestChannel), blocking),
- descriptor);
+ return std::make_pair(std::make_unique<RequestChannelSender>(std::move(fmqRequestChannel)),
+ descriptor);
}
-RequestChannelSender::RequestChannelSender(std::unique_ptr<FmqRequestChannel> fmqRequestChannel,
- bool blocking)
- : mFmqRequestChannel(std::move(fmqRequestChannel)), mBlocking(blocking) {}
+RequestChannelSender::RequestChannelSender(std::unique_ptr<FmqRequestChannel> fmqRequestChannel)
+ : mFmqRequestChannel(std::move(fmqRequestChannel)) {}
bool RequestChannelSender::send(const Request& request, MeasureTiming measure,
const std::vector<int32_t>& slots) {
@@ -344,11 +373,9 @@ bool RequestChannelSender::sendPacket(const std::vector<FmqRequestDatum>& packet
return false;
}
- if (mBlocking) {
- return mFmqRequestChannel->writeBlocking(packet.data(), packet.size());
- } else {
- return mFmqRequestChannel->write(packet.data(), packet.size());
- }
+ // Always send the packet with "blocking" because this signals the futex and
+ // unblocks the consumer if it is waiting on the futex.
+ return mFmqRequestChannel->writeBlocking(packet.data(), packet.size());
}
void RequestChannelSender::invalidate() {
@@ -438,7 +465,7 @@ int32_t ExecutionBurstController::ExecutionBurstCallback::allocateSlotLocked() {
}
std::unique_ptr<ExecutionBurstController> ExecutionBurstController::create(
- const sp<IPreparedModel>& preparedModel, bool blocking) {
+ const sp<IPreparedModel>& preparedModel, std::chrono::microseconds pollingTimeWindow) {
// check inputs
if (preparedModel == nullptr) {
LOG(ERROR) << "ExecutionBurstController::create passed a nullptr";
@@ -450,9 +477,9 @@ std::unique_ptr<ExecutionBurstController> ExecutionBurstController::create(
// create FMQ objects
auto [requestChannelSenderTemp, requestChannelDescriptor] =
- RequestChannelSender::create(kExecutionBurstChannelLength, blocking);
+ RequestChannelSender::create(kExecutionBurstChannelLength);
auto [resultChannelReceiverTemp, resultChannelDescriptor] =
- ResultChannelReceiver::create(kExecutionBurstChannelLength, blocking);
+ ResultChannelReceiver::create(kExecutionBurstChannelLength, pollingTimeWindow);
std::shared_ptr<RequestChannelSender> requestChannelSender =
std::move(requestChannelSenderTemp);
std::shared_ptr<ResultChannelReceiver> resultChannelReceiver =
@@ -543,15 +570,13 @@ static std::tuple<int, std::vector<OutputShape>, Timing, bool> getExecutionResul
return {n, std::move(checkedOutputShapes), checkedTiming, fallback};
}
-std::tuple<ErrorStatus, std::vector<OutputShape>, Timing> ExecutionBurstController::compute(
- const Request& request, MeasureTiming measure, const std::vector<intptr_t>& memoryIds) {
- auto [status, outputShapes, timing, fallback] = tryCompute(request, measure, memoryIds);
- (void)fallback; // ignore fallback field
- return {convertResultCodeToErrorStatus(status), std::move(outputShapes), timing};
-}
-
-std::tuple<int, std::vector<OutputShape>, Timing, bool> ExecutionBurstController::tryCompute(
+std::tuple<int, std::vector<OutputShape>, Timing, bool> ExecutionBurstController::compute(
const Request& request, MeasureTiming measure, const std::vector<intptr_t>& memoryIds) {
+ // This is the first point when we know an execution is occurring, so begin
+ // to collect systraces. Note that the first point we can begin collecting
+ // systraces in ExecutionBurstServer is when the RequestChannelReceiver
+ // realizes there is data in the FMQ, so ExecutionBurstServer collects
+ // systraces at different points in the code.
NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION, "ExecutionBurstController::compute");
std::lock_guard<std::mutex> guard(mMutex);
diff --git a/nn/common/ExecutionBurstServer.cpp b/nn/common/ExecutionBurstServer.cpp
index 74bc34058..ec935dad6 100644
--- a/nn/common/ExecutionBurstServer.cpp
+++ b/nn/common/ExecutionBurstServer.cpp
@@ -20,9 +20,14 @@
#include <android-base/logging.h>
+#include <algorithm>
#include <cstring>
#include <limits>
#include <map>
+#include <memory>
+#include <tuple>
+#include <utility>
+#include <vector>
#include "Tracing.h"
@@ -31,6 +36,8 @@ namespace {
using namespace hal;
+using hardware::MQDescriptorSync;
+
constexpr Timing kNoTiming = {std::numeric_limits<uint64_t>::max(),
std::numeric_limits<uint64_t>::max()};
@@ -298,20 +305,27 @@ std::optional<std::tuple<Request, std::vector<int32_t>, MeasureTiming>> deserial
// RequestChannelReceiver methods
std::unique_ptr<RequestChannelReceiver> RequestChannelReceiver::create(
- const FmqRequestDescriptor& requestChannel) {
+ const FmqRequestDescriptor& requestChannel, std::chrono::microseconds pollingTimeWindow) {
std::unique_ptr<FmqRequestChannel> fmqRequestChannel =
std::make_unique<FmqRequestChannel>(requestChannel);
+
if (!fmqRequestChannel->isValid()) {
LOG(ERROR) << "Unable to create RequestChannelReceiver";
return nullptr;
}
- const bool blocking = fmqRequestChannel->getEventFlagWord() != nullptr;
- return std::make_unique<RequestChannelReceiver>(std::move(fmqRequestChannel), blocking);
+ if (fmqRequestChannel->getEventFlagWord() == nullptr) {
+ LOG(ERROR)
+ << "RequestChannelReceiver::create was passed an MQDescriptor without an EventFlag";
+ return nullptr;
+ }
+
+ return std::make_unique<RequestChannelReceiver>(std::move(fmqRequestChannel),
+ pollingTimeWindow);
}
RequestChannelReceiver::RequestChannelReceiver(std::unique_ptr<FmqRequestChannel> fmqRequestChannel,
- bool blocking)
- : mFmqRequestChannel(std::move(fmqRequestChannel)), mBlocking(blocking) {}
+ std::chrono::microseconds pollingTimeWindow)
+ : mFmqRequestChannel(std::move(fmqRequestChannel)), kPollingTimeWindow(pollingTimeWindow) {}
std::optional<std::tuple<Request, std::vector<int32_t>, MeasureTiming>>
RequestChannelReceiver::getBlocking() {
@@ -328,17 +342,15 @@ void RequestChannelReceiver::invalidate() {
// force unblock
// ExecutionBurstServer is by default waiting on a request packet. If the
- // client process destroys its burst object, the server will still be
- // waiting on the futex (assuming mBlocking is true). This force unblock
- // wakes up any thread waiting on the futex.
- if (mBlocking) {
- // TODO: look for a different/better way to signal/notify the futex to
- // wake up any thread waiting on it
- FmqRequestDatum datum;
- datum.packetInformation({/*.packetSize=*/0, /*.numberOfInputOperands=*/0,
- /*.numberOfOutputOperands=*/0, /*.numberOfPools=*/0});
- mFmqRequestChannel->writeBlocking(&datum, 1);
- }
+ // client process destroys its burst object, the server may still be waiting
+ // on the futex. This force unblock wakes up any thread waiting on the
+ // futex.
+ // TODO: look for a different/better way to signal/notify the futex to wake
+ // up any thread waiting on it
+ FmqRequestDatum datum;
+ datum.packetInformation({/*.packetSize=*/0, /*.numberOfInputOperands=*/0,
+ /*.numberOfOutputOperands=*/0, /*.numberOfPools=*/0});
+ mFmqRequestChannel->writeBlocking(&datum, 1);
}
std::optional<std::vector<FmqRequestDatum>> RequestChannelReceiver::getPacketBlocking() {
@@ -348,17 +360,53 @@ std::optional<std::vector<FmqRequestDatum>> RequestChannelReceiver::getPacketBlo
return std::nullopt;
}
- // wait for request packet and read first element of request packet
- FmqRequestDatum datum;
- bool success = false;
- if (mBlocking) {
- success = mFmqRequestChannel->readBlocking(&datum, 1);
- } else {
- while ((success = !mTeardown.load(std::memory_order_relaxed)) &&
- !mFmqRequestChannel->read(&datum, 1)) {
+ // First spend time polling if results are available in FMQ instead of
+ // waiting on the futex. Polling is more responsive (yielding lower
+ // latencies), but can take up more power, so only poll for a limited period
+ // of time.
+
+ auto& getCurrentTime = std::chrono::high_resolution_clock::now;
+ const auto timeToStopPolling = getCurrentTime() + kPollingTimeWindow;
+
+ while (getCurrentTime() < timeToStopPolling) {
+ // if class is being torn down, immediately return
+ if (mTeardown.load(std::memory_order_relaxed)) {
+ return std::nullopt;
+ }
+
+ // Check if data is available. If it is, immediately retrieve it and
+ // return.
+ const size_t available = mFmqRequestChannel->availableToRead();
+ if (available > 0) {
+ // This is the first point when we know an execution is occurring,
+ // so begin to collect systraces. Note that a similar systrace does
+ // not exist at the corresponding point in
+ // ResultChannelReceiver::getPacketBlocking because the execution is
+ // already in flight.
+ NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
+ "ExecutionBurstServer getting packet");
+ std::vector<FmqRequestDatum> packet(available);
+ const bool success = mFmqRequestChannel->read(packet.data(), available);
+ if (!success) {
+ LOG(ERROR) << "Error receiving packet";
+ return std::nullopt;
+ }
+ return std::make_optional(std::move(packet));
}
}
+ // If we get to this point, we either stopped polling because it was taking
+ // too long or polling was not allowed. Instead, perform a blocking call
+ // which uses a futex to save power.
+
+ // wait for request packet and read first element of request packet
+ FmqRequestDatum datum;
+ bool success = mFmqRequestChannel->readBlocking(&datum, 1);
+
+ // This is the first point when we know an execution is occurring, so begin
+ // to collect systraces. Note that a similar systrace does not exist at the
+ // corresponding point in ResultChannelReceiver::getPacketBlocking because
+ // the execution is already in flight.
NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION, "ExecutionBurstServer getting packet");
// retrieve remaining elements
@@ -393,17 +441,21 @@ std::unique_ptr<ResultChannelSender> ResultChannelSender::create(
const FmqResultDescriptor& resultChannel) {
std::unique_ptr<FmqResultChannel> fmqResultChannel =
std::make_unique<FmqResultChannel>(resultChannel);
+
if (!fmqResultChannel->isValid()) {
LOG(ERROR) << "Unable to create RequestChannelSender";
return nullptr;
}
- const bool blocking = fmqResultChannel->getEventFlagWord() != nullptr;
- return std::make_unique<ResultChannelSender>(std::move(fmqResultChannel), blocking);
+ if (fmqResultChannel->getEventFlagWord() == nullptr) {
+ LOG(ERROR) << "ResultChannelSender::create was passed an MQDescriptor without an EventFlag";
+ return nullptr;
+ }
+
+ return std::make_unique<ResultChannelSender>(std::move(fmqResultChannel));
}
-ResultChannelSender::ResultChannelSender(std::unique_ptr<FmqResultChannel> fmqResultChannel,
- bool blocking)
- : mFmqResultChannel(std::move(fmqResultChannel)), mBlocking(blocking) {}
+ResultChannelSender::ResultChannelSender(std::unique_ptr<FmqResultChannel> fmqResultChannel)
+ : mFmqResultChannel(std::move(fmqResultChannel)) {}
bool ResultChannelSender::send(ErrorStatus errorStatus,
const std::vector<OutputShape>& outputShapes, Timing timing) {
@@ -417,18 +469,15 @@ bool ResultChannelSender::sendPacket(const std::vector<FmqResultDatum>& packet)
<< "ResultChannelSender::sendPacket -- packet size exceeds size available in FMQ";
const std::vector<FmqResultDatum> errorPacket =
serialize(ErrorStatus::GENERAL_FAILURE, {}, kNoTiming);
- if (mBlocking) {
- return mFmqResultChannel->writeBlocking(errorPacket.data(), errorPacket.size());
- } else {
- return mFmqResultChannel->write(errorPacket.data(), errorPacket.size());
- }
- }
- if (mBlocking) {
- return mFmqResultChannel->writeBlocking(packet.data(), packet.size());
- } else {
- return mFmqResultChannel->write(packet.data(), packet.size());
+ // Always send the packet with "blocking" because this signals the futex
+ // and unblocks the consumer if it is waiting on the futex.
+ return mFmqResultChannel->writeBlocking(errorPacket.data(), errorPacket.size());
}
+
+ // Always send the packet with "blocking" because this signals the futex and
+ // unblocks the consumer if it is waiting on the futex.
+ return mFmqResultChannel->writeBlocking(packet.data(), packet.size());
}
// ExecutionBurstServer methods
@@ -436,7 +485,8 @@ bool ResultChannelSender::sendPacket(const std::vector<FmqResultDatum>& packet)
sp<ExecutionBurstServer> ExecutionBurstServer::create(
const sp<IBurstCallback>& callback, const MQDescriptorSync<FmqRequestDatum>& requestChannel,
const MQDescriptorSync<FmqResultDatum>& resultChannel,
- std::shared_ptr<IBurstExecutorWithCache> executorWithCache) {
+ std::shared_ptr<IBurstExecutorWithCache> executorWithCache,
+ std::chrono::microseconds pollingTimeWindow) {
// check inputs
if (callback == nullptr || executorWithCache == nullptr) {
LOG(ERROR) << "ExecutionBurstServer::create passed a nullptr";
@@ -445,7 +495,7 @@ sp<ExecutionBurstServer> ExecutionBurstServer::create(
// create FMQ objects
std::unique_ptr<RequestChannelReceiver> requestChannelReceiver =
- RequestChannelReceiver::create(requestChannel);
+ RequestChannelReceiver::create(requestChannel, pollingTimeWindow);
std::unique_ptr<ResultChannelSender> resultChannelSender =
ResultChannelSender::create(resultChannel);
@@ -462,7 +512,8 @@ sp<ExecutionBurstServer> ExecutionBurstServer::create(
sp<ExecutionBurstServer> ExecutionBurstServer::create(
const sp<IBurstCallback>& callback, const MQDescriptorSync<FmqRequestDatum>& requestChannel,
- const MQDescriptorSync<FmqResultDatum>& resultChannel, IPreparedModel* preparedModel) {
+ const MQDescriptorSync<FmqResultDatum>& resultChannel, IPreparedModel* preparedModel,
+ std::chrono::microseconds pollingTimeWindow) {
// check relevant input
if (preparedModel == nullptr) {
LOG(ERROR) << "ExecutionBurstServer::create passed a nullptr";
@@ -475,7 +526,7 @@ sp<ExecutionBurstServer> ExecutionBurstServer::create(
// make and return context
return ExecutionBurstServer::create(callback, requestChannel, resultChannel,
- preparedModelAdapter);
+ preparedModelAdapter, pollingTimeWindow);
}
ExecutionBurstServer::ExecutionBurstServer(
diff --git a/nn/common/include/ExecutionBurstController.h b/nn/common/include/ExecutionBurstController.h
index 6328096b0..652b0d911 100644
--- a/nn/common/include/ExecutionBurstController.h
+++ b/nn/common/include/ExecutionBurstController.h
@@ -17,18 +17,21 @@
#ifndef ANDROID_FRAMEWORKS_ML_NN_COMMON_EXECUTION_BURST_CONTROLLER_H
#define ANDROID_FRAMEWORKS_ML_NN_COMMON_EXECUTION_BURST_CONTROLLER_H
-#include "HalInterfaces.h"
-
#include <android-base/macros.h>
#include <fmq/MessageQueue.h>
#include <hidl/MQDescriptor.h>
#include <atomic>
+#include <chrono>
#include <map>
#include <memory>
#include <mutex>
#include <stack>
#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "HalInterfaces.h"
namespace android::nn {
@@ -70,10 +73,10 @@ std::optional<std::tuple<hal::ErrorStatus, std::vector<hal::OutputShape>, hal::T
*
* Because the receiver can wait on a packet that may never come (e.g., because
* the sending side of the packet has been closed), this object can be
- * invalidating, unblocking the receiver.
+ * invalidated, unblocking the receiver.
*/
class ResultChannelReceiver {
- using FmqResultDescriptor = ::android::hardware::MQDescriptorSync<hal::FmqResultDatum>;
+ using FmqResultDescriptor = hardware::MQDescriptorSync<hal::FmqResultDatum>;
using FmqResultChannel =
hardware::MessageQueue<hal::FmqResultDatum, hardware::kSynchronizedReadWrite>;
@@ -84,13 +87,15 @@ class ResultChannelReceiver {
* Prefer this call over the constructor.
*
* @param channelLength Number of elements in the FMQ.
- * @param blocking 'true' if FMQ should use futex, 'false' if it should
- * spin-wait.
+ * @param pollingTimeWindow How much time (in microseconds) the
+ * ResultChannelReceiver is allowed to poll the FMQ before waiting on
+ * the blocking futex. Polling may result in lower latencies at the
+ * potential cost of more power usage.
* @return A pair of ResultChannelReceiver and the FMQ descriptor on
* successful creation, both nullptr otherwise.
*/
static std::pair<std::unique_ptr<ResultChannelReceiver>, const FmqResultDescriptor*> create(
- size_t channelLength, bool blocking);
+ size_t channelLength, std::chrono::microseconds pollingTimeWindow);
/**
* Get the result from the channel.
@@ -114,12 +119,13 @@ class ResultChannelReceiver {
// prefer calling ResultChannelReceiver::getBlocking
std::optional<std::vector<hal::FmqResultDatum>> getPacketBlocking();
- ResultChannelReceiver(std::unique_ptr<FmqResultChannel> fmqResultChannel, bool blocking);
+ ResultChannelReceiver(std::unique_ptr<FmqResultChannel> fmqResultChannel,
+ std::chrono::microseconds pollingTimeWindow);
private:
const std::unique_ptr<FmqResultChannel> mFmqResultChannel;
std::atomic<bool> mValid{true};
- const bool mBlocking;
+ const std::chrono::microseconds kPollingTimeWindow;
};
/**
@@ -128,7 +134,7 @@ class ResultChannelReceiver {
* available.
*/
class RequestChannelSender {
- using FmqRequestDescriptor = ::android::hardware::MQDescriptorSync<hal::FmqRequestDatum>;
+ using FmqRequestDescriptor = hardware::MQDescriptorSync<hal::FmqRequestDatum>;
using FmqRequestChannel =
hardware::MessageQueue<hal::FmqRequestDatum, hardware::kSynchronizedReadWrite>;
@@ -139,13 +145,11 @@ class RequestChannelSender {
* Prefer this call over the constructor.
*
* @param channelLength Number of elements in the FMQ.
- * @param blocking 'true' if FMQ should use futex, 'false' if it should
- * spin-wait.
* @return A pair of ResultChannelReceiver and the FMQ descriptor on
* successful creation, both nullptr otherwise.
*/
static std::pair<std::unique_ptr<RequestChannelSender>, const FmqRequestDescriptor*> create(
- size_t channelLength, bool blocking);
+ size_t channelLength);
/**
* Send the request to the channel.
@@ -169,12 +173,11 @@ class RequestChannelSender {
// prefer calling RequestChannelSender::send
bool sendPacket(const std::vector<hal::FmqRequestDatum>& packet);
- RequestChannelSender(std::unique_ptr<FmqRequestChannel> fmqRequestChannel, bool blocking);
+ RequestChannelSender(std::unique_ptr<FmqRequestChannel> fmqRequestChannel);
private:
const std::unique_ptr<FmqRequestChannel> mFmqRequestChannel;
std::atomic<bool> mValid{true};
- const bool mBlocking;
};
/**
@@ -260,15 +263,15 @@ class ExecutionBurstController {
* Prefer this over ExecutionBurstController's constructor.
*
* @param preparedModel Model prepared for execution to execute on.
- * @param blocking 'true' if the FMQ should use a futex to perform blocking
- * until data is available in a less responsive, but more energy
- * efficient manner. 'false' if the FMQ should use spin-looping to
- * wait until data is available in a more responsive, but less energy
- * efficient manner.
+ * @param pollingTimeWindow How much time (in microseconds) the
+ * ExecutionBurstController is allowed to poll the FMQ before waiting on
+ * the blocking futex. Polling may result in lower latencies at the
+ * potential cost of more power usage.
* @return ExecutionBurstController Execution burst controller object.
*/
static std::unique_ptr<ExecutionBurstController> create(
- const sp<hal::IPreparedModel>& preparedModel, bool blocking);
+ const sp<hal::IPreparedModel>& preparedModel,
+ std::chrono::microseconds pollingTimeWindow);
// prefer calling ExecutionBurstController::create
ExecutionBurstController(const std::shared_ptr<RequestChannelSender>& requestChannelSender,
@@ -288,34 +291,13 @@ class ExecutionBurstController {
* @param memoryIds Identifiers corresponding to each memory object in the
* request's pools.
* @return A tuple of:
- * - status of the execution
- * - dynamic output shapes from the execution
- * - any execution time measurements of the execution
- */
- std::tuple<hal::ErrorStatus, std::vector<hal::OutputShape>, hal::Timing> compute(
- const hal::Request& request, hal::MeasureTiming measure,
- const std::vector<intptr_t>& memoryIds);
-
- // TODO: combine "compute" and "tryCompute" back into a single function.
- // "tryCompute" was created later to return the "fallback" boolean. This
- // could not be done directly in "compute" because the VTS test cases (which
- // test burst using "compute") had already been locked down and could not be
- // changed.
- /**
- * Execute a request on a model.
- *
- * @param request Arguments to be executed on a model.
- * @param measure Whether to collect timing measurements, either YES or NO
- * @param memoryIds Identifiers corresponding to each memory object in the
- * request's pools.
- * @return A tuple of:
* - result code of the execution
* - dynamic output shapes from the execution
* - any execution time measurements of the execution
* - whether or not a failed burst execution should be re-run using a
* different path (e.g., IPreparedModel::executeSynchronously)
*/
- std::tuple<int, std::vector<hal::OutputShape>, hal::Timing, bool> tryCompute(
+ std::tuple<int, std::vector<hal::OutputShape>, hal::Timing, bool> compute(
const hal::Request& request, hal::MeasureTiming measure,
const std::vector<intptr_t>& memoryIds);
diff --git a/nn/common/include/ExecutionBurstServer.h b/nn/common/include/ExecutionBurstServer.h
index 977d0d375..9da0dc742 100644
--- a/nn/common/include/ExecutionBurstServer.h
+++ b/nn/common/include/ExecutionBurstServer.h
@@ -17,23 +17,24 @@
#ifndef ANDROID_FRAMEWORKS_ML_NN_COMMON_EXECUTION_BURST_SERVER_H
#define ANDROID_FRAMEWORKS_ML_NN_COMMON_EXECUTION_BURST_SERVER_H
-#include "HalInterfaces.h"
-
#include <android-base/macros.h>
#include <fmq/MessageQueue.h>
#include <hidl/MQDescriptor.h>
#include <atomic>
+#include <chrono>
#include <memory>
#include <optional>
#include <thread>
+#include <tuple>
#include <vector>
+#include "HalInterfaces.h"
+
namespace android::nn {
-using hardware::MQDescriptorSync;
-using FmqRequestDescriptor = MQDescriptorSync<hal::FmqRequestDatum>;
-using FmqResultDescriptor = MQDescriptorSync<hal::FmqResultDatum>;
+using FmqRequestDescriptor = hardware::MQDescriptorSync<hal::FmqRequestDatum>;
+using FmqResultDescriptor = hardware::MQDescriptorSync<hal::FmqResultDatum>;
/**
* Function to serialize results.
@@ -69,7 +70,7 @@ std::optional<std::tuple<hal::Request, std::vector<int32_t>, hal::MeasureTiming>
*
* Because the receiver can wait on a packet that may never come (e.g., because
* the sending side of the packet has been closed), this object can be
- * invalidating, unblocking the receiver.
+ * invalidated, unblocking the receiver.
*/
class RequestChannelReceiver {
using FmqRequestChannel =
@@ -82,10 +83,15 @@ class RequestChannelReceiver {
* Prefer this call over the constructor.
*
* @param requestChannel Descriptor for the request channel.
+ * @param pollingTimeWindow How much time (in microseconds) the
+ * RequestChannelReceiver is allowed to poll the FMQ before waiting on
+ * the blocking futex. Polling may result in lower latencies at the
+ * potential cost of more power usage.
* @return RequestChannelReceiver on successful creation, nullptr otherwise.
*/
static std::unique_ptr<RequestChannelReceiver> create(
- const FmqRequestDescriptor& requestChannel);
+ const FmqRequestDescriptor& requestChannel,
+ std::chrono::microseconds pollingTimeWindow);
/**
* Get the request from the channel.
@@ -105,14 +111,15 @@ class RequestChannelReceiver {
*/
void invalidate();
- RequestChannelReceiver(std::unique_ptr<FmqRequestChannel> fmqRequestChannel, bool blocking);
+ RequestChannelReceiver(std::unique_ptr<FmqRequestChannel> fmqRequestChannel,
+ std::chrono::microseconds pollingTimeWindow);
private:
std::optional<std::vector<hal::FmqRequestDatum>> getPacketBlocking();
const std::unique_ptr<FmqRequestChannel> mFmqRequestChannel;
std::atomic<bool> mTeardown{false};
- const bool mBlocking;
+ const std::chrono::microseconds kPollingTimeWindow;
};
/**
@@ -149,11 +156,10 @@ class ResultChannelSender {
// prefer calling ResultChannelSender::send
bool sendPacket(const std::vector<hal::FmqResultDatum>& packet);
- ResultChannelSender(std::unique_ptr<FmqResultChannel> fmqResultChannel, bool blocking);
+ ResultChannelSender(std::unique_ptr<FmqResultChannel> fmqResultChannel);
private:
const std::unique_ptr<FmqResultChannel> mFmqResultChannel;
- const bool mBlocking;
};
/**
@@ -247,12 +253,17 @@ class ExecutionBurstServer : public hal::IBurstContext {
* the result of the execution.
* @param executorWithCache Object which maintains a local cache of the
* memory pools and executes using the cached memory pools.
+ * @param pollingTimeWindow How much time (in microseconds) the
+ * ExecutionBurstServer is allowed to poll the FMQ before waiting on
+ * the blocking futex. Polling may result in lower latencies at the
+ * potential cost of more power usage.
* @result IBurstContext Handle to the burst context.
*/
static sp<ExecutionBurstServer> create(
const sp<hal::IBurstCallback>& callback, const FmqRequestDescriptor& requestChannel,
const FmqResultDescriptor& resultChannel,
- std::shared_ptr<IBurstExecutorWithCache> executorWithCache);
+ std::shared_ptr<IBurstExecutorWithCache> executorWithCache,
+ std::chrono::microseconds pollingTimeWindow = std::chrono::microseconds{0});
/**
* Create automated context to manage FMQ-based executions.
@@ -271,12 +282,16 @@ class ExecutionBurstServer : public hal::IBurstContext {
* @param preparedModel PreparedModel that the burst object was created from.
* IPreparedModel::executeSynchronously will be used to perform the
* execution.
+ * @param pollingTimeWindow How much time (in microseconds) the
+ * ExecutionBurstServer is allowed to poll the FMQ before waiting on
+ * the blocking futex. Polling may result in lower latencies at the
+ * potential cost of more power usage.
* @result IBurstContext Handle to the burst context.
*/
- static sp<ExecutionBurstServer> create(const sp<hal::IBurstCallback>& callback,
- const FmqRequestDescriptor& requestChannel,
- const FmqResultDescriptor& resultChannel,
- hal::IPreparedModel* preparedModel);
+ static sp<ExecutionBurstServer> create(
+ const sp<hal::IBurstCallback>& callback, const FmqRequestDescriptor& requestChannel,
+ const FmqResultDescriptor& resultChannel, hal::IPreparedModel* preparedModel,
+ std::chrono::microseconds pollingTimeWindow = std::chrono::microseconds{0});
ExecutionBurstServer(const sp<hal::IBurstCallback>& callback,
std::unique_ptr<RequestChannelReceiver> requestChannel,
diff --git a/nn/driver/sample/SampleDriver.cpp b/nn/driver/sample/SampleDriver.cpp
index 0448c2d79..50cb7729a 100644
--- a/nn/driver/sample/SampleDriver.cpp
+++ b/nn/driver/sample/SampleDriver.cpp
@@ -19,6 +19,7 @@
#include "SampleDriver.h"
#include <android-base/logging.h>
+#include <android-base/properties.h>
#include <hidl/LegacySupport.h>
#include <algorithm>
@@ -185,9 +186,9 @@ Return<ErrorStatus> prepareModelBase(const T_Model& model, const SampleDriver* d
}
// asynchronously prepare the model from a new, detached thread
- std::thread([model, driver, callback] {
+ std::thread([model, driver, preference, callback] {
sp<SamplePreparedModel> preparedModel =
- new SamplePreparedModel(convertToV1_3(model), driver);
+ new SamplePreparedModel(convertToV1_3(model), driver, preference);
if (!preparedModel->initialize()) {
notify(callback, ErrorStatus::INVALID_ARGUMENT, nullptr);
return;
@@ -472,6 +473,22 @@ class BurstExecutorWithCache : public ExecutionBurstServer::IBurstExecutorWithCa
std::map<int32_t, std::optional<RunTimePoolInfo>> mMemoryCache; // cached requestPoolInfos
};
+// This is the amount of time the ExecutionBurstServer should spend polling the
+// FMQ to see if it has data available before it should fall back to waiting on
+// the futex.
+static std::chrono::microseconds getPollingTimeWindow() {
+ constexpr int32_t defaultPollingTimeWindow = 50;
+#ifdef NN_DEBUGGABLE
+ constexpr int32_t minPollingTimeWindow = 0;
+ const int32_t selectedPollingTimeWindow =
+ base::GetIntProperty("debug.nn.sample-driver-burst-polling-window",
+ defaultPollingTimeWindow, minPollingTimeWindow);
+ return std::chrono::microseconds{selectedPollingTimeWindow};
+#else
+ return std::chrono::microseconds{defaultPollingTimeWindow};
+#endif // NN_DEBUGGABLE
+}
+
Return<void> SamplePreparedModel::configureExecutionBurst(
const sp<V1_2::IBurstCallback>& callback,
const MQDescriptorSync<V1_2::FmqRequestDatum>& requestChannel,
@@ -480,17 +497,22 @@ Return<void> SamplePreparedModel::configureExecutionBurst(
NNTRACE_FULL(NNTRACE_LAYER_DRIVER, NNTRACE_PHASE_EXECUTION,
"SampleDriver::configureExecutionBurst");
+ const bool preferPowerOverLatency = (kPreference == hal::ExecutionPreference::LOW_POWER);
+ const auto pollingTimeWindow =
+ (preferPowerOverLatency ? std::chrono::microseconds{0} : getPollingTimeWindow());
+
// Alternatively, the burst could be configured via:
// const sp<V1_2::IBurstContext> burst =
// ExecutionBurstServer::create(callback, requestChannel,
- // resultChannel, this);
+ // resultChannel, this,
+ // pollingTimeWindow);
//
// However, this alternative representation does not include a memory map
// caching optimization, and adds overhead.
const std::shared_ptr<BurstExecutorWithCache> executorWithCache =
std::make_shared<BurstExecutorWithCache>(mModel, mDriver, mPoolInfos);
const sp<V1_2::IBurstContext> burst = ExecutionBurstServer::create(
- callback, requestChannel, resultChannel, executorWithCache);
+ callback, requestChannel, resultChannel, executorWithCache, pollingTimeWindow);
if (burst == nullptr) {
cb(ErrorStatus::GENERAL_FAILURE, {});
diff --git a/nn/driver/sample/SampleDriver.h b/nn/driver/sample/SampleDriver.h
index a85dcd5ea..8788ed3a8 100644
--- a/nn/driver/sample/SampleDriver.h
+++ b/nn/driver/sample/SampleDriver.h
@@ -91,8 +91,9 @@ class SampleDriver : public hal::IDevice {
class SamplePreparedModel : public hal::IPreparedModel {
public:
- SamplePreparedModel(const hal::Model& model, const SampleDriver* driver)
- : mModel(model), mDriver(driver) {}
+ SamplePreparedModel(const hal::Model& model, const SampleDriver* driver,
+ hal::ExecutionPreference preference)
+ : mModel(model), mDriver(driver), kPreference(preference) {}
~SamplePreparedModel() override {}
bool initialize();
hal::Return<hal::ErrorStatus> execute(
@@ -113,6 +114,7 @@ class SamplePreparedModel : public hal::IPreparedModel {
hal::Model mModel;
const SampleDriver* mDriver;
std::vector<RunTimePoolInfo> mPoolInfos;
+ const hal::ExecutionPreference kPreference;
};
} // namespace sample_driver
diff --git a/nn/runtime/CompilationBuilder.cpp b/nn/runtime/CompilationBuilder.cpp
index 912f0087b..be0260ca2 100644
--- a/nn/runtime/CompilationBuilder.cpp
+++ b/nn/runtime/CompilationBuilder.cpp
@@ -18,6 +18,11 @@
#include "CompilationBuilder.h"
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
#include "BurstBuilder.h"
#include "ExecutionBuilder.h"
#include "ExecutionBurstController.h"
@@ -156,7 +161,8 @@ int CompilationBuilder::createBurst(BurstBuilder** burst) {
*burst = nullptr;
return ANEURALNETWORKS_BAD_STATE;
}
- std::vector<std::shared_ptr<ExecutionBurstController>> burstControllers = mPlan.makeBursts();
+ std::vector<std::shared_ptr<ExecutionBurstController>> burstControllers =
+ mPlan.makeBursts(mPreference);
*burst = new (std::nothrow) BurstBuilder(this, std::move(burstControllers));
return (*burst ? ANEURALNETWORKS_NO_ERROR : ANEURALNETWORKS_OUT_OF_MEMORY);
}
diff --git a/nn/runtime/ExecutionPlan.cpp b/nn/runtime/ExecutionPlan.cpp
index 5f656e1c2..901305216 100644
--- a/nn/runtime/ExecutionPlan.cpp
+++ b/nn/runtime/ExecutionPlan.cpp
@@ -557,7 +557,8 @@ ExecutionPlan::Controller::Controller(
// indicate the regular execution path should be used. This can occur either
// because PreparedModel was nullptr (cpu was best choice), or because the
// IPreparedModel was of insufficient version or failed to configure the burst.
-std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts() const {
+std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts(
+ int preference) const {
switch (mState) {
// burst object for each partition in the compound case
case COMPOUND: {
@@ -565,7 +566,10 @@ std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts
bursts.reserve(compound()->mSteps.size());
for (const auto& step : compound()->mSteps) {
if (const auto preparedModel = step->getPreparedSubModel()) {
- bursts.push_back(preparedModel->configureExecutionBurst(/*blocking=*/true));
+ const bool preferPowerOverLatency =
+ (preference == ANEURALNETWORKS_PREFER_LOW_POWER);
+ bursts.push_back(
+ preparedModel->configureExecutionBurst(preferPowerOverLatency));
} else {
bursts.push_back(nullptr);
}
@@ -577,7 +581,9 @@ std::vector<std::shared_ptr<ExecutionBurstController>> ExecutionPlan::makeBursts
std::vector<std::shared_ptr<ExecutionBurstController>> burst;
auto simpleBody = simple();
if (const auto preparedModel = simpleBody->mPreparedModel) {
- burst.push_back(preparedModel->configureExecutionBurst(/*blocking=*/true));
+ const bool preferPowerOverLatency =
+ (preference == ANEURALNETWORKS_PREFER_LOW_POWER);
+ burst.push_back(preparedModel->configureExecutionBurst(preferPowerOverLatency));
} else {
burst.push_back(nullptr);
}
diff --git a/nn/runtime/ExecutionPlan.h b/nn/runtime/ExecutionPlan.h
index cd3c01848..8fa7083c1 100644
--- a/nn/runtime/ExecutionPlan.h
+++ b/nn/runtime/ExecutionPlan.h
@@ -209,7 +209,7 @@ class ExecutionPlan {
size_t mNextStepIndex;
};
- std::vector<std::shared_ptr<ExecutionBurstController>> makeBursts() const;
+ std::vector<std::shared_ptr<ExecutionBurstController>> makeBursts(int preference) const;
std::shared_ptr<Controller> makeController(ExecutionBuilder* executionBuilder,
const BurstBuilder* burstBuilder) const;
diff --git a/nn/runtime/Manager.cpp b/nn/runtime/Manager.cpp
index 6a3882dc1..34378b3fd 100644
--- a/nn/runtime/Manager.cpp
+++ b/nn/runtime/Manager.cpp
@@ -524,10 +524,10 @@ std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
memoryIds.push_back(memory->getKey());
}
- VLOG(EXECUTION) << "Before ExecutionBurstController->tryCompute() "
+ VLOG(EXECUTION) << "Before ExecutionBurstController->compute() "
<< SHOW_IF_DEBUG(toString(request));
std::tie(n, outputShapes, timing, burstFallback) =
- burstController->tryCompute(request, measure, memoryIds);
+ burstController->compute(request, measure, memoryIds);
}
// compute from IPreparedModel if either:
diff --git a/nn/runtime/VersionedInterfaces.cpp b/nn/runtime/VersionedInterfaces.cpp
index 325b75c0e..ba6e2af7c 100644
--- a/nn/runtime/VersionedInterfaces.cpp
+++ b/nn/runtime/VersionedInterfaces.cpp
@@ -19,9 +19,11 @@
#include "VersionedInterfaces.h"
#include <android-base/logging.h>
+#include <android-base/properties.h>
#include <android-base/scopeguard.h>
#include <android-base/thread_annotations.h>
+#include <chrono>
#include <functional>
#include <memory>
#include <string>
@@ -276,12 +278,30 @@ std::tuple<int, std::vector<OutputShape>, Timing> VersionedIPreparedModel::execu
return executeAsynchronously(request, measure);
}
+// This is the amount of time the ExecutionBurstController should spend polling
+// the FMQ to see if it has data available before it should fall back to
+// waiting on the futex.
+static std::chrono::microseconds getPollingTimeWindow() {
+ constexpr int32_t defaultPollingTimeWindow = 50;
+#ifdef NN_DEBUGGABLE
+ constexpr int32_t minPollingTimeWindow = 0;
+ const int32_t selectedPollingTimeWindow =
+            base::GetIntProperty("debug.nn.burst-controller-polling-window",
+ defaultPollingTimeWindow, minPollingTimeWindow);
+ return std::chrono::microseconds{selectedPollingTimeWindow};
+#else
+ return std::chrono::microseconds{defaultPollingTimeWindow};
+#endif // NN_DEBUGGABLE
+}
+
std::shared_ptr<ExecutionBurstController> VersionedIPreparedModel::configureExecutionBurst(
- bool blocking) const {
+ bool preferPowerOverLatency) const {
if (mPreparedModelV1_2 == nullptr) {
return nullptr;
}
- return ExecutionBurstController::create(mPreparedModelV1_2, blocking);
+ const auto pollingTimeWindow =
+ (preferPowerOverLatency ? std::chrono::microseconds{0} : getPollingTimeWindow());
+ return ExecutionBurstController::create(mPreparedModelV1_2, pollingTimeWindow);
}
std::shared_ptr<VersionedIDevice> VersionedIDevice::create(std::string serviceName,
diff --git a/nn/runtime/VersionedInterfaces.h b/nn/runtime/VersionedInterfaces.h
index 8665745b6..87e776507 100644
--- a/nn/runtime/VersionedInterfaces.h
+++ b/nn/runtime/VersionedInterfaces.h
@@ -687,12 +687,16 @@ class VersionedIPreparedModel {
/**
* Creates a burst controller on a prepared model.
*
- * @param blocking 'true' if the FMQ should block until data is available.
+ * @param preferPowerOverLatency 'true' if the Burst object should run in a
+ * more power efficient mode, 'false' if more
+ * power can be used to possibly reduce
+ * burst compute latency.
* @return ExecutionBurstController Execution burst controller object.
* nullptr is returned if the burst cannot
* be configured for any reason.
*/
- std::shared_ptr<ExecutionBurstController> configureExecutionBurst(bool blocking) const;
+ std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
+ bool preferPowerOverLatency) const;
private:
std::tuple<int, std::vector<hal::OutputShape>, hal::Timing> executeAsynchronously(
diff --git a/nn/runtime/test/TestIntrospectionControl.cpp b/nn/runtime/test/TestIntrospectionControl.cpp
index 84617d9b0..9d0cbe6c3 100644
--- a/nn/runtime/test/TestIntrospectionControl.cpp
+++ b/nn/runtime/test/TestIntrospectionControl.cpp
@@ -16,6 +16,7 @@
#include <gtest/gtest.h>
+#include <chrono>
#include <iterator>
#include <map>
#include <queue>
@@ -309,7 +310,8 @@ std::set<Success> expectedPassSet = {Success::PASS_NEITHER, Success::PASS_DEVICE
class TestPreparedModel12 : public SamplePreparedModel {
public:
TestPreparedModel12(const HidlModel& model, const SampleDriver* driver, Success success)
- : SamplePreparedModel(model, driver), mSuccess(success) {}
+ : SamplePreparedModel(model, driver, ExecutionPreference::FAST_SINGLE_ANSWER),
+ mSuccess(success) {}
Return<ErrorStatus> execute(const Request&,
const sp<V1_0::IExecutionCallback>& callback) override {
@@ -384,8 +386,8 @@ class TestPreparedModel12 : public SamplePreparedModel {
const MQDescriptorSync<V1_2::FmqRequestDatum>& requestChannel,
const MQDescriptorSync<V1_2::FmqResultDatum>& resultChannel,
configureExecutionBurst_cb cb) override {
- const sp<V1_2::IBurstContext> burst =
- ExecutionBurstServer::create(callback, requestChannel, resultChannel, this);
+ const sp<V1_2::IBurstContext> burst = ExecutionBurstServer::create(
+ callback, requestChannel, resultChannel, this, std::chrono::microseconds{0});
cb(burst == nullptr ? ErrorStatus::GENERAL_FAILURE : ErrorStatus::NONE, burst);
return Void();