summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorXusong Wang <xusongw@google.com>2019-07-02 13:53:25 -0700
committerSlava Shklyaev <slavash@google.com>2019-08-16 13:07:33 +0100
commit0480f1b27f4872bb81202f111ad434aaa8363882 (patch)
tree6d8f2c9046ce34bcd509797c8a096b7b4990ac40
parentd3794f86d973565ded716f5930e447342b84366b (diff)
downloadml-0480f1b27f4872bb81202f111ad434aaa8363882.tar.gz
Wrap the CpuExecutor as a device during execution step.
Create a common interface for abstraction of NNAPI CPU preparedModel as well as the actual driver preparedModels. Modify the execution builder to use Device and PreparedModel. Prior to this CL, the driver and cpu execution paths have a non-trivial difference related to data copying: * In the driver execution path, since HIDL cannot take raw pointers, we allocate separate memory pools for inputs and outputs specified by pointers. * In the cpu execution path, we take the raw pointers directly with no additional data copying. This behavior is not changed in this CL: the execute interface of PreparedModel uses ModelArgumentInfo as Request and MemoryTracker as memory pool directly, so that the driver path can append new memory pools and update data location info. Fix: 123178652 Bug: 70692640 Bug: 72506261 Test: NNT_static Change-Id: Ifa58bfe1b6e4a9d7e4c1752bb1109f7f5878718c Merged-In: Ifa58bfe1b6e4a9d7e4c1752bb1109f7f5878718c (cherry picked from commit 52d2df8adbb2ac40a8c99b9bd655f7718ca07901)
-rw-r--r--nn/runtime/Android.bp1
-rw-r--r--nn/runtime/ExecutionBuilder.cpp393
-rw-r--r--nn/runtime/ExecutionBuilder.h55
-rw-r--r--nn/runtime/ExecutionPlan.cpp40
-rw-r--r--nn/runtime/ExecutionPlan.h28
-rw-r--r--nn/runtime/Manager.cpp398
-rw-r--r--nn/runtime/Manager.h53
-rw-r--r--nn/runtime/ModelArgumentInfo.cpp115
-rw-r--r--nn/runtime/ModelArgumentInfo.h57
9 files changed, 667 insertions, 473 deletions
diff --git a/nn/runtime/Android.bp b/nn/runtime/Android.bp
index b4cc1b48b..0c6a3fb3b 100644
--- a/nn/runtime/Android.bp
+++ b/nn/runtime/Android.bp
@@ -43,6 +43,7 @@ cc_library {
"ExecutionPlan.cpp",
"Manager.cpp",
"Memory.cpp",
+ "ModelArgumentInfo.cpp",
"ModelBuilder.cpp",
"NeuralNetworks.cpp",
"TypeManager.cpp",
diff --git a/nn/runtime/ExecutionBuilder.cpp b/nn/runtime/ExecutionBuilder.cpp
index 57d871604..d65936985 100644
--- a/nn/runtime/ExecutionBuilder.cpp
+++ b/nn/runtime/ExecutionBuilder.cpp
@@ -18,21 +18,26 @@
#include "ExecutionBuilder.h"
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <string>
+#include <thread>
+#include <utility>
+#include <vector>
+
#include "CompilationBuilder.h"
#include "CpuExecutor.h"
#include "ExecutionBurstController.h"
#include "HalInterfaces.h"
#include "Manager.h"
+#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "Tracing.h"
#include "TypeManager.h"
#include "Utils.h"
-#include <mutex>
-#include <optional>
-#include <thread>
-#include <vector>
-
namespace android {
namespace nn {
@@ -81,86 +86,6 @@ static bool checkDimensionInfo(const Operand& operand, const ANeuralNetworksOper
return true;
}
-int ModelArgumentInfo::setFromPointer(const Operand& operand,
- const ANeuralNetworksOperandType* type, void* data,
- uint32_t length) {
- if ((data == nullptr) != (length == 0)) {
- const char* dataPtrMsg = data ? "NOT_NULLPTR" : "NULLPTR";
- LOG(ERROR) << "Data pointer must be nullptr if and only if length is zero (data = "
- << dataPtrMsg << ", length = " << length << ")";
- return ANEURALNETWORKS_BAD_DATA;
- }
- if (data == nullptr) {
- state = ModelArgumentInfo::HAS_NO_VALUE;
- } else {
- NN_RETURN_IF_ERROR(updateDimensionInfo(operand, type));
- if (operand.type != OperandType::OEM) {
- uint32_t neededLength = TypeManager::get()->getSizeOfData(operand.type, dimensions);
- if (neededLength != length && neededLength != 0) {
- LOG(ERROR) << "Setting argument with invalid length: " << length
- << ", expected length: " << neededLength;
- return ANEURALNETWORKS_BAD_DATA;
- }
- }
- state = ModelArgumentInfo::POINTER;
- }
- buffer = data;
- locationAndLength = {.poolIndex = 0, .offset = 0, .length = length};
- return ANEURALNETWORKS_NO_ERROR;
-}
-
-int ModelArgumentInfo::setFromMemory(const Operand& operand, const ANeuralNetworksOperandType* type,
- uint32_t poolIndex, uint32_t offset, uint32_t length) {
- NN_RETURN_IF_ERROR(updateDimensionInfo(operand, type));
- if (operand.type != OperandType::OEM) {
- uint32_t neededLength = TypeManager::get()->getSizeOfData(operand.type, dimensions);
- if (neededLength != length && neededLength != 0) {
- LOG(ERROR) << "Setting argument with invalid length: " << length
- << ", expected length: " << neededLength;
- return ANEURALNETWORKS_BAD_DATA;
- }
- }
-
- state = ModelArgumentInfo::MEMORY;
- locationAndLength = {.poolIndex = poolIndex, .offset = offset, .length = length};
- buffer = nullptr;
- return ANEURALNETWORKS_NO_ERROR;
-}
-
-int ModelArgumentInfo::setFromTemporaryMemory(const Operand& operand, uint32_t poolIndex,
- uint32_t offset, uint32_t length) {
- NN_RETURN_IF_ERROR(updateDimensionInfo(operand, nullptr));
- if (operand.type != OperandType::OEM) {
- uint32_t neededLength = TypeManager::get()->getSizeOfData(operand.type, dimensions);
- if (neededLength != length) {
- LOG(ERROR) << "Setting argument with invalid length: " << length
- << ", expected length: " << neededLength;
- return ANEURALNETWORKS_BAD_DATA;
- }
- }
-
- state = ModelArgumentInfo::MEMORY;
- locationAndLength = {
- .poolIndex = poolIndex,
- .offset = offset,
- .length = length,
- };
- buffer = nullptr;
- return ANEURALNETWORKS_NO_ERROR;
-}
-
-int ModelArgumentInfo::updateDimensionInfo(const Operand& operand,
- const ANeuralNetworksOperandType* newType) {
- if (newType == nullptr) {
- dimensions = operand.dimensions;
- } else {
- const uint32_t count = newType->dimensionCount;
- dimensions = hidl_vec<uint32_t>(count);
- std::copy(&newType->dimensions[0], &newType->dimensions[count], dimensions.begin());
- }
- return ANEURALNETWORKS_NO_ERROR;
-}
-
ExecutionBuilder::ExecutionBuilder(const CompilationBuilder* compilation)
: mCompilation(compilation),
mModel(compilation->mModel),
@@ -389,7 +314,7 @@ static void cpuFallbackFull(ExecutionBuilder* executionBuilder,
DeviceManager::getCpuDevice(), /*preparedModel=*/nullptr);
executor.mapInputsAndOutputsTrivially();
sp<ExecutionCallback> fallbackCallback;
- int n = executor.startCompute(&fallbackCallback);
+ int n = executor.startComputeOnCpuFallback(&fallbackCallback);
if (n != ANEURALNETWORKS_NO_ERROR) {
executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming);
return;
@@ -422,7 +347,7 @@ static bool cpuFallbackPartial(ExecutionBuilder* executionBuilder, const Executi
return false;
}
sp<ExecutionCallback> fallbackCallback;
- if (executor->startComputeOnCpu(&fallbackCallback) != ANEURALNETWORKS_NO_ERROR) {
+ if (executor->startComputeOnCpuFallback(&fallbackCallback) != ANEURALNETWORKS_NO_ERROR) {
cpuFallbackFull(executionBuilder, executionCallback);
return false;
}
@@ -564,7 +489,7 @@ int ExecutionBuilder::compute(sp<ExecutionCallback>* synchronizationCallback,
// TODO: For asynchronous execution, entire plan-based-path should run in an
// asynchronous thread -- take the asynchronous thread logic out of
- // startComputeOnCpu() and use it to wrap the plan-based-path.
+ // CpuPreparedModel::execute() and use it to wrap the plan-based-path.
mStarted = true;
const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
std::shared_ptr<ExecutionPlan::Controller> controller =
@@ -672,52 +597,9 @@ bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from,
return true;
}
-// Figures out how to place each of the input or outputs in a buffer. This just does the layout,
-// it does not copy data. Aligns each input a bit.
-int StepExecutor::allocatePointerArgumentsToPool(std::vector<ModelArgumentInfo>* args,
- Memory* memory) {
- uint32_t nextPoolIndex = mMemories.size();
- int64_t total = 0;
- for (auto& info : *args) {
- if (info.state == ModelArgumentInfo::POINTER) {
- DataLocation& loc = info.locationAndLength;
- // TODO Good enough alignment?
- total += alignBytesNeeded(static_cast<uint32_t>(total), loc.length);
- loc.poolIndex = nextPoolIndex;
- loc.offset = static_cast<uint32_t>(total);
- total += loc.length;
- }
- };
- if (total > 0xFFFFFFFF) {
- LOG(ERROR) << "StepExecutor::allocatePointerArgumentsToPool: ANeuralNetworksExecution: "
- "Size of all inputs or outputs exceeds 2^32.";
- return ANEURALNETWORKS_BAD_DATA;
- }
- hidl_memory hidlMemory;
- if (total > 0) {
- memory->create(total); // TODO check error
- mMemories.add(memory);
- }
- return ANEURALNETWORKS_NO_ERROR;
-}
-
-static void setRequestArgumentArray(const std::vector<ModelArgumentInfo>& argumentInfos,
- hidl_vec<RequestArgument>* ioInfos) {
- size_t count = argumentInfos.size();
- ioInfos->resize(count);
- for (size_t i = 0; i < count; i++) {
- const auto& info = argumentInfos[i];
- (*ioInfos)[i] = {
- .hasNoValue = info.state == ModelArgumentInfo::HAS_NO_VALUE,
- .location = info.locationAndLength,
- .dimensions = info.dimensions,
- };
- }
-}
-
StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
std::shared_ptr<Device> device,
- std::shared_ptr<VersionedIPreparedModel> preparedModel)
+ std::shared_ptr<PreparedModel> preparedModel)
: mExecutionBuilder(executionBuilder),
mModel(model),
mDevice(device),
@@ -794,7 +676,7 @@ static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>&
}
bool StepExecutor::isCpu() const {
- return mDevice->getInterface() == nullptr;
+ return mDevice == DeviceManager::getCpuDevice();
}
int StepExecutor::startCompute(sp<ExecutionCallback>* synchronizationCallback,
@@ -803,243 +685,34 @@ int StepExecutor::startCompute(sp<ExecutionCallback>* synchronizationCallback,
logArguments("input", mInputs);
logArguments("output", mOutputs);
}
- if (isCpu()) {
- return startComputeOnCpu(synchronizationCallback);
- } else {
- return startComputeOnDevice(synchronizationCallback, burstController);
- }
-}
-
-int StepExecutor::startComputeOnDevice(
- sp<ExecutionCallback>* synchronizationCallback,
- const std::shared_ptr<ExecutionBurstController>& burstController) {
- CHECK(!isCpu());
// Initialize timing information in case we take an error path to exit.
mExecutionBuilder->reportTiming(kNoTiming);
- *synchronizationCallback = nullptr;
-
- NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "StepExecutor::startComputeOnDevice");
- // We separate the input & output pools so that we reduce the copying done if we
- // do an eventual remoting (hidl_memory->update()). We could also use it to set
- // protection on read only memory but that's not currently done.
- Memory inputPointerArguments;
- Memory outputPointerArguments;
-
- // Layout the input and output data
- int n = allocatePointerArgumentsToPool(&mInputs, &inputPointerArguments);
- if (n != ANEURALNETWORKS_NO_ERROR) {
- return n;
- }
- n = allocatePointerArgumentsToPool(&mOutputs, &outputPointerArguments);
- if (n != ANEURALNETWORKS_NO_ERROR) {
- return n;
- }
-
- // Copy the input data that was specified via a pointer.
- // inputPointerArguments.update();
- for (auto& info : mInputs) {
- if (info.state == ModelArgumentInfo::POINTER) {
- DataLocation& loc = info.locationAndLength;
- uint8_t* data = nullptr;
- int n = inputPointerArguments.getPointer(&data);
- if (n != ANEURALNETWORKS_NO_ERROR) {
- return n;
- }
- memcpy(data + loc.offset, info.buffer, loc.length);
- }
- }
- // TODO: Add inputPointerArguments.commit() and .update() at all the right places
-
- Request request;
- setRequestArgumentArray(mInputs, &request.inputs);
- setRequestArgumentArray(mOutputs, &request.outputs);
- uint32_t count = mMemories.size();
- request.pools.resize(count);
- for (uint32_t i = 0; i < count; i++) {
- request.pools[i] = mMemories[i]->getHidlMemory();
- }
-
- NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
- "StepExecutor::startComputeOnDevice::execute");
-
- // Prepare the callback for asynchronous execution. sp<ExecutionCallback>
- // object is returned when the execution has been successfully launched,
- // otherwise a nullptr is returned. The executionCallback is abstracted in
- // the NN API as an "event".
- //
- // The sp is used for ref-counting purposes. Without it, the HIDL service
- // could attempt to communicate with a dead callback object.
- //
- // TODO: Explain the "dead callback" problem further, either here or
- // in the design document.
- sp<ExecutionCallback> executionCallback = new ExecutionCallback();
-
- // compute using burst if present
- const bool burstCompute = (burstController != nullptr);
- bool burstFallback = false;
- if (burstCompute) {
- std::vector<intptr_t> memoryIds;
- memoryIds.reserve(mMemories.size());
- for (const Memory* memory : mMemories) {
- memory->usedBy(burstController);
- memoryIds.push_back(memory->getKey());
- }
-
- VLOG(EXECUTION) << "Before ExecutionBurstController->tryCompute() "
- << SHOW_IF_DEBUG(toString(request));
- auto [status, outputShapes, timing, fallback] =
- burstController->tryCompute(request, measureTiming(mExecutionBuilder), memoryIds);
-
- burstFallback = fallback;
- if (!fallback) {
- executionCallback->notify(status, outputShapes, timing);
- }
- }
-
- // compute from IPreparedModel if either:
- // (1) burst was not supplied, or
- // (2) the burst execution failed and requested a fallback execution
- if (!burstCompute || burstFallback) {
- if (DeviceManager::get()->syncExecHal()) {
- VLOG(EXECUTION) << "Before mPreparedModel->executeSynchronously() "
- << SHOW_IF_DEBUG(toString(request));
- auto syncExecuteResult =
- mPreparedModel->executeSynchronously(request, measureTiming(mExecutionBuilder));
- executionCallback->notify(std::get<0>(syncExecuteResult),
- std::get<1>(syncExecuteResult),
- std::get<2>(syncExecuteResult));
- } else {
- VLOG(EXECUTION) << "Before mPreparedModel->execute() "
- << SHOW_IF_DEBUG(toString(request));
- // Execute.
- // TODO: What happens to the Callback if the service dies abnormally
- // -- won't that keep the Callback live forever, because the service
- // never has the opportunity to bump the reference count down? Or
- // maybe the HIDL infrastructure handles this magically? At worst,
- // it seems like this is a small memory leak, if the Callback stays
- // alive forever.
- Return<ErrorStatus> executeStatus = mPreparedModel->execute(
- request, measureTiming(mExecutionBuilder), executionCallback);
- if (!executeStatus.isOk() || executeStatus != ErrorStatus::NONE) {
- VLOG(EXECUTION) << "**Execute launch failed**";
- return executeStatus.isOk() ? convertErrorStatusToResultCode(executeStatus)
- : ANEURALNETWORKS_OP_FAILED;
- }
- }
- }
-
- // TODO: Remove this synchronization point when the block of code below is
- // removed.
- executionCallback->wait();
- NNTRACE_FULL_SWITCH(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_EXECUTION,
- "StepExecutor::startComputeOnDevice::waited");
- Return<ErrorStatus> callbackStatus = executionCallback->getStatus();
- if (!callbackStatus.isOk() || callbackStatus != ErrorStatus::NONE) {
- VLOG(EXECUTION) << "**Execution failed**";
- if (callbackStatus == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
- *synchronizationCallback = executionCallback;
- return ANEURALNETWORKS_NO_ERROR;
- }
- return callbackStatus.isOk() ? convertErrorStatusToResultCode(callbackStatus)
- : ANEURALNETWORKS_OP_FAILED;
- }
-
- mExecutionBuilder->reportTiming(executionCallback->getTiming());
+ CHECK(mPreparedModel != nullptr);
+ NN_RETURN_IF_ERROR(mPreparedModel->execute(burstController, measureTiming(mExecutionBuilder),
+ &mInputs, &mOutputs, &mMemories,
+ synchronizationCallback));
- // Copy the output data from shared memory to the output buffers.
- // TODO: Move this block of code somewhere else. It should not be in the
- // startCompute function.
- // TODO: outputMemory->update(); outputMemory->commit()
- NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "StepExecutor::startComputeOnDevice");
- for (auto& info : mOutputs) {
- if (info.state == ModelArgumentInfo::POINTER) {
- DataLocation& loc = info.locationAndLength;
- uint8_t* data = nullptr;
- int n = outputPointerArguments.getPointer(&data);
- if (n != ANEURALNETWORKS_NO_ERROR) {
- return n;
- }
- memcpy(info.buffer, data + loc.offset, loc.length);
- }
+ if (*synchronizationCallback != nullptr) {
+ mExecutionBuilder->reportTiming((*synchronizationCallback)->getTiming());
}
- VLOG(EXECUTION) << "StepExecutor::startComputeOnDevice completed";
-
- *synchronizationCallback = executionCallback;
return ANEURALNETWORKS_NO_ERROR;
}
-static void computeOnCpu(const Model& model, const Request& request,
- const std::vector<RunTimePoolInfo>& modelPoolInfos,
- const std::vector<RunTimePoolInfo>& requestPoolInfos,
- const sp<IExecutionCallback>& executionCallback) {
- NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
- CpuExecutor executor;
- int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
- const auto& outputShapes = executor.getOutputShapes();
- executionCallback->notify_1_2(convertResultCodeToErrorStatus(err), outputShapes, kNoTiming);
-}
-
-int StepExecutor::startComputeOnCpu(sp<ExecutionCallback>* synchronizationCallback) {
- // TODO: use a thread pool
- // TODO(mikie): this could have NNTRACE so we could measure the overhead of
- // spinning up a new thread.
-
+// For cpuFallback{Partial,Full}, recompile the model on CPU and then start compute.
+int StepExecutor::startComputeOnCpuFallback(sp<ExecutionCallback>* synchronizationCallback) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "StepExecutor::startComputeOnCpuFallback");
+ VLOG(EXECUTION) << "Re-compile the model on CPU";
const Model model = mModel->makeHidlModel();
-
- // Prepare the callback for asynchronous execution. sp<ExecutionCallback>
- // object is returned when the execution has been successfully launched,
- // otherwise a nullptr is returned. The executionCallback is abstracted in
- // the NN API as an "event".
- sp<ExecutionCallback> executionCallback = new ExecutionCallback();
- *synchronizationCallback = nullptr;
-
- std::vector<RunTimePoolInfo> modelPoolInfos;
- if (!setRunTimePoolInfosFromHidlMemories(&modelPoolInfos, model.pools)) {
- return ANEURALNETWORKS_UNMAPPABLE;
- }
-
- std::vector<RunTimePoolInfo> requestPoolInfos;
- requestPoolInfos.reserve(mMemories.size());
- for (const Memory* mem : mMemories) {
- if (std::optional<RunTimePoolInfo> poolInfo =
- RunTimePoolInfo::createFromHidlMemory(mem->getHidlMemory())) {
- requestPoolInfos.emplace_back(*poolInfo);
- } else {
- return ANEURALNETWORKS_UNMAPPABLE;
- }
- }
- // Create as many pools as there are input / output.
- auto fixPointerArguments = [&requestPoolInfos](std::vector<ModelArgumentInfo>& argumentInfos) {
- for (ModelArgumentInfo& argumentInfo : argumentInfos) {
- if (argumentInfo.state == ModelArgumentInfo::POINTER) {
- argumentInfo.locationAndLength.poolIndex =
- static_cast<uint32_t>(requestPoolInfos.size());
- argumentInfo.locationAndLength.offset = 0;
- requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
- static_cast<uint8_t*>(argumentInfo.buffer)));
- }
- }
- };
- fixPointerArguments(mInputs);
- fixPointerArguments(mOutputs);
-
- Request request;
- setRequestArgumentArray(mInputs, &request.inputs);
- setRequestArgumentArray(mOutputs, &request.outputs);
-
- if (DeviceManager::get()->syncExecCpu()) {
- computeOnCpu(model, request, modelPoolInfos, requestPoolInfos, executionCallback);
- } else {
- // TODO: should model be moved with a std::cref?
- std::thread thread(computeOnCpu, model, std::move(request), std::move(modelPoolInfos),
- std::move(requestPoolInfos), executionCallback);
- executionCallback->bindThread(std::move(thread));
- }
-
- *synchronizationCallback = executionCallback;
- return ANEURALNETWORKS_NO_ERROR;
+ mDevice = DeviceManager::getCpuDevice();
+ mPreparedModel = nullptr;
+ // TODO: Propagate user preference to this point instead of using default value of
+ // ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER.
+ ExecutionPreference preference =
+ static_cast<ExecutionPreference>(ANEURALNETWORKS_PREFER_FAST_SINGLE_ANSWER);
+ NN_RETURN_IF_ERROR(mDevice->prepareModel(model, preference, {}, {}, {}, &mPreparedModel));
+ return startCompute(synchronizationCallback, /*burstController=*/nullptr);
}
} // namespace nn
diff --git a/nn/runtime/ExecutionBuilder.h b/nn/runtime/ExecutionBuilder.h
index 6b730713f..1c8b1d68c 100644
--- a/nn/runtime/ExecutionBuilder.h
+++ b/nn/runtime/ExecutionBuilder.h
@@ -17,55 +17,30 @@
#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H
#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_BUILDER_H
+#include <atomic>
+#include <memory>
+#include <vector>
+
#include "Callbacks.h"
#include "HalInterfaces.h"
#include "Memory.h"
+#include "ModelArgumentInfo.h"
#include "ModelBuilder.h"
#include "NeuralNetworks.h"
-#include "VersionedInterfaces.h"
-
-#include <atomic>
-#include <unordered_map>
-#include <vector>
namespace android {
namespace nn {
class BurstBuilder;
class CompilationBuilder;
-class ExecutionPlan;
+class Device;
class ExecutionBurstController;
+class ExecutionPlan;
class ExecutionStep;
class Memory;
class ModelBuilder;
+class PreparedModel;
class StepExecutor;
-class Device;
-
-// TODO move length out of DataLocation
-struct ModelArgumentInfo {
- // Whether the argument was specified as being in a Memory, as a pointer,
- // has no value, or has not been specified.
- // If POINTER then:
- // locationAndLength.length is valid.
- // dimensions is valid.
- // buffer is valid
- // If MEMORY then:
- // locationAndLength.{poolIndex, offset, length} is valid.
- // dimensions is valid.
- enum { POINTER, MEMORY, HAS_NO_VALUE, UNSPECIFIED } state = UNSPECIFIED;
- hal::DataLocation locationAndLength;
- std::vector<uint32_t> dimensions;
- void* buffer;
- bool isSufficient = true;
-
- int setFromPointer(const hal::Operand& operand, const ANeuralNetworksOperandType* type,
- void* buffer, uint32_t length);
- int setFromMemory(const hal::Operand& operand, const ANeuralNetworksOperandType* type,
- uint32_t poolIndex, uint32_t offset, uint32_t length);
- int setFromTemporaryMemory(const hal::Operand& operand, uint32_t poolIndex, uint32_t offset,
- uint32_t length);
- int updateDimensionInfo(const hal::Operand& operand, const ANeuralNetworksOperandType* newType);
-};
class ExecutionBuilder {
friend class StepExecutor;
@@ -175,8 +150,7 @@ class StepExecutor {
// model to execute on that device. (Both are nullptr in the
// case of CPU.)
StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
- std::shared_ptr<Device> device,
- std::shared_ptr<VersionedIPreparedModel> preparedModel);
+ std::shared_ptr<Device> device, std::shared_ptr<PreparedModel> preparedModel);
// Map inputs and outputs from ExecutionBuilder to StepExecutor,
// in the case where we have a single-"step" execution (i.e., the executor
@@ -218,9 +192,9 @@ class StepExecutor {
int startCompute(sp<ExecutionCallback>* synchronizationCallback,
const std::shared_ptr<ExecutionBurstController>& burstController = nullptr);
- // Executes using the CPU, regardless of the (driver,
+ // Re-compiles and executes using the CPU, regardless of the (driver,
// preparedModel) specified at construction time.
- int startComputeOnCpu(sp<ExecutionCallback>* synchronizationCallback);
+ int startComputeOnCpuFallback(sp<ExecutionCallback>* synchronizationCallback);
bool isCpu() const;
@@ -230,10 +204,6 @@ class StepExecutor {
}
private:
- int allocatePointerArgumentsToPool(std::vector<ModelArgumentInfo>* args, Memory* memory);
- int startComputeOnDevice(sp<ExecutionCallback>* synchronizationCallback,
- const std::shared_ptr<ExecutionBurstController>& burstController);
-
void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
ModelArgumentInfo* executorInputOrOutput);
@@ -251,8 +221,7 @@ class StepExecutor {
// compiled forms; and device on which to execute it
const ModelBuilder* mModel;
std::shared_ptr<Device> mDevice;
- std::shared_ptr<VersionedIPreparedModel>
- mPreparedModel; // nullptr if CPU execution or if bypassing ExecutionPlan
+ std::shared_ptr<PreparedModel> mPreparedModel;
// The information we'll send to the driver about the inputs and outputs.
// Note that we build this in two steps:
diff --git a/nn/runtime/ExecutionPlan.cpp b/nn/runtime/ExecutionPlan.cpp
index f8225e834..4e5409da5 100644
--- a/nn/runtime/ExecutionPlan.cpp
+++ b/nn/runtime/ExecutionPlan.cpp
@@ -18,11 +18,30 @@
#include "ExecutionPlan.h"
+#include <cutils/native_handle.h>
+#include <fcntl.h>
+#include <openssl/sha.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <functional>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <queue>
+#include <set>
+#include <string>
+#include <type_traits>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
#include "BurstBuilder.h"
#include "Callbacks.h"
#include "CompilationBuilder.h"
#include "ExecutionBuilder.h"
#include "ExecutionBurstController.h"
+#include "GraphDump.h"
#include "Manager.h"
#include "MetaModel.h"
#include "ModelBuilder.h"
@@ -32,19 +51,6 @@
#include "TypeManager.h"
#include "Utils.h"
-#include <cutils/native_handle.h>
-#include <fcntl.h>
-#include <openssl/sha.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <functional>
-#include <map>
-#include <mutex>
-#include <queue>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
namespace android {
namespace nn {
@@ -118,8 +124,7 @@ bool getCacheHandles(const std::string& cacheDir, const uint8_t* token,
// Tries to compile directly from cache, returns false on fail.
bool compileFromCache(const std::shared_ptr<Device>& device, const std::string& cacheDir,
- const uint8_t* token,
- std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
+ const uint8_t* token, std::shared_ptr<PreparedModel>* preparedModel) {
CHECK(token != nullptr && device != nullptr);
VLOG(COMPILATION) << "compileFromCache";
*preparedModel = nullptr;
@@ -133,8 +138,7 @@ bool compileFromCache(const std::shared_ptr<Device>& device, const std::string&
int compileModelAndCache(const std::shared_ptr<Device>& device, const ModelBuilder* model,
int32_t executionPreference, const std::string& cacheDir,
- const uint8_t* token,
- std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
+ const uint8_t* token, std::shared_ptr<PreparedModel>* preparedModel) {
CHECK(device != nullptr);
*preparedModel = nullptr;
uint8_t dummyToken[ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN] = {0};
@@ -157,7 +161,7 @@ int compileModelAndCache(const std::shared_ptr<Device>& device, const ModelBuild
// device name, device version string, and the execution preference in this function.
int compile(std::shared_ptr<Device> device, const ModelBuilder* model, int32_t executionPreference,
const std::string& cacheDir, TokenHasher* token,
- std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
+ std::shared_ptr<PreparedModel>* preparedModel) {
CHECK(device != nullptr);
const uint8_t* tokenData = nullptr;
if (device->isCachingSupported() && token->ok() && token->updateFromString(device->getName()) &&
diff --git a/nn/runtime/ExecutionPlan.h b/nn/runtime/ExecutionPlan.h
index 4ce4d71b7..be9b9aae0 100644
--- a/nn/runtime/ExecutionPlan.h
+++ b/nn/runtime/ExecutionPlan.h
@@ -19,21 +19,24 @@
#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_PLAN_H
#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_EXECUTION_PLAN_H
-#include "HalInterfaces.h"
-#include "Memory.h"
-#include "ModelBuilder.h"
-#include "NeuralNetworks.h"
-#include "TokenHasher.h"
-#include "Utils.h"
-#include "VersionedInterfaces.h"
-
#include <openssl/sha.h>
+#include <map>
+#include <memory>
#include <ostream>
#include <set>
#include <string>
+#include <unordered_map>
+#include <utility>
#include <vector>
+#include "HalInterfaces.h"
+#include "Memory.h"
+#include "ModelBuilder.h"
+#include "NeuralNetworks.h"
+#include "TokenHasher.h"
+#include "Utils.h"
+
namespace android {
namespace nn {
@@ -44,6 +47,7 @@ class ExecutionBuilder;
class ExecutionPlan;
class ExecutionBurstController;
class Memory;
+class PreparedModel;
class StepExecutor;
class ExecutionStep {
@@ -97,9 +101,7 @@ public:
std::shared_ptr<Device> getDevice() const { return mDevice; }
// only available after calling finishSubModel()
- std::shared_ptr<VersionedIPreparedModel> getPreparedSubModel() const {
- return mPreparedSubModel;
- }
+ std::shared_ptr<PreparedModel> getPreparedSubModel() const { return mPreparedSubModel; }
// Map inputs and outputs from ExecutionBuilder to StepExecutor.
void mapInputsAndOutputs(std::shared_ptr<StepExecutor> stepExecutor) const;
@@ -120,7 +122,7 @@ public:
uint32_t mIndex; // index of step within plan
ModelBuilder mSubModel;
std::shared_ptr<Device> mDevice;
- std::shared_ptr<VersionedIPreparedModel> mPreparedSubModel; // not used for CPU
+ std::shared_ptr<PreparedModel> mPreparedSubModel;
// Inputs of original model that are also inputs of this submodel:
// (fromModel index, subModel index)
@@ -284,7 +286,7 @@ public:
std::shared_ptr<Device> mDevice;
const ModelBuilder* mModel;
- std::shared_ptr<VersionedIPreparedModel> mPreparedModel; // not used for CPU
+ std::shared_ptr<PreparedModel> mPreparedModel;
const std::string* mCacheDir;
TokenHasher mToken;
diff --git a/nn/runtime/Manager.cpp b/nn/runtime/Manager.cpp
index b06af8594..2511cd343 100644
--- a/nn/runtime/Manager.cpp
+++ b/nn/runtime/Manager.cpp
@@ -17,11 +17,6 @@
#define LOG_TAG "Manager"
#include "Manager.h"
-#include "Callbacks.h"
-#include "HalInterfaces.h"
-#include "MetaModel.h"
-#include "Tracing.h"
-#include "Utils.h"
#include <android/hidl/manager/1.2/IServiceManager.h>
#include <build/version.h>
@@ -30,6 +25,21 @@
#include <algorithm>
#include <functional>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "Callbacks.h"
+#include "CpuExecutor.h"
+#include "ExecutionBurstController.h"
+#include "HalInterfaces.h"
+#include "Memory.h"
+#include "MetaModel.h"
+#include "ModelArgumentInfo.h"
+#include "Tracing.h"
+#include "Utils.h"
+#include "VersionedInterfaces.h"
namespace android {
namespace nn {
@@ -38,6 +48,8 @@ using namespace hal;
using HidlToken = hidl_array<uint8_t, ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN>;
+const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
+
bool Device::isCachingSupported() const {
auto pair = getNumberOfCacheFilesNeeded();
// Caching is supported if either of numModelCache or numDataCache is greater than 0.
@@ -46,8 +58,6 @@ bool Device::isCachingSupported() const {
// A Device with actual underlying driver
class DriverDevice : public Device {
- DISALLOW_IMPLICIT_CONSTRUCTORS(DriverDevice);
-
public:
DriverDevice(std::string name, const sp<V1_0::IDevice>& device);
@@ -56,12 +66,11 @@ class DriverDevice : public Device {
const char* getName() const override { return mName.c_str(); }
const char* getVersionString() const override { return mVersionString.c_str(); }
- VersionedIDevice* getInterface() override { return mInterface.get(); }
- int64_t getFeatureLevel() override { return mInterface->getFeatureLevel(); }
+ int64_t getFeatureLevel() const override { return mInterface->getFeatureLevel(); }
int32_t getType() const override { return mInterface->getType(); }
hidl_vec<Extension> getSupportedExtensions() const override;
void getSupportedOperations(const MetaModel& metaModel,
- hidl_vec<bool>* supportedOperations) override;
+ hidl_vec<bool>* supportedOperations) const override;
PerformanceInfo getPerformance(OperandType type) const override;
PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
return mCapabilities.relaxedFloat32toFloat16PerformanceScalar;
@@ -76,10 +85,10 @@ class DriverDevice : public Device {
int prepareModel(const Model& hidlModel, ExecutionPreference executionPreference,
const hidl_vec<hidl_handle>& modelCache,
const hidl_vec<hidl_handle>& dataCache, const HidlToken& token,
- std::shared_ptr<VersionedIPreparedModel>* preparedModel) override;
+ std::shared_ptr<PreparedModel>* preparedModel) const override;
int prepareModelFromCache(const hidl_vec<hidl_handle>& modelCache,
const hidl_vec<hidl_handle>& dataCache, const HidlToken& token,
- std::shared_ptr<VersionedIPreparedModel>* preparedModel) override;
+ std::shared_ptr<PreparedModel>* preparedModel) const override;
private:
std::string mName;
@@ -97,6 +106,26 @@ class DriverDevice : public Device {
#endif // NN_DEBUGGABLE
};
+// A PreparedModel with an underlying IPreparedModel instance returned by the actual driver.
+class DriverPreparedModel : public PreparedModel {
+ public:
+ DriverPreparedModel(const std::shared_ptr<VersionedIPreparedModel>& preparedModel)
+ : mPreparedModel(preparedModel) {}
+
+ int execute(const std::shared_ptr<ExecutionBurstController>& burstController,
+ MeasureTiming measure, std::vector<ModelArgumentInfo>* inputs,
+ std::vector<ModelArgumentInfo>* outputs, MemoryTracker* memories,
+ sp<ExecutionCallback>* synchronizationCallback) const override;
+
+ std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
+ bool blocking) const override {
+ return mPreparedModel->configureExecutionBurst(blocking);
+ }
+
+ private:
+ const std::shared_ptr<VersionedIPreparedModel> mPreparedModel;
+};
+
DriverDevice::DriverDevice(std::string name, const sp<V1_0::IDevice>& device)
: mName(std::move(name)), mInterface(VersionedIDevice::create(mName, device)) {}
@@ -160,7 +189,7 @@ hidl_vec<Extension> DriverDevice::getSupportedExtensions() const {
}
void DriverDevice::getSupportedOperations(const MetaModel& metaModel,
- hidl_vec<bool>* outSupportedOperations) {
+ hidl_vec<bool>* outSupportedOperations) const {
// Query the driver for what it can do.
ErrorStatus status = ErrorStatus::GENERAL_FAILURE;
hidl_vec<bool> supportedOperations;
@@ -231,7 +260,7 @@ PerformanceInfo DriverDevice::getPerformance(OperandType type) const {
static int prepareModelCheck(ErrorStatus status,
const std::shared_ptr<VersionedIPreparedModel>& preparedModel,
const char* prepareName, const char* serviceName,
- std::shared_ptr<VersionedIPreparedModel>* preparedModelOut) {
+ std::shared_ptr<PreparedModel>* preparedModelOut) {
CHECK(preparedModelOut != nullptr) << "prepareModelCheck -- preparedModelOut must be non-null";
*preparedModelOut = nullptr;
@@ -245,14 +274,14 @@ static int prepareModelCheck(ErrorStatus status,
return ANEURALNETWORKS_OP_FAILED;
}
- *preparedModelOut = preparedModel;
+ *preparedModelOut = std::make_shared<DriverPreparedModel>(preparedModel);
return ANEURALNETWORKS_NO_ERROR;
}
int DriverDevice::prepareModel(const Model& hidlModel, ExecutionPreference executionPreference,
const hidl_vec<hidl_handle>& modelCache,
const hidl_vec<hidl_handle>& dataCache, const HidlToken& token,
- std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
+ std::shared_ptr<PreparedModel>* preparedModel) const {
// Note that some work within VersionedIDevice will be subtracted from the IPC layer
NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "prepareModel");
@@ -265,7 +294,7 @@ int DriverDevice::prepareModel(const Model& hidlModel, ExecutionPreference execu
int DriverDevice::prepareModelFromCache(const hidl_vec<hidl_handle>& modelCache,
const hidl_vec<hidl_handle>& dataCache,
const HidlToken& token,
- std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
+ std::shared_ptr<PreparedModel>* preparedModel) const {
// Note that some work within VersionedIDevice will be subtracted from the IPC layer
NNTRACE_FULL(NNTRACE_LAYER_IPC, NNTRACE_PHASE_COMPILATION, "prepareModelFromCache");
@@ -276,11 +305,207 @@ int DriverDevice::prepareModelFromCache(const hidl_vec<hidl_handle>& modelCache,
preparedModel);
}
+static void setRequestArgumentArray(const std::vector<ModelArgumentInfo>& argumentInfos,
+ hidl_vec<RequestArgument>* ioInfos) {
+ size_t count = argumentInfos.size();
+ ioInfos->resize(count);
+ for (size_t i = 0; i < count; i++) {
+ const auto& info = argumentInfos[i];
+ (*ioInfos)[i] = {
+ .hasNoValue = info.state == ModelArgumentInfo::HAS_NO_VALUE,
+ .location = info.locationAndLength,
+ .dimensions = info.dimensions,
+ };
+ }
+}
+
+// Figures out how to place each of the inputs or outputs in a buffer. This just does the layout;
+// it does not copy data. Aligns each input a bit.
+static int allocatePointerArgumentsToPool(MemoryTracker* memories,
+ std::vector<ModelArgumentInfo>* args, Memory* memory) {
+ CHECK(memories != nullptr);
+ CHECK(args != nullptr);
+ CHECK(memory != nullptr);
+ uint32_t nextPoolIndex = memories->size();
+ int64_t total = 0;
+ for (auto& info : *args) {
+ if (info.state == ModelArgumentInfo::POINTER) {
+ DataLocation& loc = info.locationAndLength;
+ // TODO Good enough alignment?
+ total += alignBytesNeeded(static_cast<uint32_t>(total), loc.length);
+ loc.poolIndex = nextPoolIndex;
+ loc.offset = static_cast<uint32_t>(total);
+ total += loc.length;
+ }
+ };
+ if (total > 0xFFFFFFFF) {
+ LOG(ERROR) << "allocatePointerArgumentsToPool: ANeuralNetworksExecution: "
+ "Size of all inputs or outputs exceeds 2^32.";
+ return ANEURALNETWORKS_BAD_DATA;
+ }
+ if (total > 0) {
+ memory->create(total); // TODO check error
+ memories->add(memory);
+ }
+ return ANEURALNETWORKS_NO_ERROR;
+}
+
+// Start compute on an actual HIDL driver.
+//
+// Two separate memory pools will be allocated for inputs and outputs specified by pointers. The
+// loc field in each ModelArgumentInfo structure will be updated accordingly. The input pointer
+// data will be copied to the input pool prior to execution, and the output pointer data will be
+// copied out from the output pool after the execution.
+//
+// The HIDL invocation will choose between sync/async execution according to
+// DeviceManager::mSyncExecHal.
+int DriverPreparedModel::execute(const std::shared_ptr<ExecutionBurstController>& burstController,
+ MeasureTiming measure, std::vector<ModelArgumentInfo>* inputs,
+ std::vector<ModelArgumentInfo>* outputs, MemoryTracker* memories,
+ sp<ExecutionCallback>* synchronizationCallback) const {
+ CHECK(inputs != nullptr);
+ CHECK(outputs != nullptr);
+ CHECK(memories != nullptr);
+ CHECK(synchronizationCallback != nullptr);
+ *synchronizationCallback = nullptr;
+
+ NNTRACE_RT(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "DriverPreparedModel::execute");
+ // We separate the input & output pools so that we reduce the copying done if we
+ // do an eventual remoting (hidl_memory->update()). We could also use it to set
+ // protection on read only memory but that's not currently done.
+ Memory inputPointerArguments;
+ Memory outputPointerArguments;
+
+ // Layout the input and output data
+ NN_RETURN_IF_ERROR(allocatePointerArgumentsToPool(memories, inputs, &inputPointerArguments));
+ NN_RETURN_IF_ERROR(allocatePointerArgumentsToPool(memories, outputs, &outputPointerArguments));
+
+ // Copy the input data that was specified via a pointer.
+ // inputPointerArguments.update();
+ for (auto& info : *inputs) {
+ if (info.state == ModelArgumentInfo::POINTER) {
+ DataLocation& loc = info.locationAndLength;
+ uint8_t* data = nullptr;
+ NN_RETURN_IF_ERROR(inputPointerArguments.getPointer(&data));
+ memcpy(data + loc.offset, info.buffer, loc.length);
+ }
+ }
+ // TODO: Add inputPointerArguments.commit() and .update() at all the right places
+
+ Request request;
+ setRequestArgumentArray(*inputs, &request.inputs);
+ setRequestArgumentArray(*outputs, &request.outputs);
+ uint32_t count = memories->size();
+ request.pools.resize(count);
+ for (uint32_t i = 0; i < count; i++) {
+ request.pools[i] = (*memories)[i]->getHidlMemory();
+ }
+
+ NNTRACE_FULL_SWITCH(NNTRACE_LAYER_IPC, NNTRACE_PHASE_EXECUTION,
+ "DriverPreparedModel::execute::execute");
+
+ // Prepare the callback for asynchronous execution. sp<ExecutionCallback>
+ // object is returned when the execution has been successfully launched,
+ // otherwise a nullptr is returned. The executionCallback is abstracted in
+ // the NN API as an "event".
+ //
+ // The sp is used for ref-counting purposes. Without it, the HIDL service
+ // could attempt to communicate with a dead callback object.
+ //
+ // TODO: Explain the "dead callback" problem further, either here or
+ // in the design document.
+ sp<ExecutionCallback> executionCallback = new ExecutionCallback();
+
+ // compute using burst if present
+ const bool burstCompute = (burstController != nullptr);
+ bool burstFallback = false;
+ if (burstCompute) {
+ std::vector<intptr_t> memoryIds;
+ memoryIds.reserve(memories->size());
+ for (const Memory* memory : *memories) {
+ memory->usedBy(burstController);
+ memoryIds.push_back(memory->getKey());
+ }
+
+ VLOG(EXECUTION) << "Before ExecutionBurstController->tryCompute() "
+ << SHOW_IF_DEBUG(toString(request));
+ auto [status, outputShapes, timing, fallback] =
+ burstController->tryCompute(request, measure, memoryIds);
+
+ burstFallback = fallback;
+ if (!fallback) {
+ executionCallback->notify(status, outputShapes, timing);
+ }
+ }
+
+ // compute from IPreparedModel if either:
+ // (1) burst was not supplied, or
+ // (2) the burst execution failed and requested a fallback execution
+ if (!burstCompute || burstFallback) {
+ if (DeviceManager::get()->syncExecHal()) {
+ VLOG(EXECUTION) << "Before mPreparedModel->executeSynchronously() "
+ << SHOW_IF_DEBUG(toString(request));
+ auto syncExecuteResult = mPreparedModel->executeSynchronously(request, measure);
+ executionCallback->notify(std::get<0>(syncExecuteResult),
+ std::get<1>(syncExecuteResult),
+ std::get<2>(syncExecuteResult));
+ } else {
+ VLOG(EXECUTION) << "Before mPreparedModel->execute() "
+ << SHOW_IF_DEBUG(toString(request));
+ // Execute.
+ // TODO: What happens to the Callback if the service dies abnormally
+ // -- won't that keep the Callback live forever, because the service
+ // never has the opportunity to bump the reference count down? Or
+ // maybe the HIDL infrastructure handles this magically? At worst,
+ // it seems like this is a small memory leak, if the Callback stays
+ // alive forever.
+ Return<ErrorStatus> executeStatus =
+ mPreparedModel->execute(request, measure, executionCallback);
+ if (!executeStatus.isOk() || executeStatus != ErrorStatus::NONE) {
+ VLOG(EXECUTION) << "**Execute launch failed**";
+ return executeStatus.isOk() ? convertErrorStatusToResultCode(executeStatus)
+ : ANEURALNETWORKS_OP_FAILED;
+ }
+ }
+ }
+
+ // TODO: Remove this synchronization point when the block of code below is removed.
+ executionCallback->wait();
+ NNTRACE_FULL_SWITCH(NNTRACE_LAYER_RUNTIME, NNTRACE_PHASE_EXECUTION,
+ "DriverPreparedModel::execute::waited");
+ Return<ErrorStatus> callbackStatus = executionCallback->getStatus();
+ if (!callbackStatus.isOk() || callbackStatus != ErrorStatus::NONE) {
+ VLOG(EXECUTION) << "**Execution failed**";
+ if (callbackStatus == ErrorStatus::OUTPUT_INSUFFICIENT_SIZE) {
+ *synchronizationCallback = executionCallback;
+ return ANEURALNETWORKS_NO_ERROR;
+ }
+ return callbackStatus.isOk() ? convertErrorStatusToResultCode(callbackStatus)
+ : ANEURALNETWORKS_OP_FAILED;
+ }
+
+ // Copy the output data from shared memory to the output buffers.
+    // TODO: Move this block of code somewhere else. It should not be in the
+    // execute function.
+ // TODO: outputMemory->update(); outputMemory->commit()
+ NNTRACE_RT_SWITCH(NNTRACE_PHASE_RESULTS, "DriverPreparedModel::execute");
+ for (auto& info : *outputs) {
+ if (info.state == ModelArgumentInfo::POINTER) {
+ DataLocation& loc = info.locationAndLength;
+ uint8_t* data = nullptr;
+ NN_RETURN_IF_ERROR(outputPointerArguments.getPointer(&data));
+ memcpy(info.buffer, data + loc.offset, loc.length);
+ }
+ }
+ VLOG(EXECUTION) << "DriverPreparedModel::execute completed";
+
+ *synchronizationCallback = executionCallback;
+ return ANEURALNETWORKS_NO_ERROR;
+}
+
// A special abstracted device for the CPU. Only one instance of this class will exist.
// Use get() to retrieve it.
class CpuDevice : public Device {
- DISALLOW_COPY_AND_ASSIGN(CpuDevice);
-
public:
// Returns the singleton CPU fallback device.
static std::shared_ptr<CpuDevice> get() {
@@ -290,12 +515,11 @@ class CpuDevice : public Device {
const char* getName() const override { return kName.c_str(); }
const char* getVersionString() const override { return kVersionString.c_str(); }
- VersionedIDevice* getInterface() override { return nullptr; }
- int64_t getFeatureLevel() override { return kFeatureLevel; }
+ int64_t getFeatureLevel() const override { return kFeatureLevel; }
int32_t getType() const override { return ANEURALNETWORKS_DEVICE_CPU; }
hidl_vec<Extension> getSupportedExtensions() const override { return {/* No extensions. */}; }
void getSupportedOperations(const MetaModel& metaModel,
- hidl_vec<bool>* supportedOperations) override;
+ hidl_vec<bool>* supportedOperations) const override;
PerformanceInfo getPerformance(OperandType) const override { return kPerformance; }
PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const override {
return kPerformance;
@@ -310,10 +534,9 @@ class CpuDevice : public Device {
int prepareModel(const Model& hidlModel, ExecutionPreference executionPreference,
const hidl_vec<hidl_handle>& modelCache,
const hidl_vec<hidl_handle>& dataCache, const HidlToken&,
- std::shared_ptr<VersionedIPreparedModel>* preparedModel) override;
+ std::shared_ptr<PreparedModel>* preparedModel) const override;
int prepareModelFromCache(const hidl_vec<hidl_handle>&, const hidl_vec<hidl_handle>&,
- const HidlToken&,
- std::shared_ptr<VersionedIPreparedModel>*) override {
+ const HidlToken&, std::shared_ptr<PreparedModel>*) const override {
CHECK(false) << "Should never call prepareModelFromCache on CpuDevice";
return ANEURALNETWORKS_OP_FAILED;
}
@@ -331,8 +554,32 @@ class CpuDevice : public Device {
/*numDataCache=*/0};
};
+// A special abstracted PreparedModel for the CPU, constructed by CpuDevice.
+class CpuPreparedModel : public PreparedModel {
+ public:
+ // Factory method for CpuPreparedModel. Returns ANEURALNETWORKS_NO_ERROR if
+ // successfully created.
+ static int create(Model hidlModel, std::shared_ptr<PreparedModel>* preparedModel);
+
+ int execute(const std::shared_ptr<ExecutionBurstController>& burstController,
+ MeasureTiming measure, std::vector<ModelArgumentInfo>* inputs,
+ std::vector<ModelArgumentInfo>* outputs, MemoryTracker* memories,
+ sp<ExecutionCallback>* synchronizationCallback) const override;
+
+ std::shared_ptr<ExecutionBurstController> configureExecutionBurst(bool) const override {
+ return nullptr;
+ }
+
+ private:
+ CpuPreparedModel(Model model, std::vector<RunTimePoolInfo> poolInfos)
+ : mModel(std::move(model)), mModelPoolInfos(std::move(poolInfos)) {}
+
+ const Model mModel;
+ const std::vector<RunTimePoolInfo> mModelPoolInfos;
+};
+
void CpuDevice::getSupportedOperations(const MetaModel& metaModel,
- hidl_vec<bool>* supportedOperations) {
+ hidl_vec<bool>* supportedOperations) const {
const Model& hidlModel = metaModel.getModel();
const size_t count = hidlModel.operations.size();
hidl_vec<bool> result(count);
@@ -350,13 +597,110 @@ void CpuDevice::getSupportedOperations(const MetaModel& metaModel,
int CpuDevice::prepareModel(const Model& hidlModel, ExecutionPreference executionPreference,
const hidl_vec<hidl_handle>& modelCache,
const hidl_vec<hidl_handle>& dataCache, const HidlToken&,
- std::shared_ptr<VersionedIPreparedModel>* preparedModel) {
+ std::shared_ptr<PreparedModel>* preparedModel) const {
CHECK(modelCache.size() == 0 && dataCache.size() == 0)
<< "Should never call prepareModel with cache information on CpuDevice";
+ CHECK(preparedModel != nullptr) << "CpuDevice::prepareModel -- preparedModel must be non-null";
*preparedModel = nullptr;
+
if (!validateModel(hidlModel) || !validateExecutionPreference(executionPreference)) {
return ANEURALNETWORKS_OP_FAILED;
}
+
+ return CpuPreparedModel::create(hidlModel, preparedModel);
+}
+
+int CpuPreparedModel::create(Model hidlModel, std::shared_ptr<PreparedModel>* preparedModel) {
+ CHECK(preparedModel != nullptr);
+ *preparedModel = nullptr;
+
+ std::vector<RunTimePoolInfo> poolInfos;
+ if (!setRunTimePoolInfosFromHidlMemories(&poolInfos, hidlModel.pools)) {
+ return ANEURALNETWORKS_UNMAPPABLE;
+ }
+
+ *preparedModel = std::shared_ptr<CpuPreparedModel>(
+ new CpuPreparedModel(std::move(hidlModel), std::move(poolInfos)));
+ return ANEURALNETWORKS_NO_ERROR;
+}
+
+static void computeOnCpu(const Model& model, const Request& request,
+ const std::vector<RunTimePoolInfo>& modelPoolInfos,
+ const std::vector<RunTimePoolInfo>& requestPoolInfos,
+ const sp<IExecutionCallback>& executionCallback) {
+ NNTRACE_RT(NNTRACE_PHASE_EXECUTION, "computeOnCpu");
+ CpuExecutor executor;
+ int err = executor.run(model, request, modelPoolInfos, requestPoolInfos);
+ const auto& outputShapes = executor.getOutputShapes();
+ executionCallback->notify_1_2(convertResultCodeToErrorStatus(err), outputShapes, kNoTiming);
+}
+
+// Start compute on NNAPI CPU reference implementation.
+//
+// In contrast to DriverPreparedModel::execute, the NNAPI CPU reference executor lives in the
+// same process as the NNAPI runtime and can take raw pointers. We will create as many pools as
+// there are inputs/outputs in this method to avoid data copying.
+//
+// Will choose between sync/async execution according to DeviceManager::mSyncExecCpu.
+int CpuPreparedModel::execute(const std::shared_ptr<ExecutionBurstController>& /*burstController*/,
+ MeasureTiming /*measure*/, std::vector<ModelArgumentInfo>* inputs,
+ std::vector<ModelArgumentInfo>* outputs, MemoryTracker* memories,
+ sp<ExecutionCallback>* synchronizationCallback) const {
+ CHECK(inputs != nullptr);
+ CHECK(outputs != nullptr);
+ CHECK(memories != nullptr);
+ CHECK(synchronizationCallback != nullptr);
+
+ // TODO: use a thread pool
+ // TODO(mikie): this could have NNTRACE so we could measure the overhead of
+ // spinning up a new thread.
+
+ // Prepare the callback for asynchronous execution. sp<ExecutionCallback>
+ // object is returned when the execution has been successfully launched,
+ // otherwise a nullptr is returned. The executionCallback is abstracted in
+ // the NN API as an "event".
+ sp<ExecutionCallback> executionCallback = new ExecutionCallback();
+ *synchronizationCallback = nullptr;
+
+ std::vector<RunTimePoolInfo> requestPoolInfos;
+ requestPoolInfos.reserve(memories->size());
+ for (const Memory* mem : *memories) {
+ if (std::optional<RunTimePoolInfo> poolInfo =
+ RunTimePoolInfo::createFromHidlMemory(mem->getHidlMemory())) {
+ requestPoolInfos.emplace_back(*poolInfo);
+ } else {
+ return ANEURALNETWORKS_UNMAPPABLE;
+ }
+ }
+    // Create as many pools as there are inputs / outputs.
+ auto fixPointerArguments = [&requestPoolInfos](std::vector<ModelArgumentInfo>* argumentInfos) {
+ for (ModelArgumentInfo& argumentInfo : *argumentInfos) {
+ if (argumentInfo.state == ModelArgumentInfo::POINTER) {
+ argumentInfo.locationAndLength.poolIndex =
+ static_cast<uint32_t>(requestPoolInfos.size());
+ argumentInfo.locationAndLength.offset = 0;
+ requestPoolInfos.emplace_back(RunTimePoolInfo::createFromExistingBuffer(
+ static_cast<uint8_t*>(argumentInfo.buffer)));
+ }
+ }
+ };
+ fixPointerArguments(inputs);
+ fixPointerArguments(outputs);
+
+ Request request;
+ setRequestArgumentArray(*inputs, &request.inputs);
+ setRequestArgumentArray(*outputs, &request.outputs);
+
+ if (DeviceManager::get()->syncExecCpu()) {
+ computeOnCpu(mModel, request, mModelPoolInfos, requestPoolInfos, executionCallback);
+ } else {
+ std::thread thread(computeOnCpu, std::cref(mModel), std::move(request),
+ std::cref(mModelPoolInfos), std::move(requestPoolInfos),
+ executionCallback);
+ executionCallback->bindThread(std::move(thread));
+ }
+
+ *synchronizationCallback = executionCallback;
return ANEURALNETWORKS_NO_ERROR;
}
diff --git a/nn/runtime/Manager.h b/nn/runtime/Manager.h
index 0c7ef0066..3f783f0df 100644
--- a/nn/runtime/Manager.h
+++ b/nn/runtime/Manager.h
@@ -17,39 +17,68 @@
#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_MANAGER_H
#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_MANAGER_H
-#include "HalInterfaces.h"
-#include "Utils.h"
-#include "VersionedInterfaces.h"
-
#include <android-base/macros.h>
+
#include <map>
+#include <memory>
+#include <string>
#include <unordered_set>
+#include <utility>
#include <vector>
+#include "Callbacks.h"
+#include "HalInterfaces.h"
+#include "Memory.h"
+#include "Utils.h"
+
namespace android {
namespace nn {
// Forward declaration
class MetaModel;
+class ExecutionBurstController;
+struct ModelArgumentInfo;
+
+// A unified interface for actual driver prepared model as well as the CPU.
+class PreparedModel {
+ DISALLOW_COPY_AND_ASSIGN(PreparedModel);
+
+ public:
+ PreparedModel() = default;
+ virtual ~PreparedModel() = default;
+
+ // Start computation with given input/output argument info and memory pools.
+ //
+ // When executed on an actual driver device, this method may append new memory pools to
+ // "memories" for inputs and outputs specified via pointers, and the data location for
+ // "inputs" and "outputs" may get updated.
+ virtual int execute(const std::shared_ptr<ExecutionBurstController>& burstController,
+ MeasureTiming measure, std::vector<ModelArgumentInfo>* inputs,
+ std::vector<ModelArgumentInfo>* outputs, MemoryTracker* memories,
+ sp<ExecutionCallback>* synchronizationCallback) const = 0;
+
+ virtual std::shared_ptr<ExecutionBurstController> configureExecutionBurst(
+ bool blocking) const = 0;
+};
// A unified interface for actual driver devices as well as the CPU
class Device {
- public:
- virtual ~Device() {}
+ DISALLOW_COPY_AND_ASSIGN(Device);
- // Get the handle of underlying VersionedIDevice, if any
- virtual VersionedIDevice* getInterface() = 0;
+ public:
+ Device() = default;
+ virtual ~Device() = default;
// Introspection methods returning device information
virtual const char* getName() const = 0;
virtual const char* getVersionString() const = 0;
- virtual int64_t getFeatureLevel() = 0;
+ virtual int64_t getFeatureLevel() const = 0;
virtual int32_t getType() const = 0;
virtual hal::hidl_vec<hal::Extension> getSupportedExtensions() const = 0;
// See the MetaModel class in MetaModel.h for more details.
virtual void getSupportedOperations(const MetaModel& metaModel,
- hal::hidl_vec<bool>* supportedOperations) = 0;
+ hal::hidl_vec<bool>* supportedOperations) const = 0;
virtual hal::PerformanceInfo getPerformance(hal::OperandType type) const = 0;
virtual hal::PerformanceInfo getRelaxedFloat32toFloat16PerformanceScalar() const = 0;
@@ -62,12 +91,12 @@ class Device {
const hal::hidl_vec<hal::hidl_handle>& modelCache,
const hal::hidl_vec<hal::hidl_handle>& dataCache,
const hal::hidl_array<uint8_t, ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN>& token,
- std::shared_ptr<VersionedIPreparedModel>* preparedModel) = 0;
+ std::shared_ptr<PreparedModel>* preparedModel) const = 0;
virtual int prepareModelFromCache(
const hal::hidl_vec<hal::hidl_handle>& modelCache,
const hal::hidl_vec<hal::hidl_handle>& dataCache,
const hal::hidl_array<uint8_t, ANEURALNETWORKS_BYTE_SIZE_OF_CACHE_TOKEN>& token,
- std::shared_ptr<VersionedIPreparedModel>* preparedModel) = 0;
+ std::shared_ptr<PreparedModel>* preparedModel) const = 0;
};
// Manages the NN HAL devices. Only one instance of this class will exist.
diff --git a/nn/runtime/ModelArgumentInfo.cpp b/nn/runtime/ModelArgumentInfo.cpp
new file mode 100644
index 000000000..52935d145
--- /dev/null
+++ b/nn/runtime/ModelArgumentInfo.cpp
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define LOG_TAG "ModelArgumentInfo"
+
+#include "ModelArgumentInfo.h"
+
+#include <algorithm>
+#include <vector>
+
+#include "HalInterfaces.h"
+#include "NeuralNetworks.h"
+#include "TypeManager.h"
+#include "Utils.h"
+
+namespace android {
+namespace nn {
+
+using namespace hal;
+
+int ModelArgumentInfo::setFromPointer(const Operand& operand,
+ const ANeuralNetworksOperandType* type, void* data,
+ uint32_t length) {
+ if ((data == nullptr) != (length == 0)) {
+ const char* dataPtrMsg = data ? "NOT_NULLPTR" : "NULLPTR";
+ LOG(ERROR) << "Data pointer must be nullptr if and only if length is zero (data = "
+ << dataPtrMsg << ", length = " << length << ")";
+ return ANEURALNETWORKS_BAD_DATA;
+ }
+ if (data == nullptr) {
+ state = ModelArgumentInfo::HAS_NO_VALUE;
+ } else {
+ NN_RETURN_IF_ERROR(updateDimensionInfo(operand, type));
+ if (operand.type != OperandType::OEM) {
+ uint32_t neededLength = TypeManager::get()->getSizeOfData(operand.type, dimensions);
+ if (neededLength != length && neededLength != 0) {
+ LOG(ERROR) << "Setting argument with invalid length: " << length
+ << ", expected length: " << neededLength;
+ return ANEURALNETWORKS_BAD_DATA;
+ }
+ }
+ state = ModelArgumentInfo::POINTER;
+ }
+ buffer = data;
+ locationAndLength = {.poolIndex = 0, .offset = 0, .length = length};
+ return ANEURALNETWORKS_NO_ERROR;
+}
+
+int ModelArgumentInfo::setFromMemory(const Operand& operand, const ANeuralNetworksOperandType* type,
+ uint32_t poolIndex, uint32_t offset, uint32_t length) {
+ NN_RETURN_IF_ERROR(updateDimensionInfo(operand, type));
+ if (operand.type != OperandType::OEM) {
+ uint32_t neededLength = TypeManager::get()->getSizeOfData(operand.type, dimensions);
+ if (neededLength != length && neededLength != 0) {
+ LOG(ERROR) << "Setting argument with invalid length: " << length
+ << ", expected length: " << neededLength;
+ return ANEURALNETWORKS_BAD_DATA;
+ }
+ }
+
+ state = ModelArgumentInfo::MEMORY;
+ locationAndLength = {.poolIndex = poolIndex, .offset = offset, .length = length};
+ buffer = nullptr;
+ return ANEURALNETWORKS_NO_ERROR;
+}
+
+int ModelArgumentInfo::setFromTemporaryMemory(const Operand& operand, uint32_t poolIndex,
+ uint32_t offset, uint32_t length) {
+ NN_RETURN_IF_ERROR(updateDimensionInfo(operand, nullptr));
+ if (operand.type != OperandType::OEM) {
+ uint32_t neededLength = TypeManager::get()->getSizeOfData(operand.type, dimensions);
+ if (neededLength != length) {
+ LOG(ERROR) << "Setting argument with invalid length: " << length
+ << ", expected length: " << neededLength;
+ return ANEURALNETWORKS_BAD_DATA;
+ }
+ }
+
+ state = ModelArgumentInfo::MEMORY;
+ locationAndLength = {
+ .poolIndex = poolIndex,
+ .offset = offset,
+ .length = length,
+ };
+ buffer = nullptr;
+ return ANEURALNETWORKS_NO_ERROR;
+}
+
+int ModelArgumentInfo::updateDimensionInfo(const Operand& operand,
+ const ANeuralNetworksOperandType* newType) {
+ if (newType == nullptr) {
+ dimensions = operand.dimensions;
+ } else {
+ const uint32_t count = newType->dimensionCount;
+ dimensions = hidl_vec<uint32_t>(count);
+ std::copy(&newType->dimensions[0], &newType->dimensions[count], dimensions.begin());
+ }
+ return ANEURALNETWORKS_NO_ERROR;
+}
+
+} // namespace nn
+} // namespace android
diff --git a/nn/runtime/ModelArgumentInfo.h b/nn/runtime/ModelArgumentInfo.h
new file mode 100644
index 000000000..68403246d
--- /dev/null
+++ b/nn/runtime/ModelArgumentInfo.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2019 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_FRAMEWORKS_ML_NN_RUNTIME_MODEL_ARGUMENT_INFO_H
+#define ANDROID_FRAMEWORKS_ML_NN_RUNTIME_MODEL_ARGUMENT_INFO_H
+
+#include <vector>
+
+#include "HalInterfaces.h"
+#include "NeuralNetworks.h"
+
+namespace android {
+namespace nn {
+
+// TODO move length out of DataLocation
+struct ModelArgumentInfo {
+ // Whether the argument was specified as being in a Memory, as a pointer,
+ // has no value, or has not been specified.
+ // If POINTER then:
+ // locationAndLength.length is valid.
+ // dimensions is valid.
+ // buffer is valid
+ // If MEMORY then:
+ // locationAndLength.{poolIndex, offset, length} is valid.
+ // dimensions is valid.
+ enum { POINTER, MEMORY, HAS_NO_VALUE, UNSPECIFIED } state = UNSPECIFIED;
+ hal::DataLocation locationAndLength;
+ std::vector<uint32_t> dimensions;
+ void* buffer;
+ bool isSufficient = true;
+
+ int setFromPointer(const hal::Operand& operand, const ANeuralNetworksOperandType* type,
+ void* buffer, uint32_t length);
+ int setFromMemory(const hal::Operand& operand, const ANeuralNetworksOperandType* type,
+ uint32_t poolIndex, uint32_t offset, uint32_t length);
+ int setFromTemporaryMemory(const hal::Operand& operand, uint32_t poolIndex, uint32_t offset,
+ uint32_t length);
+ int updateDimensionInfo(const hal::Operand& operand, const ANeuralNetworksOperandType* newType);
+};
+
+} // namespace nn
+} // namespace android
+
+#endif // ANDROID_FRAMEWORKS_ML_NN_RUNTIME_MODEL_ARGUMENT_INFO_H \ No newline at end of file