author     David Gross <dgross@google.com>  2020-09-29 19:44:18 +0000
committer  Gerrit Code Review <noreply-gerritcodereview@google.com>  2020-09-29 19:44:18 +0000
commit     3ab5ca5044de7f1f5929cd90147503025f2982e1 (patch)
tree       608562c909d4b71ece25cd40fb0e135db766a55d
parent     6c03731d6e9450068d8cf3866edcec185326fa42 (diff)
parent     9d1874d82fd53740c2248afff1d8ad20200c99fa (diff)
download   ml-3ab5ca5044de7f1f5929cd90147503025f2982e1.tar.gz
Merge changes I388f14fe,I11e0e5bb,I52e7179f
* changes:
  model output of unspecified shape as partition input should not force CPU fallback
  More testing for partitions with boundary temporaries of unknown size.
  Partial fix to allow partitions to have boundary temporaries of unknown size.
-rw-r--r--  nn/common/CpuExecutor.cpp                    |   2
-rw-r--r--  nn/common/OperationsUtils.cpp                |   2
-rw-r--r--  nn/common/ValidateHal.cpp                    |   5
-rw-r--r--  nn/runtime/CompilationBuilder.cpp            |  20
-rw-r--r--  nn/runtime/CompilationBuilder.h              |  16
-rw-r--r--  nn/runtime/ExecutionBuilder.cpp              | 449
-rw-r--r--  nn/runtime/ExecutionBuilder.h                |  84
-rw-r--r--  nn/runtime/ExecutionPlan.cpp                 | 447
-rw-r--r--  nn/runtime/ExecutionPlan.h                   | 234
-rw-r--r--  nn/runtime/Manager.cpp                       |   2
-rw-r--r--  nn/runtime/ModelBuilder.h                    |   4
-rw-r--r--  nn/runtime/NeuralNetworks.cpp                |  20
-rw-r--r--  nn/runtime/VersionedInterfaces.cpp           |   1
-rw-r--r--  nn/runtime/include/NeuralNetworksOEM.h       |   7
-rw-r--r--  nn/runtime/test/TestExecution.cpp            |  18
-rw-r--r--  nn/runtime/test/TestGenerated.cpp            |  30
-rw-r--r--  nn/runtime/test/TestNeuralNetworksWrapper.h  |  32
-rw-r--r--  nn/runtime/test/TestPartitioning.cpp         | 545
-rw-r--r--  nn/runtime/test/TestPartitioningRandom.cpp   | 142
19 files changed, 1644 insertions(+), 416 deletions(-)
diff --git a/nn/common/CpuExecutor.cpp b/nn/common/CpuExecutor.cpp
index 9f2477592..8d23c0a15 100644
--- a/nn/common/CpuExecutor.cpp
+++ b/nn/common/CpuExecutor.cpp
@@ -1914,6 +1914,8 @@ void CpuExecutor::setOutputShapes(const std::vector<uint32_t>& outputIndexes,
const RunTimeOperandInfo& from = operands[operandIndex];
mOutputShapes[i].dimensions = from.dimensions;
mOutputShapes[i].isSufficient = from.isSufficient();
+ VLOG(EXECUTION) << "CpuExecutor::setOutputShapes: mOutputShapes[" << i
+ << "] = " << toString(mOutputShapes[i]);
}
}
diff --git a/nn/common/OperationsUtils.cpp b/nn/common/OperationsUtils.cpp
index d1814b7c8..f0bcb0ed7 100644
--- a/nn/common/OperationsUtils.cpp
+++ b/nn/common/OperationsUtils.cpp
@@ -356,7 +356,7 @@ bool calculateBroadcastedShape(const Shape& in1, const Shape& in2, Shape* out) {
if (dim1 != dim2 && dim1 != 1 && dim2 != 1) {
LOG(ERROR) << "Dimensions mismatch for broadcast:\n"
<< "First tensor: dimension " << numberOfDims1 - i << " of size " << dim1
- << "\nSecond tensor: dimension " << numberOfDims2 - i << "of size " << dim2;
+ << "\nSecond tensor: dimension " << numberOfDims2 - i << " of size " << dim2;
return false;
}
out->dimensions[maxDims - i] = (dim1 == 1) ? dim2 : dim1;
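The check above enforces standard broadcasting: shapes are aligned from the
trailing dimension, and two sizes are compatible when they match or when
either is 1. A minimal standalone sketch of the rule (illustrative only, not
part of this change):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    static bool broadcastShapes(const std::vector<uint32_t>& a,
                                const std::vector<uint32_t>& b,
                                std::vector<uint32_t>* out) {
        const size_t rank = std::max(a.size(), b.size());
        out->assign(rank, 0);
        for (size_t i = 1; i <= rank; ++i) {
            // Missing leading dimensions behave as size 1.
            const uint32_t da = i <= a.size() ? a[a.size() - i] : 1;
            const uint32_t db = i <= b.size() ? b[b.size() - i] : 1;
            if (da != db && da != 1 && db != 1) return false;  // mismatch
            (*out)[rank - i] = (da == 1) ? db : da;
        }
        return true;
    }

    // e.g. {2, 1, 5} and {4, 5} broadcast to {2, 4, 5}; {3} and {4} fail.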
diff --git a/nn/common/ValidateHal.cpp b/nn/common/ValidateHal.cpp
index 6470fbce1..46f9b2fce 100644
--- a/nn/common/ValidateHal.cpp
+++ b/nn/common/ValidateHal.cpp
@@ -782,8 +782,9 @@ static bool validateRequestArguments(const hidl_vec<RequestArgument>& requestArg
// extension operand type.
if (!isExtensionOperandType(operand.type) &&
!nonExtensionOperandTypeIsScalar(static_cast<int>(operand.type))) {
- NN_RET_CHECK_GT(modelRank, 0) << "Model has unknown rank but the request "
- "does not specify the rank.";
+ NN_RET_CHECK_GT(modelRank, 0)
+ << "Model " << type << " " << requestArgumentIndex
+ << " has unknown rank but the request does not specify the rank.";
}
// Validate that all the dimensions are specified in the model.
for (size_t i = 0; i < modelRank; i++) {
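The rule being enforced: for a non-extension, non-scalar operand whose rank
is unknown in the model, the request argument must supply the rank. A
simplified sketch of the check in isolation (types and helper name are
assumptions, not the real ValidateHal API):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // modelDims: operand dimensions from the model; requestDims: dimensions
    // supplied by the request argument (empty means "unspecified").
    static bool rankIsKnown(const std::vector<uint32_t>& modelDims,
                            const std::vector<uint32_t>& requestDims) {
        const size_t effectiveRank =
                requestDims.empty() ? modelDims.size() : requestDims.size();
        return effectiveRank > 0;  // someone must specify the rank
    }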
diff --git a/nn/runtime/CompilationBuilder.cpp b/nn/runtime/CompilationBuilder.cpp
index 8b2a26915..051ac886c 100644
--- a/nn/runtime/CompilationBuilder.cpp
+++ b/nn/runtime/CompilationBuilder.cpp
@@ -63,7 +63,8 @@ int CompilationBuilder::finish() {
mPlan.setCaching(&mCacheDir, mToken);
}
if (mPartitioning) {
- int n = mModel->partitionTheWork(mDevices, mPreference, mPriority, deadline, &mPlan);
+ int n = mModel->partitionTheWork(mDevices, mPreference, mPriority, deadline, &mPlan,
+ mFailPartitioning);
switch (n) {
case ANEURALNETWORKS_NO_ERROR:
return n;
@@ -96,7 +97,7 @@ int CompilationBuilder::finish() {
VLOG(COMPILATION) << "CompilationBuilder::finish with CPU fallback";
mPlan.reset();
mPlan.becomeSingleStep(DeviceManager::getCpuDevice(), mModel);
- return mPlan.finish(mPreference, mPriority, deadline);
+ return mPlan.finish(mPreference, mPriority, deadline, ANEURALNETWORKS_NO_ERROR);
}
int CompilationBuilder::setPreference(int32_t preference) {
@@ -166,9 +167,9 @@ int CompilationBuilder::setTimeoutDuration(uint64_t duration) {
return ANEURALNETWORKS_NO_ERROR;
}
-int CompilationBuilder::setPartitioning(uint32_t partitioning) {
+int CompilationBuilder::forTest_setPartitioning(uint32_t partitioning) {
if (mFinished) {
- LOG(ERROR) << "ANeuralNetworksCompilation_setPartitioning can't modify after compilation "
+ LOG(ERROR) << "CompilationBuilder::forTest_setPartitioning can't modify after compilation "
"finished";
return ANEURALNETWORKS_BAD_STATE;
}
@@ -177,6 +178,17 @@ int CompilationBuilder::setPartitioning(uint32_t partitioning) {
return ANEURALNETWORKS_NO_ERROR;
}
+int CompilationBuilder::forTest_failPartitioning(int fail) {
+ if (mFinished) {
+ LOG(ERROR) << "CompilationBuilder::forTest_failPartitioning can't modify after compilation "
+ "finished";
+ return ANEURALNETWORKS_BAD_STATE;
+ }
+
+ mFailPartitioning = fail;
+ return ANEURALNETWORKS_NO_ERROR;
+}
+
int CompilationBuilder::createExecution(ExecutionBuilder** execution) {
if (!mFinished) {
LOG(ERROR) << "ANeuralNetworksExecution_create passed an unfinished compilation";
diff --git a/nn/runtime/CompilationBuilder.h b/nn/runtime/CompilationBuilder.h
index d94fb18df..0f2db4d4f 100644
--- a/nn/runtime/CompilationBuilder.h
+++ b/nn/runtime/CompilationBuilder.h
@@ -47,8 +47,6 @@ class CompilationBuilder {
int setPreference(int32_t preference);
- int setPartitioning(uint32_t partitioning);
-
int setCaching(const std::string& cacheDir, const uint8_t* token);
int setPriority(int32_t priority);
@@ -66,10 +64,17 @@ class CompilationBuilder {
int forEachStepRoleOfInput(uint32_t index, const StepRoleCallback& callback) const;
int forEachStepRoleOfOutput(uint32_t index, const StepRoleCallback& callback) const;
- const ExecutionPlan& forTest_getExecutionPlan() const { return mPlan; }
-
bool createdWithExplicitDeviceList() const { return mExplicitDeviceList; }
+ bool hasDynamicTemporaries() const { return mPlan.hasDynamicTemporaries(); }
+
+ // These functions are solely intended for use by unit tests of the
+ // partitioning algorithm.
+ const ExecutionPlan& forTest_getExecutionPlan() const { return mPlan; }
+ int forTest_setPartitioning(uint32_t partitioning);
+ int forTest_failPartitioning(
+ int resultCode); // If not ANEURALNETWORKS_NO_ERROR, then simulate partitioning failure
+
private:
const ModelBuilder* mModel;
@@ -83,6 +88,9 @@ class CompilationBuilder {
// we can override this later.
uint32_t mPartitioning;
+ // For testing purposes, simulate partitioning failure.
+ int mFailPartitioning = ANEURALNETWORKS_NO_ERROR;
+
// Once the compilation has been finished, we should not allow further
// modifications to the compilation.
bool mFinished = false;
diff --git a/nn/runtime/ExecutionBuilder.cpp b/nn/runtime/ExecutionBuilder.cpp
index 0f94e4396..8b6b81758 100644
--- a/nn/runtime/ExecutionBuilder.cpp
+++ b/nn/runtime/ExecutionBuilder.cpp
@@ -20,6 +20,7 @@
#include <algorithm>
#include <limits>
+#include <map>
#include <memory>
#include <mutex>
#include <optional>
@@ -46,6 +47,66 @@ namespace nn {
using namespace hal;
+// Partial validation of output shapes returned from driver, to ensure they
+// conform to a very specific set of rules.
+static bool validateOutputShapesFromDriver(ErrorStatus executionStatus, const ModelBuilder* model,
+ const std::vector<hal::OutputShape>& shapes) {
+ // Enforces the following rules (some of which are from b/154054474):
+ // - shapes vector is empty except in the case of NONE or OUTPUT_INSUFFICIENT_SIZE.
+ // If the vector is not empty, it must have as many entries as the step model has outputs.
+ // - If NONE, then either shapes vector is empty, or every shape is
+ // marked isSufficient and, if a tensor, has known rank.
+ // - If OUTPUT_INSUFFICIENT_SIZE, then the vector is not empty. At least one entry
+ // is marked !isSufficient.
+ switch (executionStatus) {
+ case ErrorStatus::NONE: {
+ NN_RET_CHECK(shapes.size() == 0 || shapes.size() == model->outputCount())
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " output shapes vector must be empty or of length " << model->outputCount()
+ << " but has length " << shapes.size();
+ NN_RET_CHECK(std::all_of(shapes.begin(), shapes.end(),
+ [](const OutputShape& shape) { return shape.isSufficient; }))
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " at least one output shape is unexpectedly marked !isSufficient";
+
+ const TypeManager* tm = TypeManager::get();
+ for (uint32_t outputIndex = 0, outputCount = shapes.size(); outputIndex < outputCount;
+ ++outputIndex) {
+ const hal::Operand& outputOperand = model->getOutputOperand(outputIndex);
+ NN_RET_CHECK(!tm->isTensorType(outputOperand.type) ||
+ (shapes[outputIndex].dimensions.size() != 0))
+ << "With execution ErrorStatus " << toString(executionStatus) << " output#"
+ << outputIndex << " shape unexpectedly has zero rank";
+ }
+
+ break;
+ }
+ case ErrorStatus::OUTPUT_INSUFFICIENT_SIZE: {
+ NN_RET_CHECK(shapes.size() == model->outputCount())
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " output shapes vector must be of length " << model->outputCount()
+ << " but has length " << shapes.size();
+ NN_RET_CHECK(std::any_of(shapes.begin(), shapes.end(),
+ [](const OutputShape& shape) { return !shape.isSufficient; }))
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " at least one output shape must have been marked !isSufficient";
+ break;
+ }
+ default: {
+ NN_RET_CHECK(shapes.size() == 0)
+ << "With execution ErrorStatus " << toString(executionStatus)
+ << " output shapes vector must be empty but has length " << shapes.size();
+ break;
+ }
+ }
+ return true;
+}
+static bool validateOutputShapesFromDriver(int executionResultCode, const ModelBuilder* model,
+ const std::vector<hal::OutputShape>& shapes) {
+ return validateOutputShapesFromDriver(convertResultCodeToErrorStatus(executionResultCode),
+ model, shapes);
+}
+
const Timing kNoTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
static MeasureTiming measureTiming(const ExecutionBuilder* execution) {
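Illustrative outcomes under the validation rules above, for a step model
with two tensor outputs (dimension/isSufficient values invented for the
example):

    // status             shapes                             result
    // NONE               {}                                 valid
    // NONE               {({2,3},suff), ({4},suff)}         valid
    // NONE               {({2,3},!suff), ({4},suff)}        invalid
    // NONE               {({},suff), ({4},suff)}            invalid: rank 0
    // INSUFFICIENT_SIZE  {}                                 invalid: need 2
    // INSUFFICIENT_SIZE  {({2,3},suff), ({4},!suff)}        valid
    // GENERAL_FAILURE    {({2,3},suff)}                     invalid: need {}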
@@ -483,7 +544,7 @@ cpuFallbackPartial(const ExecutionPlan& plan,
// Get fallback executor.
std::shared_ptr<StepExecutor> executor;
- int n1 = plan.fallback(controller, &executor);
+ int n1 = plan.fallback(controller, &executor, nullptr, nullptr);
if (n1 != ANEURALNETWORKS_NO_ERROR) {
return {n1, {}, kNoTiming, nullptr};
}
@@ -497,7 +558,7 @@ cpuFallbackPartial(const ExecutionPlan& plan,
static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
const ExecutionPlan& plan,
std::shared_ptr<ExecutionPlan::Controller> controller,
- bool allowFallback,
+ bool allowCpuFallback,
const std::optional<Deadline>& deadline,
const sp<ExecutionCallback>& executionCallback) {
CHECK(executionBuilder != nullptr);
@@ -505,8 +566,12 @@ static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
std::vector<OutputShape> outputShapes = executionBuilder->getInitialOutputShapes();
Timing timing = kNoTiming;
- // Disallow fallback when the ExecutionPlan is simple on CPU.
- allowFallback &= !plan.isSimpleCpu();
+ // Disallow CPU fallback when the ExecutionPlan is simple on CPU.
+ allowCpuFallback &= !plan.isSimpleCpu();
+
+ // On this iteration, do I need to repeat the previous step because it
+ // reported insufficient size?
+ bool doInsufficientSizeFallback = false;
while (true) {
VLOG(EXECUTION) << "looking for next StepExecutor";
@@ -514,13 +579,16 @@ static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
// Get the current step of the execution.
std::shared_ptr<StepExecutor> executor;
std::shared_ptr<ExecutionBurstController> burstController;
- int n = plan.next(controller, &executor, &burstController);
+ int n = doInsufficientSizeFallback
+ ? plan.fallback(controller, &executor, &burstController, &outputShapes)
+ : plan.next(controller, &executor, &burstController, &outputShapes);
+ doInsufficientSizeFallback = false;
if (n != ANEURALNETWORKS_NO_ERROR) {
// During the interpreted execution of control flow, a loop timeout
// might occur in ExecutionPlan::next().
bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
- if (allowFallback && !missedDeadline) break;
+ if (allowCpuFallback && !missedDeadline) break;
executionCallback->notify(convertResultCodeToErrorStatus(n), {}, kNoTiming);
return;
}
@@ -536,36 +604,57 @@ static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
// Attempt to execute a single step of the execution.
auto [stepN, stepOutputShapes, stepTiming] = executor->compute(deadline, burstController);
- // Update global outputs.
- if (!executor->updateOutputShapes(stepOutputShapes, &outputShapes)) {
+ // Update global outputs and dynamic temporaries.
+ StepExecutor::UpdateOutputShapes updateOutputShapes = {};
+ if (!executor->updateOutputShapes(stepN, stepOutputShapes, &outputShapes,
+ &updateOutputShapes)) {
stepN = ANEURALNETWORKS_OP_FAILED;
}
// If execution was successful, continue to next step.
if (stepN == ANEURALNETWORKS_NO_ERROR) {
- // We only support collection of timing information in the case of a
- // single step, so it's safe to just keep track of the last step's
- // timing information.
- timing = stepTiming;
- continue;
+ if (updateOutputShapes.zeroSizedInput) {
+ // We'll need to do full model CPU fallback
+ VLOG(EXECUTION) << "updateOutputShapes.zeroSizedInput";
+ stepN = ANEURALNETWORKS_OP_FAILED;
+ } else {
+ CHECK(executor->areDynamicTemporariesAllocated());
+ // We only support collection of timing information in the case
+ // of a single step, so it's safe to just keep track of the last
+ // step's timing information.
+ timing = stepTiming;
+ continue;
+ }
}
- // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
if (stepN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
- const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
- executionCallback->notify(stepStatus, outputShapes, kNoTiming);
- return;
+ VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: " << toString(updateOutputShapes);
+ if (updateOutputShapes.mainOutputInsufficient ||
+ !updateOutputShapes.updatedDynamicTemporary) {
+ // Either:
+ // - At least one main model output is not of sufficient size; or
+ // - we didn't learn anything new about dynamic temporaries.
+ // Neither of these is recoverable, so end execution.
+ const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
+ executionCallback->notify(stepStatus, outputShapes, kNoTiming);
+ return;
+ }
+ // Every main model output is of sufficient size. This implies that
+ // at least one dynamic temporary is not of sufficient size. This
+ // is recoverable.
+ doInsufficientSizeFallback = true;
+ continue;
}
- // If fallback is not allowed and there was an error, end execution.
- if (!allowFallback) {
+ // If CPU fallback is not allowed and there was an error, end execution.
+ if (!allowCpuFallback) {
const ErrorStatus stepStatus = convertResultCodeToErrorStatus(stepN);
executionCallback->notify(stepStatus, {}, kNoTiming);
return;
}
// If CPU execution was already attempted, either:
- // (1) perform a full fallback if the plan is not simple, or
+ // (1) perform a full CPU fallback if the plan is not simple, or
// (2) return from the function with an error
if (executorIsCpu) {
if (!plan.isSimple()) break;
@@ -574,42 +663,77 @@ static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
}
// If the code reaches this point, attempt a partial fallback to CPU.
- CHECK(allowFallback);
- auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
- cpuFallbackPartial(plan, controller);
-
- // Update global outputs.
- if (fallbackExecutor != nullptr &&
- !fallbackExecutor->updateOutputShapes(fallbackOutputShapes, &outputShapes)) {
- fallbackN = ANEURALNETWORKS_OP_FAILED;
+ CHECK(allowCpuFallback);
+ if (updateOutputShapes.zeroSizedInput) {
+ // Do not attempt a partial fallback.
+ break;
}
+ while (true) {
+ auto [fallbackN, fallbackOutputShapes, fallbackTiming, fallbackExecutor] =
+ cpuFallbackPartial(plan, controller);
+
+ // Update global outputs and dynamic temporaries.
+ StepExecutor::UpdateOutputShapes fallbackUpdateOutputShapes = {};
+ if (fallbackExecutor != nullptr &&
+ !fallbackExecutor->updateOutputShapes(fallbackN, fallbackOutputShapes,
+ &outputShapes, &fallbackUpdateOutputShapes)) {
+ fallbackN = ANEURALNETWORKS_OP_FAILED;
+ }
- // If execution was successful, continue to next step.
- if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
- // We only support collection of timing information in the case of a
- // single step, so it's safe to just keep track of the last step's
- // timing information.
- timing = fallbackTiming;
- continue;
- }
+ // If execution was successful, continue to next step.
+ if (fallbackN == ANEURALNETWORKS_NO_ERROR) {
+ if (fallbackUpdateOutputShapes.zeroSizedInput) {
+ // We'll need to do full model CPU fallback
+ VLOG(EXECUTION) << "fallbackUpdateOutputShapes.zeroSizedInput";
+ fallbackN = ANEURALNETWORKS_OP_FAILED;
+ break;
+ }
+ CHECK(fallbackExecutor->areDynamicTemporariesAllocated());
+ // We only support collection of timing information in the case of a
+ // single step, so it's safe to just keep track of the last step's
+ // timing information.
+ timing = fallbackTiming;
+ goto nextStep;
+ }
- // OUTPUT_INSUFFICIENT_SIZE is not recoverable, so end execution.
- if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
- const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
- executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
- return;
- }
+ if (fallbackN == ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE) {
+ VLOG(EXECUTION) << "OUTPUT_INSUFFICIENT_SIZE: "
+ << toString(fallbackUpdateOutputShapes);
+ if (fallbackUpdateOutputShapes.mainOutputInsufficient ||
+ !fallbackUpdateOutputShapes.updatedDynamicTemporary) {
+ // Either:
+ // - At least one main model output is not of sufficient size; or
+ // - we didn't learn anything new about dynamic temporaries.
+ // Neither of these is recoverable, so end execution.
+ const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
+ executionCallback->notify(fallbackStatus, outputShapes, kNoTiming);
+ return;
+ }
+ // Every main model output is of sufficient size. This implies
+ // that at least one dynamic temporary is not of sufficient
+ // size. This is recoverable.
+ continue;
+ }
- // Do not fallback twice if the ExecutionPlan is simple.
- if (plan.isSimple()) {
- const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
- executionCallback->notify(fallbackStatus, {}, kNoTiming);
- return;
+ // Do not fallback twice if the ExecutionPlan is simple.
+ if (plan.isSimple()) {
+ const ErrorStatus fallbackStatus = convertResultCodeToErrorStatus(fallbackN);
+ executionCallback->notify(fallbackStatus, {}, kNoTiming);
+ return;
+ }
+
+ // If the code reaches this point, then there was an error with the
+ // fallback. In this case, attempt full fallback.
+ break;
}
// If the code reaches this point, then there was an error with the
// fallback. In this case, attempt full fallback.
break;
+
+ nextStep:
+ // Bottom of the outer loop
+ continue;
}
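Condensed control flow of the step loop above (summary only, names as in
the surrounding code):

    // for each step:
    //   executor = doInsufficientSizeFallback ? plan.fallback(...)
    //                                         : plan.next(...)
    //   stepN = executor->compute(...); updateOutputShapes(stepN, ...)
    //   NO_ERROR && zeroSizedInput          -> treat as OP_FAILED,
    //                                          full CPU fallback
    //   NO_ERROR                            -> advance to the next step
    //   OUTPUT_INSUFFICIENT_SIZE &&
    //     (mainOutputInsufficient ||
    //      !updatedDynamicTemporary)        -> unrecoverable, notify & return
    //   OUTPUT_INSUFFICIENT_SIZE otherwise  -> a dynamic temporary grew;
    //                                          retry the same step
    //   any other error                     -> partial CPU fallback if
    //                                          allowed, else notify & return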
// If the code has reached this point, a potentially recoverable error
@@ -623,16 +747,28 @@ static void asyncStartComputePartitioned(ExecutionBuilder* executionBuilder,
// In case of partitioned execution, startComputeFenced call will return the sync
// fence and the fenced compute callback returned from the last partition.
// Any failed partition will result in the whole execution fallback to CPU if
-// allowFallback is set to true.
+// allowCpuFallback is set to true.
static std::tuple<int, int, sp<hal::IFencedExecutionCallback>> startComputeFenced(
ExecutionBuilder* executionBuilder, const ExecutionPlan& plan,
std::shared_ptr<ExecutionPlan::Controller> controller, const std::vector<int>& waitFor,
uint64_t timeoutDurationAfterFence, const std::optional<Deadline>& deadline,
- bool allowFallback) {
+ bool allowCpuFallback) {
+ // We should have detected this earlier in the call chain and fallen back to
+ // non-fenced execution. This is an implementation limitation: In order to
+ // support dynamic temporaries in this code, we'd need to implement
+ // something like the following:
+ // - If a partition has outputs of unknown size, execute that partition in a
+ // non-fenced fashion, just as if it were scheduled on a driver that does
+ // not support fenced execution.
+ // - Implement something similar to the code in asyncStartComputePartitioned()
+ // that handles a step execution that fails with
+ // ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE.
+ CHECK(!executionBuilder->getCompilation()->hasDynamicTemporaries());
+
CHECK(executionBuilder != nullptr);
VLOG(EXECUTION) << "ExecutionBuilder::computeFenced (from plan, iteratively)";
// Disallow fallback when the ExecutionPlan is simple on CPU.
- allowFallback &= !plan.isSimpleCpu();
+ allowCpuFallback &= !plan.isSimpleCpu();
// Initiate waitForFds, syncFence for the first step.
std::vector<int> waitForFds = waitFor;
@@ -644,13 +780,13 @@ static std::tuple<int, int, sp<hal::IFencedExecutionCallback>> startComputeFence
// Get the current step of the execution.
std::shared_ptr<StepExecutor> executor;
- int n = plan.next(controller, &executor, nullptr, syncFence);
+ int n = plan.next(controller, &executor, nullptr, nullptr, syncFence);
if (n != ANEURALNETWORKS_NO_ERROR) {
// During the interpreted execution of control flow, a loop timeout
// might occur in ExecutionPlan::next().
bool missedDeadline = n == ANEURALNETWORKS_MISSED_DEADLINE_TRANSIENT ||
n == ANEURALNETWORKS_MISSED_DEADLINE_PERSISTENT;
- if (allowFallback && !missedDeadline) break;
+ if (allowCpuFallback && !missedDeadline) break;
// Return -1 for the sync fence fd, and nullptr for the callback.
return std::make_tuple(n, -1, nullptr);
}
@@ -686,8 +822,8 @@ static std::tuple<int, int, sp<hal::IFencedExecutionCallback>> startComputeFence
if (stepN == ANEURALNETWORKS_NO_ERROR) {
continue;
}
- // If fallback is not allowed and there was an error, end execution.
- if (!allowFallback) {
+ // If CPU fallback is not allowed and there was an error, end execution.
+ if (!allowCpuFallback) {
return std::make_tuple(stepN, -1, nullptr);
}
@@ -767,12 +903,13 @@ int ExecutionBuilder::computeFenced(const std::vector<int>& waitFor,
}
}
mStarted = true;
- const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
+ const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
std::shared_ptr<ExecutionPlan::Controller> controller = mPlan->makeController(this, nullptr);
VLOG(EXECUTION) << "ExecutionBuilder::computeFenced";
int result;
- std::tie(result, mSyncFenceFd, mFencedExecutionCallback) = startComputeFenced(
- this, *mPlan, controller, waitFor, timeoutDurationAfterFence, deadline, allowFallback);
+ std::tie(result, mSyncFenceFd, mFencedExecutionCallback) =
+ startComputeFenced(this, *mPlan, controller, waitFor, timeoutDurationAfterFence,
+ deadline, allowCpuFallback);
*syncFence = mSyncFenceFd;
return result;
}
@@ -826,14 +963,18 @@ int ExecutionBuilder::compute(sp<ExecutionCallback>* synchronizationCallback,
// asynchronous thread -- take the asynchronous thread logic out of
// CpuPreparedModel::execute() and use it to wrap the plan-based-path.
mStarted = true;
- const bool allowFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
+ const bool allowCpuFallback = DeviceManager::partitioningAllowsFallback(mPartitioning);
std::shared_ptr<ExecutionPlan::Controller> controller =
mPlan->makeController(this, burstBuilder);
if (synchronous) {
- VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
+ if (burstBuilder) {
+ VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API, burst)";
+ } else {
+ VLOG(EXECUTION) << "ExecutionBuilder::compute (synchronous API)";
+ }
sp<ExecutionCallback> localSynchronizationCallback = new ExecutionCallback();
localSynchronizationCallback->setOnFinish(wrappedFinish);
- asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
+ asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
localSynchronizationCallback);
localSynchronizationCallback->wait();
if (mMeasureTiming) {
@@ -854,13 +995,13 @@ int ExecutionBuilder::compute(sp<ExecutionCallback>* synchronizationCallback,
executionCallback->setOnFinish(wrappedFinish);
if (DeviceManager::get()->syncExecRuntime()) {
VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API, non-threaded)";
- asyncStartComputePartitioned(this, *mPlan, controller, allowFallback, deadline,
+ asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback, deadline,
executionCallback);
} else {
VLOG(EXECUTION) << "ExecutionBuilder::compute (asynchronous API)";
std::thread asyncExecution(
- [this, controller, allowFallback, deadline, executionCallback] {
- asyncStartComputePartitioned(this, *mPlan, controller, allowFallback,
+ [this, controller, allowCpuFallback, deadline, executionCallback] {
+ asyncStartComputePartitioned(this, *mPlan, controller, allowCpuFallback,
deadline, executionCallback);
});
executionCallback->bindThread(std::move(asyncExecution));
@@ -884,7 +1025,7 @@ std::vector<OutputShape> ExecutionBuilder::getInitialOutputShapes() const {
}
// Check if the dimensions "to" is updatable by dimensions "from", where "from" must
-// have a higher specification level.
+// have no lower a specification level.
static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint32_t>& from) {
if (to.size() == 0) return true;
NN_RET_CHECK_EQ(to.size(), from.size());
@@ -894,7 +1035,17 @@ static bool isUpdatable(const std::vector<uint32_t>& to, const std::vector<uint3
return true;
}
-bool ExecutionBuilder::updateOutputShapes(const std::vector<OutputShape>& outputShapes) {
+static bool isZeroSizedTensor(int executionResultCode, const OutputShape& outputShape) {
+ return (executionResultCode == ANEURALNETWORKS_NO_ERROR) && outputShape.isSufficient &&
+ outputShape.dimensions.size() &&
+ (std::find(outputShape.dimensions.begin(), outputShape.dimensions.end(), uint32_t(0)) !=
+ outputShape.dimensions.end());
+}
+
+bool ExecutionBuilder::updateOutputShapes(ErrorStatus status,
+ const std::vector<OutputShape>& outputShapes) {
+ NN_RET_CHECK(validateOutputShapesFromDriver(status, mModel, outputShapes));
+
if (outputShapes.size() == 0) {
return true;
}
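The isUpdatable() predicate above defines when newly reported dimensions may
replace stored ones: an empty "to" (unknown rank) accepts anything;
otherwise ranks must match and every already-known dimension must be
preserved. A standalone sketch with sample values, mirroring that code:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    static bool isUpdatableSketch(const std::vector<uint32_t>& to,
                                  const std::vector<uint32_t>& from) {
        if (to.empty()) return true;            // rank not yet known
        if (to.size() != from.size()) return false;
        for (size_t i = 0; i < to.size(); ++i) {
            // 0 means "unspecified"; a known dimension must not change.
            if (to[i] != from[i] && to[i] != 0) return false;
        }
        return true;
    }

    // isUpdatableSketch({},    {2,3}) == true   rank becomes known
    // isUpdatableSketch({0,3}, {2,3}) == true   dimension 0 gets specified
    // isUpdatableSketch({2,3}, {4,3}) == false  known dimension would change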
@@ -927,7 +1078,7 @@ ErrorStatus ExecutionBuilder::finishWithoutSyncFence(ErrorStatus status,
CHECK(!mFinishedWithoutSyncFence) << "ExecutionBuilder::finishWithoutSyncFence is called twice";
CHECK(!hasSyncFence())
<< "ExecutionBuilder::finishWithoutSyncFence is called when hasSyncFence()";
- if (!updateOutputShapes(outputShapes) || !updateMemories()) {
+ if (!updateOutputShapes(status, outputShapes) || !updateMemories()) {
status = ErrorStatus::GENERAL_FAILURE;
}
bool success = status == ErrorStatus::NONE;
@@ -951,19 +1102,124 @@ ErrorStatus ExecutionBuilder::finishWithoutSyncFence(ErrorStatus status,
return status;
}
-bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from,
- std::vector<OutputShape>* to) {
+std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes) {
+ return "{ .updatedDynamicTemporary = " +
+ std::to_string(updateOutputShapes.updatedDynamicTemporary) +
+ ", .mainOutputInsufficient = " +
+ std::to_string(updateOutputShapes.mainOutputInsufficient) + "}";
+}
+
+bool StepExecutor::updateOutputShapes(int executionResultCode, const std::vector<OutputShape>& from,
+ std::vector<OutputShape>* to, UpdateOutputShapes* update) {
+ CHECK(update != nullptr);
+ *update = {.updatedDynamicTemporary = false,
+ .mainOutputInsufficient = false,
+ .zeroSizedInput = false};
+
+ NN_RET_CHECK(validateOutputShapesFromDriver(executionResultCode, mModel, from));
+
if (from.size() == 0) {
return true;
}
+
+ if (VLOG_IS_ON(EXECUTION)) {
+ for (const auto& shape : from) {
+ VLOG(EXECUTION) << "updateOutputShapes: " << toString(shape);
+ }
+ }
+
if (mExecutionStep != nullptr) {
const auto& indexMapping = mExecutionStep->getOutputIndexStepModelToMainModel();
NN_RET_CHECK_LE(indexMapping.size(), from.size());
for (uint32_t i = 0, e = indexMapping.size(); i < e; i++) {
- uint32_t toIndex = indexMapping[i];
+ const uint32_t toIndex = indexMapping[i];
NN_RET_CHECK_GT(to->size(), toIndex);
NN_RET_CHECK(isUpdatable(to->at(toIndex).dimensions, from[i].dimensions));
(*to)[toIndex] = from[i];
+ update->mainOutputInsufficient |= !(*to)[toIndex].isSufficient;
+ if (mExecutionStep->getModelOutputsThatAreDownstreamInputs().count(toIndex) &&
+ isZeroSizedTensor(executionResultCode, from[i])) {
+ update->zeroSizedInput = true;
+ }
+ }
+
+ if (!mDynamicTemporaries->empty()) {
+ // TODO(b/157236079): Instead of computing this here, precompute it in ExecutionStep?
+ std::map<uint32_t, uint32_t> operandIndexStepModelOutputToSourceModelTemp;
+ for (const auto& entry : mExecutionStep->getTempsAsStepModelOutputs()) {
+ operandIndexStepModelOutputToSourceModelTemp.emplace(entry.second, entry.first);
+ }
+
+ const uint32_t sourceModelIndex = mExecutionStep->getSourceModelIndex();
+ for (uint32_t i = 0, e = mModel->outputCount(); i < e; i++) {
+ const uint32_t stepModelOperandIndex = mModel->getOutputOperandIndex(i);
+ const auto it =
+ operandIndexStepModelOutputToSourceModelTemp.find(stepModelOperandIndex);
+ if (it == operandIndexStepModelOutputToSourceModelTemp.end()) {
+ continue;
+ }
+ const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, it->second);
+ VLOG(EXECUTION) << "updateOutputShapes checking to see if output#" << i
+ << " sourceOperandIndex = (" << sourceOperandIndex.first << ", "
+ << sourceOperandIndex.second << ") is a dynamic temporary";
+ // This is a temporary, but it might not be a dynamic temporary.
+ const auto loc = mDynamicTemporaries->lookup(sourceOperandIndex, false);
+ if (loc == std::nullopt) {
+ continue;
+ }
+ NN_RET_CHECK(isUpdatable(*loc->dimensions, from[i].dimensions));
+ bool changedShape = false;
+ const uint32_t actualSize = TypeManager::get()->getSizeOfData(
+ mModel->getOperand(stepModelOperandIndex).type, from[i].dimensions);
+ if (actualSize > 0) {
+ changedShape = mDynamicTemporaries->redeclare(sourceOperandIndex,
+ from[i].dimensions, actualSize);
+ } else if (!from[i].isSufficient) {
+ NN_RET_CHECK(loc->length < UINT32_MAX / 2)
+ << "output#" << i << " length overflow";
+ changedShape = mDynamicTemporaries->redeclare(
+ sourceOperandIndex, from[i].dimensions, 2 * loc->length);
+ } else {
+ // The combination of not-fully-specified dimensions
+ // and isSufficient means that we have no
+ // information about whether the size of the dynamic
+ // temporary is adequate.
+ VLOG(EXECUTION) << "updateOutputShapes skipping redeclaration for output#" << i;
+ if (executionResultCode == ANEURALNETWORKS_NO_ERROR) {
+ NN_RET_CHECK(isZeroSizedTensor(executionResultCode, from[i]));
+ // This is a zero-sized tensor, and by
+ // definition, any dynamic temporary is an input
+ // to an execution step.
+ update->zeroSizedInput = true;
+ }
+ }
+ if (changedShape) {
+ // TODO: find a better place for this comment.
+ //
+ // isUpdatable(a, b) imposes a partial ordering a <=
+ // b. Every fully specified dimensions vector is an
+ // upper bound of that ordering. Therefore, any
+ // change in dimensions moves towards an upper
+ // bound, and hence there are a finite number of
+ // such changes possible.
+ //
+ // actualSize can only be computed from dimensions
+ // that are an upper bound. Therefore, once
+ // actualSize is computed, it will not change.
+ //
+ // If dimensions are not fully specified, and
+ // estimated size changes, it increases. There is
+ // an upper bound on estimated size to avoid
+ // overflow.
+ //
+ // Therefore, if we retry only when dimensions or
+ // size change, and we stop retrying if we would
+ // otherwise overflow, we should only retry a finite
+ // number of times.
+ update->updatedDynamicTemporary = true;
+ }
+ }
+ mDynamicTemporaries->vlogDump("finished updateOutputShapes");
}
} else {
NN_RET_CHECK_EQ(from.size(), to->size());
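The redeclaration branch above embodies a simple sizing policy for dynamic
temporaries, sketched here in isolation (assumed helper name, not the real
API):

    #include <cstdint>

    // Grow a dynamic temporary to the exact size once its dimensions are
    // fully specified; otherwise double on "insufficient size". The
    // overflow guard plus the monotonicity argued in the comment above
    // bound the number of retries.
    static uint32_t nextLength(uint32_t currentLength, uint32_t actualSize,
                               bool isSufficient) {
        if (actualSize > 0) return actualSize;   // exact size now known
        if (!isSufficient && currentLength < UINT32_MAX / 2) {
            return 2 * currentLength;            // geometric growth
        }
        return currentLength;                    // nothing new learned
    }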
@@ -977,19 +1233,26 @@ bool StepExecutor::updateOutputShapes(const std::vector<OutputShape>& from,
StepExecutor::StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
std::shared_ptr<Device> device,
- std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step)
+ std::shared_ptr<PreparedModel> preparedModel, const ExecutionStep* step,
+ DynamicTemporaries* dynamicTemporaries)
: mExecutionBuilder(executionBuilder),
mExecutionStep(step),
+ mDynamicTemporaries(dynamicTemporaries),
mModel(model),
mDevice(device),
mPreparedModel(preparedModel),
mInputs(model->inputCount()),
mOutputs(model->outputCount()) {
CHECK(mDevice != nullptr);
+ CHECK_EQ(step == nullptr, dynamicTemporaries == nullptr);
VLOG(EXECUTION) << "StepExecutor::StepExecutor with " << mInputs.size() << " inputs and "
<< mOutputs.size() << " outputs";
}
+bool StepExecutor::areDynamicTemporariesAllocated() const {
+ return !mDynamicTemporaries || mDynamicTemporaries->allocated(mExecutionStep->getIndex());
+}
+
void StepExecutor::mapInputsAndOutputsTrivially() {
mInputs = mExecutionBuilder->mInputs;
mOutputs = mExecutionBuilder->mOutputs;
@@ -997,17 +1260,28 @@ void StepExecutor::mapInputsAndOutputsTrivially() {
}
void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
- ModelArgumentInfo* executorInputOrOutput) {
+ ModelArgumentInfo* executorInputOrOutput,
+ const hidl_vec<uint32_t>* builderDimensions) {
+ auto updateDimensions = [executorInputOrOutput, builderDimensions] {
+ if (!builderDimensions) {
+ return;
+ }
+ executorInputOrOutput->dimensions() = *builderDimensions;
+ };
+
*executorInputOrOutput = builderInputOrOutput;
switch (executorInputOrOutput->state()) {
default:
CHECK(false) << "unexpected ModelArgumentInfo::state";
break;
case ModelArgumentInfo::HAS_NO_VALUE:
- case ModelArgumentInfo::POINTER:
case ModelArgumentInfo::UNSPECIFIED:
break;
+ case ModelArgumentInfo::POINTER:
+ updateDimensions();
+ break;
case ModelArgumentInfo::MEMORY: {
+ updateDimensions();
const uint32_t builderPoolIndex = builderInputOrOutput.locationAndLength().poolIndex;
const Memory* memory = mExecutionBuilder->mMemories[builderPoolIndex];
const uint32_t executorPoolIndex = mMemories.add(memory);
@@ -1019,33 +1293,56 @@ void StepExecutor::mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutpu
int StepExecutor::setInputOrOutputFromMemory(const Operand& inputOrOutputOperand,
const Memory* memory, uint32_t offset,
+ const hal::hidl_vec<uint32_t>& dimensions,
+ std::optional<uint32_t> length,
ModelArgumentInfo* inputOrOutputInfo) {
// Should be similar to
// ExecutionBuilder::setInputFromMemory()
// ExecutionBuilder::setOutputFromMemory()
uint32_t poolIndex = mMemories.add(memory);
- uint32_t length = TypeManager::get()->getSizeOfData(inputOrOutputOperand);
+ uint32_t lengthVal = length.value_or(TypeManager::get()->getSizeOfData(inputOrOutputOperand));
CHECK(inputOrOutputInfo->unspecified());
int n;
std::tie(n, *inputOrOutputInfo) =
ModelArgumentInfo::createFromMemory(inputOrOutputOperand,
- /*type=*/nullptr, poolIndex, offset, length);
+ /*type=*/nullptr, poolIndex, offset, lengthVal);
+ if (n == ANEURALNETWORKS_NO_ERROR && dimensions.size()) {
+ CHECK(isUpdatable(inputOrOutputInfo->dimensions(), dimensions));
+ inputOrOutputInfo->dimensions() = dimensions;
+ }
return n;
}
+static std::string toString(std::vector<uint32_t> dimensions) {
+ std::string ret = "(";
+ bool wroteOne = false;
+ for (uint32_t dimension : dimensions) {
+ if (wroteOne) {
+ ret += ", ";
+ } else {
+ wroteOne = true;
+ }
+ ret += std::to_string(dimension);
+ }
+ ret += ")";
+ return ret;
+};
+
static void logArguments(const char* kind, const std::vector<ModelArgumentInfo>& args) {
for (unsigned i = 0; i < args.size(); i++) {
const auto& arg = args[i];
std::string prefix = kind + std::string("[") + std::to_string(i) + "] = ";
switch (arg.state()) {
case ModelArgumentInfo::POINTER:
- VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ")";
+ VLOG(EXECUTION) << prefix << "POINTER(" << SHOW_IF_DEBUG(arg.buffer()) << ") dim"
+ << toString(arg.dimensions());
break;
case ModelArgumentInfo::MEMORY:
VLOG(EXECUTION) << prefix << "MEMORY("
<< "pool=" << arg.locationAndLength().poolIndex << ", "
- << "off=" << arg.locationAndLength().offset << ")";
+ << "off=" << arg.locationAndLength().offset << ") dim"
+ << toString(arg.dimensions());
break;
case ModelArgumentInfo::HAS_NO_VALUE:
VLOG(EXECUTION) << prefix << "HAS_NO_VALUE";
diff --git a/nn/runtime/ExecutionBuilder.h b/nn/runtime/ExecutionBuilder.h
index f61df4c0d..2540f233c 100644
--- a/nn/runtime/ExecutionBuilder.h
+++ b/nn/runtime/ExecutionBuilder.h
@@ -19,6 +19,7 @@
#include <atomic>
#include <memory>
+#include <string>
#include <tuple>
#include <utility>
#include <vector>
@@ -38,6 +39,7 @@ namespace nn {
class BurstBuilder;
class CompilationBuilder;
class Device;
+class DynamicTemporaries;
class ExecutionBurstController;
class ExecutionPlan;
class ExecutionStep;
@@ -134,7 +136,8 @@ class ExecutionBuilder {
const CompilationBuilder* mCompilation;
// Update output dimensional information from OutputShape to ModelArgumentInfo.
- bool updateOutputShapes(const std::vector<hal::OutputShape>& outputShapes);
+ bool updateOutputShapes(hal::ErrorStatus status,
+ const std::vector<hal::OutputShape>& outputShapes);
bool updateMemories();
@@ -226,9 +229,16 @@ class StepExecutor {
// Contains the output index mapping from the excerpted "step" model to
// main model if the execution has multiple "steps". Must be nullptr
// otherwise.
+ // (step == nullptr) == (dynamicTemporaries == nullptr)
+ // dynamicTemporaries
+ // If the execution has multiple "steps", describes the temporaries
+ // of source models that do not have fully specified types and are outputs
+ // of "step" models. Must be nullptr otherwise.
+ // (step == nullptr) == (dynamicTemporaries == nullptr)
StepExecutor(ExecutionBuilder* executionBuilder, const ModelBuilder* model,
std::shared_ptr<Device> device, std::shared_ptr<PreparedModel> preparedModel,
- const ExecutionStep* step = nullptr);
+ const ExecutionStep* step = nullptr,
+ DynamicTemporaries* dynamicTemporaries = nullptr);
// Map inputs and outputs from ExecutionBuilder to StepExecutor,
// in the case where we have a single-"step" execution (i.e., the executor
@@ -236,31 +246,53 @@ class StepExecutor {
void mapInputsAndOutputsTrivially();
// Update output shapes with shapes returned from execution.
- bool updateOutputShapes(const std::vector<hal::OutputShape>& from,
- std::vector<hal::OutputShape>* to);
+ struct UpdateOutputShapes {
+ // These fields are meaningless unless updateOutputShapes() returns true
+ bool updatedDynamicTemporary; // did shape (dimensions, size) information change for at
+ // least one dynamic temporary?
+ bool mainOutputInsufficient; // is at least one main model output written by this execution
+ // marked !isSufficient?
+ bool zeroSizedInput; // is at least one output of this execution step a zero-sized tensor
+ // that needs to be read by some other step of the same execution?
+ };
+ bool updateOutputShapes(int executionResultCode, const std::vector<hal::OutputShape>& from,
+ std::vector<hal::OutputShape>* to, UpdateOutputShapes* update);
// Map inputs and outputs from ExecutionBuilder to StepExecutor,
// one at a time. Note that these are input/output indexes, not
// operand indexes.
+ //
+ // For mapOutputToInput(), outputDimensions may be nullptr if the input
+ // operand has fully specified dimensions.
void mapInput(uint32_t builderIndex, uint32_t executorIndex) {
mapInputOrOutput(mExecutionBuilder->mInputs[builderIndex], &mInputs[executorIndex]);
}
void mapOutput(uint32_t builderIndex, uint32_t executorIndex) {
mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mOutputs[executorIndex]);
}
- void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex) {
- mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex]);
+ void mapOutputToInput(uint32_t builderIndex, uint32_t executorIndex,
+ const hal::hidl_vec<uint32_t>* outputDimensions) {
+ mapInputOrOutput(mExecutionBuilder->mOutputs[builderIndex], &mInputs[executorIndex],
+ outputDimensions);
}
- // The input or output is assumed to have the size of the
- // corresponding operand.
- int setInputFromMemory(uint32_t inputIndex, const Memory* memory, uint32_t offset) {
+ // If no length is provided, the input or output is assumed to have the length
+ // of the operand. dimensions must either have zero rank or must be
+ // consistent with and at least as well specified as operand dimensions
+ // (i.e., either rank must match, or operand rank must be zero; and for each
+ // individual dimension, either dimension must match, or operand dimension
+ // must be zero).
+ int setInputFromMemory(uint32_t inputIndex, const Memory* memory, uint32_t offset,
+ const hal::hidl_vec<uint32_t>& dimensions = {},
+ std::optional<uint32_t> length = std::nullopt) {
return setInputOrOutputFromMemory(mModel->getInputOperand(inputIndex), memory, offset,
- &mInputs.at(inputIndex));
+ dimensions, length, &mInputs.at(inputIndex));
}
- int setOutputFromMemory(uint32_t outputIndex, const Memory* memory, uint32_t offset) {
+ int setOutputFromMemory(uint32_t outputIndex, const Memory* memory, uint32_t offset,
+ const hal::hidl_vec<uint32_t>& dimensions = {},
+ std::optional<uint32_t> length = std::nullopt) {
return setInputOrOutputFromMemory(mModel->getOutputOperand(outputIndex), memory, offset,
- &mOutputs.at(outputIndex));
+ dimensions, length, &mOutputs.at(outputIndex));
}
// Executes using the (driver, preparedModel) specified at construction time.
@@ -280,12 +312,27 @@ class StepExecutor {
const std::vector<int>& wait_for, uint64_t timeoutDurationAfterFence,
const std::optional<Deadline>& deadline);
+ // Do the dynamic temporaries defined by this step have valid allocations?
+ // (true if there are no dynamic temporaries defined by this step.)
+ bool areDynamicTemporariesAllocated() const;
+
private:
+ // builderDimensions may be nullptr if executorInputOrOutput has fully
+ // specified dimensions.
void mapInputOrOutput(const ModelArgumentInfo& builderInputOrOutput,
- ModelArgumentInfo* executorInputOrOutput);
-
+ ModelArgumentInfo* executorInputOrOutput,
+ const hal::hidl_vec<uint32_t>* builderDimensions = nullptr);
+
+ // If no length is provided, the input or output is assumed to have the length
+ // of the corresponding operand. dimensions must either have zero rank or
+ // must be consistent with and at least as well specified as operand
+ // dimensions (i.e., either rank must match, or operand rank must be zero;
+ // and for each individual dimension, either dimension must match, or
+ // operand dimension must be zero).
int setInputOrOutputFromMemory(const hal::Operand& inputOrOutputOperand, const Memory* memory,
- uint32_t offset, ModelArgumentInfo* inputOrOutputInfo);
+ uint32_t offset, const hal::hidl_vec<uint32_t>& dimensions,
+ std::optional<uint32_t> length,
+ ModelArgumentInfo* inputOrOutputInfo);
std::tuple<int, std::vector<hal::OutputShape>, hal::Timing> computeWithMemories(
const std::optional<Deadline>& deadline, const std::vector<const Memory*>& memories,
@@ -295,7 +342,10 @@ class StepExecutor {
ExecutionBuilder* mExecutionBuilder;
// describes the single execution step
- const ExecutionStep* mExecutionStep = nullptr;
+ const ExecutionStep* mExecutionStep;
+
+ // describes the dynamic temporaries
+ DynamicTemporaries* mDynamicTemporaries;
// model to be executed on the executor, in both original and
// compiled forms; and device on which to execute it
@@ -318,6 +368,8 @@ class StepExecutor {
MemoryTracker mMemories;
};
+std::string toString(StepExecutor::UpdateOutputShapes updateOutputShapes);
+
} // namespace nn
} // namespace android
diff --git a/nn/runtime/ExecutionPlan.cpp b/nn/runtime/ExecutionPlan.cpp
index da0c003f2..c3aa61f92 100644
--- a/nn/runtime/ExecutionPlan.cpp
+++ b/nn/runtime/ExecutionPlan.cpp
@@ -180,8 +180,165 @@ void OperandTracker::markProcessed(uint32_t operationIndex, OperationReadyCallba
}
}
+uint32_t addTemporaryOfSize(uint32_t* totalSizeOfTemporaries, uint32_t size) {
+ // TODO: what about overflow?
+ *totalSizeOfTemporaries += alignBytesNeeded(*totalSizeOfTemporaries, size);
+ const uint32_t offset = *totalSizeOfTemporaries;
+ *totalSizeOfTemporaries += size;
+ return offset;
+};
+
+std::string toString(SourceOperandIndex sourceOperandIndex) {
+ return "(" + std::to_string(sourceOperandIndex.first) + ", " +
+ std::to_string(sourceOperandIndex.second) + ")";
+};
+
+std::string toString(hidl_vec<uint32_t> dimensions) {
+ std::string ret = "(";
+ bool wroteOne = false;
+ for (uint32_t dimension : dimensions) {
+ if (wroteOne) {
+ ret += ", ";
+ } else {
+ wroteOne = true;
+ }
+ ret += std::to_string(dimension);
+ }
+ ret += ")";
+ return ret;
+};
+
} // namespace
+void DynamicTemporaries::vlogDump(const char* context) const {
+ if (empty()) {
+ return;
+ }
+ if (context) {
+ VLOG(EXECUTION) << "DynamicTemporaries: \"" << context << "\"";
+ }
+ for (const auto& temp : mSourceOperandToTemporary) {
+ VLOG(EXECUTION) << "DynamicTemporaries: sourceOperandIndex = " << toString(temp.first)
+ << ", stepIndex = " << temp.second.stepIndex
+ << ", offset = " << temp.second.offset
+ << ", dimensions = " << toString(temp.second.dimensions)
+ << ", length = " << temp.second.length;
+ }
+}
+
+void DynamicTemporaries::declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
+ const hidl_vec<uint32_t>& initialDimensions,
+ uint32_t initialLength) {
+ VLOG(EXECUTION) << "DynamicTemporaries::declare(sourceOperandIndex = "
+ << toString(sourceOperandIndex) << ", stepIndex = " << stepIndex
+ << ", initialDimensions = " << toString(initialDimensions)
+ << ", initialLength = " << initialLength << ")";
+ CHECK(!mDeclared);
+ CHECK_GT(initialLength, 0u);
+ auto [_, isNew] = mSourceOperandToTemporary.emplace(
+ sourceOperandIndex,
+ InternalLocationAndShape{stepIndex, 0, initialDimensions, initialLength});
+ CHECK(isNew);
+ mStepIndexToSourceOperandIndexes[stepIndex].emplace_back(sourceOperandIndex);
+}
+
+bool DynamicTemporaries::redeclare(SourceOperandIndex sourceOperandIndex,
+ const hidl_vec<uint32_t>& newDimensions, uint32_t newLength) {
+ auto createAndLogResult = [sourceOperandIndex, &newDimensions, newLength](bool changedShape) {
+ VLOG(EXECUTION) << "DynamicTemporaries::redeclare(sourceOperandIndex = "
+ << toString(sourceOperandIndex)
+ << ", newDimensions = " << toString(newDimensions)
+ << ", newLength = " << newLength << ") -> " << toString(changedShape);
+ return changedShape;
+ };
+
+ CHECK(mDeclared);
+ CHECK_GT(newLength, 0u);
+
+ InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
+ if (temp.length == newLength && temp.dimensions == newDimensions) {
+ return createAndLogResult(false);
+ }
+ if (temp.length < newLength) {
+ // Otherwise allocation remains valid, even if it may be suboptimal
+ // (because it uses more space than needed). Use case: Don't force
+ // client to allocate again just because the client reported more
+ // accurate shape information.
+ mAllocatedStepIndexes.erase(temp.stepIndex);
+ }
+ temp.length = newLength;
+ temp.dimensions = newDimensions;
+ return createAndLogResult(true);
+}
+
+int DynamicTemporaries::allocate(uint32_t stepIndex) {
+ VLOG(EXECUTION) << "DynamicTemporaries::allocate(stepIndex = " << stepIndex << ")";
+
+ CHECK(mDeclared);
+
+ const auto sourceOperandIndexesI = mStepIndexToSourceOperandIndexes.find(stepIndex);
+ if (sourceOperandIndexesI == mStepIndexToSourceOperandIndexes.end()) {
+ return ANEURALNETWORKS_NO_ERROR;
+ }
+
+ // perform layout
+ uint32_t newSize = 0;
+ for (const auto sourceOperandIndex : sourceOperandIndexesI->second) {
+ InternalLocationAndShape& temp = mSourceOperandToTemporary.at(sourceOperandIndex);
+ temp.offset = addTemporaryOfSize(&newSize, temp.length);
+ }
+
+ // perform (re-)allocation
+ // TODO: Today we may shrink the allocation in order to avoid wasting memory. Is this important
+ // to conserve memory, or do we waste time reallocating?
+ const double kWaste = 0.2 /* arbitrary */; // Willing to waste space to avoid
+ // deallocation/reallocation overhead
+ auto& memory = mStepIndexToMemory[stepIndex];
+ const uint32_t oldSize = (memory ? memory->getSize() : 0);
+ if ((oldSize >= newSize) && (oldSize <= newSize * (1 + kWaste))) {
+ // Suitable allocation already exists; nothing to do
+ } else {
+ int n;
+ std::tie(n, memory) = MemoryAshmem::create(newSize);
+ if (n != ANEURALNETWORKS_NO_ERROR) {
+ LOG(ERROR) << "Failed to allocate dynamic temporaries of size " << newSize
+ << " for step " << stepIndex;
+ mAllocatedStepIndexes.erase(stepIndex);
+ return n;
+ }
+ }
+
+ mAllocatedStepIndexes.insert(stepIndex);
+ return ANEURALNETWORKS_NO_ERROR;
+}
+
+bool DynamicTemporaries::allocated(uint32_t stepIndex) const {
+ return (mStepIndexToSourceOperandIndexes.find(stepIndex) ==
+ mStepIndexToSourceOperandIndexes.end()) ||
+ mAllocatedStepIndexes.count(stepIndex);
+}
+
+std::optional<DynamicTemporaries::LocationAndShape> DynamicTemporaries::lookup(
+ SourceOperandIndex sourceOperandIndex, bool mustBeAllocated) const {
+ CHECK(mDeclared);
+ if (auto it = mSourceOperandToTemporary.find(sourceOperandIndex);
+ it != mSourceOperandToTemporary.end()) {
+ const InternalLocationAndShape& temp = it->second;
+ const bool isAllocated = allocated(temp.stepIndex);
+ if (mustBeAllocated) {
+ CHECK(isAllocated) << "Source operand " << toString(sourceOperandIndex)
+ << " must be allocated";
+ }
+ if (isAllocated) {
+ return LocationAndShape{mStepIndexToMemory.at(temp.stepIndex).get(), temp.offset,
+ &temp.dimensions, temp.length};
+ } else {
+ return LocationAndShape{nullptr, ~uint32_t(0), &temp.dimensions, temp.length};
+ }
+ }
+ return std::nullopt;
+}
+
ExecutionStep::ExecutionStep(ExecutionPlan* plan, uint32_t stepIndex, uint32_t sourceModelIndex,
std::shared_ptr<Device> device)
: mPlan(plan),
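Worked example of the layout performed by addTemporaryOfSize() during
DynamicTemporaries::allocate() (the padding shown assumes alignBytesNeeded()
aligns these lengths to 4 bytes):

    // uint32_t total = 0;
    // addTemporaryOfSize(&total, 2)  -> offset 0, total 2
    // addTemporaryOfSize(&total, 4)  -> offset 4, total 8   (2 bytes pad)
    // addTemporaryOfSize(&total, 8)  -> offset 8, total 16
    //
    // allocate(stepIndex) then backs all of the step's dynamic temporaries
    // with one MemoryAshmem region of `total` bytes, and reuses an existing
    // region when oldSize >= newSize && oldSize <= newSize * 1.2 (kWaste).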
@@ -283,6 +440,10 @@ int ExecutionStep::addOperand(uint32_t sourceOperandIndex, uint32_t* stepOperand
// The first time we've seen this operand is as an
// output.
mModelOutputs.emplace_back(sourceOperandIndex, *stepOperandIndex);
+ // It may be an input to a different partition, so keep track of
+ // it.
+ mPlan->recordOutputDef(SourceOperandIndex(mSourceModelIndex, sourceOperandIndex),
+ mIndex);
}
} break;
case OperandLifeTime::SUBGRAPH: {
@@ -336,8 +497,10 @@ int ExecutionStep::addOperation(int operationIndex) {
}
void ExecutionStep::mapInputsAndOutputs(
- std::shared_ptr<StepExecutor> executor, const Memory* temporaryMemory,
+ std::shared_ptr<StepExecutor> executor,
+ const std::vector<hal::OutputShape>* mainModelOutputShapes, const Memory* temporaryMemory,
const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary,
+ const DynamicTemporaries& dynamicTemporaries,
const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
const std::map<SourceOperandIndex, ConstantReferenceLocation>&
@@ -347,12 +510,18 @@ void ExecutionStep::mapInputsAndOutputs(
if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
it != sourceOperandToOffsetOfTemporary.end()) {
executor->setInputFromMemory(stepInputIndex, temporaryMemory, it->second);
+ } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
+ executor->setInputFromMemory(stepInputIndex, loc->memory, loc->offset, *loc->dimensions,
+ loc->length);
} else if (auto it = sourceOperandToInputIndex.find(sourceOperandIndex);
it != sourceOperandToInputIndex.end()) {
executor->mapInput(it->second, stepInputIndex);
} else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
it != sourceOperandToOutputIndex.end()) {
- executor->mapOutputToInput(it->second, stepInputIndex);
+ executor->mapOutputToInput(it->second, stepInputIndex,
+ mainModelOutputShapes
+ ? &mainModelOutputShapes->at(it->second).dimensions
+ : nullptr);
} else if (auto it = sourceOperandToConstantReference.find(sourceOperandIndex);
it != sourceOperandToConstantReference.end()) {
// Constant partition boundary operand. This could be an IF branch
@@ -368,6 +537,9 @@ void ExecutionStep::mapInputsAndOutputs(
if (auto it = sourceOperandToOffsetOfTemporary.find(sourceOperandIndex);
it != sourceOperandToOffsetOfTemporary.end()) {
executor->setOutputFromMemory(stepOutputIndex, temporaryMemory, it->second);
+ } else if (auto loc = dynamicTemporaries.lookup(sourceOperandIndex); loc != std::nullopt) {
+ executor->setOutputFromMemory(stepOutputIndex, loc->memory, loc->offset,
+ *loc->dimensions, loc->length);
} else if (auto it = sourceOperandToOutputIndex.find(sourceOperandIndex);
it != sourceOperandToOutputIndex.end()) {
executor->mapOutput(it->second, stepOutputIndex);
@@ -384,6 +556,32 @@ void ExecutionStep::mapInputsAndOutputs(
}
}
+void ExecutionPlan::CompoundBody::findModelOutputsThatAreDownstreamInputs() {
+ auto declareModelOutputIsDownstreamInput =
+ [this](const SourceOperandIndex& sourceOperandIndex) {
+ const auto it = mOutputToDefiningExecutionStep.find(sourceOperandIndex);
+ CHECK(it != mOutputToDefiningExecutionStep.end());
+ uint32_t stepIndex = it->second;
+ CHECK_LT(stepIndex, mSteps.size());
+ VLOG(COMPILATION)
+ << "ExecutionStep(" << stepIndex
+ << ")->declareModelOutputIsDownstreamInput(mSourceOperandToOutputIndex.at"
+ << toString(sourceOperandIndex) << ")";
+ CHECK(mSourceOperandToOutputIndex.find(sourceOperandIndex) !=
+ mSourceOperandToOutputIndex.end());
+ mSteps[stepIndex]->executionStep()->declareModelOutputIsDownstreamInput(
+ mSourceOperandToOutputIndex.at(sourceOperandIndex));
+ };
+ for (const auto& logicalStep : mSteps) {
+ if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
+ for (const auto& output : step->getOutputsAsStepModelInputs()) {
+ SourceOperandIndex sourceOperandIndex(step->getSourceModelIndex(), output.first);
+ declareModelOutputIsDownstreamInput(sourceOperandIndex);
+ }
+ }
+ }
+}
+
void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
auto recordAsOutputIfTemporary = [this](const SourceOperandIndex& sourceOperandIndex) {
const auto it = mTemporaryToDefiningExecutionStep.find(sourceOperandIndex);
@@ -418,6 +616,17 @@ void ExecutionPlan::CompoundBody::findTempsAsStepModelOutputs() {
}
}
+void ExecutionStep::declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex) {
+ VLOG(COMPILATION) << "ExecutionStep(" << mIndex << ")::declareModelOutputIsDownstreamInput("
+ << mainModelOutputIndex << ")";
+ const auto it = std::find(mOutputIndexStepModelToMainModel.begin(),
+ mOutputIndexStepModelToMainModel.end(), mainModelOutputIndex);
+ CHECK(it != mOutputIndexStepModelToMainModel.end());
+ const uint32_t stepModelOutputIndex = it - mOutputIndexStepModelToMainModel.begin();
+ CHECK(stepModelOutputIndex < mModelOutputs.size());
+ mModelOutputsThatAreDownstreamInputs.insert(stepModelOutputIndex);
+}
+
void ExecutionStep::recordTempAsStepModelOutput(uint32_t stepOperandIndex) {
const auto it = mOperandMap.find(stepOperandIndex);
CHECK(it != mOperandMap.end());
@@ -610,7 +819,8 @@ void LogicalStep::dump() const {
int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
int32_t executionPreference, int32_t priority,
- const std::optional<Deadline>& deadline) {
+ const std::optional<Deadline>& deadline,
+ int simulateFailureResultCode) {
CHECK(!mSuccessfulFinish);
CHECK(!deadline.has_value());
const ModelBuilder* mainModel = sourceModels->getModel(kMainModelInSourceModels);
@@ -629,8 +839,8 @@ int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
findTempsAsStepModelOutputs();
for (const auto& logicalStep : mSteps) {
if (ExecutionStep* step = logicalStep->tryExecutionStep()) {
- int n = step->finishStepModel(mainModel, &mHasStepModelOutputOfUnknownSize,
- executionPreference, priority);
+ int n = step->finishStepModel(mainModel, &mHasDynamicTemporaries, executionPreference,
+ priority);
if (n != ANEURALNETWORKS_NO_ERROR) {
VLOG(COMPILATION)
<< "ExecutionPlan::CompoundBody::finish -- finishStepModel failed";
@@ -657,10 +867,11 @@ int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
CHECK(logicalStep->isGoto());
}
}
- if (mHasStepModelOutputOfUnknownSize) {
- VLOG(COMPILATION)
- << "ExecutionPlan::CompoundBody::finish -- mHasStepModelOutputOfUnknownSize";
- return ANEURALNETWORKS_OP_FAILED;
+
+ if (simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
+ VLOG(COMPILATION) << "ExecutionPlan::CompoundeBody::finish: simulating failure, ResultCode "
+ << simulateFailureResultCode;
+ return simulateFailureResultCode;
}
for (uint32_t i = 0, n = mainModel->inputCount(); i < n; ++i) {
@@ -673,6 +884,7 @@ int ExecutionPlan::CompoundBody::finish(const SourceModels* sourceModels,
}
findControlFlowBoundaryConstants(sourceModels);
+ findModelOutputsThatAreDownstreamInputs();
mSuccessfulFinish = true;
return ANEURALNETWORKS_NO_ERROR;
@@ -713,25 +925,32 @@ void ExecutionPlan::CompoundBody::findControlFlowBoundaryConstants(
}
int ExecutionPlan::SimpleBody::finish(const SourceModels*, int32_t executionPreference,
- int32_t priority, const std::optional<Deadline>& deadline) {
+ int32_t priority, const std::optional<Deadline>& deadline,
+ int simulateFailureResultCode) {
CHECK(!mSuccessfulFinish);
CHECK(mDevice != nullptr);
VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish, compilation";
- const int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir,
- &mToken, &mPreparedModel);
+ int n = compile(*mDevice, *mModel, executionPreference, priority, deadline, *mCacheDir, &mToken,
+ &mPreparedModel);
+ if (n == ANEURALNETWORKS_NO_ERROR && simulateFailureResultCode != ANEURALNETWORKS_NO_ERROR) {
+ VLOG(COMPILATION) << "ExecutionPlan::SimpleBody::finish: simulating failure, ResultCode "
+ << simulateFailureResultCode;
+ n = simulateFailureResultCode;
+ }
mSuccessfulFinish = (n == ANEURALNETWORKS_NO_ERROR);
return n;
}
int ExecutionPlan::finish(int32_t executionPreference, int32_t priority,
- const std::optional<Deadline>& deadline) {
+ const std::optional<Deadline>& deadline, int simulateFailureResultCode) {
CHECK(mBody != nullptr);
- return mBody->finish(&getSourceModels(), executionPreference, priority, deadline);
+ return mBody->finish(&getSourceModels(), executionPreference, priority, deadline,
+ simulateFailureResultCode);
}
ExecutionPlan::Controller::Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
const BurstBuilder* burstBuilder)
- : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}) {}
+ : Controller(plan, executionBuilder, burstBuilder, 0, {}, {}, {}, {}, {}, {}, {}) {}
ExecutionPlan::Controller::Controller(
const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
@@ -741,7 +960,8 @@ ExecutionPlan::Controller::Controller(
std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
const std::map<SourceOperandIndex, ConstantCopyLocation>& sourceOperandToConstantCopy,
- std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference)
+ std::map<SourceOperandIndex, ConstantReferenceLocation> sourceOperandToConstantReference,
+ DynamicTemporaries dynamicTemporaries)
: mPlan(plan),
mExecutionBuilder(executionBuilder),
mBurstBuilder(burstBuilder),
@@ -750,6 +970,7 @@ ExecutionPlan::Controller::Controller(
mSourceOperandToInputIndex(std::move(sourceOperandToInputIndex)),
mSourceOperandToOutputIndex(std::move(sourceOperandToOutputIndex)),
mSourceOperandToConstantReference(std::move(sourceOperandToConstantReference)),
+ mDynamicTemporaries(std::move(dynamicTemporaries)),
mNextStepIndex(0),
mFallbackNextStepIndex(kBadStepIndex),
mLastStepSyncFd(-1) {
@@ -823,7 +1044,7 @@ std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
return std::shared_ptr<Controller>(new Controller(this, executionBuilder, burstBuilder));
}
// Create the layout for a Memory object big enough to hold
- // - every partition boundary TEMPORARY operand and
+ // - every partition boundary TEMPORARY operand that is not a dynamic temporary, and
// - buffers required by the control flow implementation.
//
// TODO: Rethink this approach for managing temporaries. Some
@@ -844,21 +1065,17 @@ std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
// what our Memory objects represent.
//
uint32_t totalSizeOfTemporaries = 0;
- auto addTemporaryOfSize = [&totalSizeOfTemporaries](uint32_t size) {
- totalSizeOfTemporaries += alignBytesNeeded(totalSizeOfTemporaries, size);
- const uint32_t offset = totalSizeOfTemporaries;
- totalSizeOfTemporaries += size;
- return offset;
- };
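+ // The removed lambda above is replaced by a free function that takes the
+ // running total by pointer. A sketch of its presumed behavior, mirroring the
+ // lambda's align-then-bump logic (the actual definition is not shown in this
+ // hunk):
+ //
+ //   uint32_t addTemporaryOfSize(uint32_t* totalSizeOfTemporaries, uint32_t size) {
+ //       *totalSizeOfTemporaries += alignBytesNeeded(*totalSizeOfTemporaries, size);
+ //       const uint32_t offset = *totalSizeOfTemporaries;
+ //       *totalSizeOfTemporaries += size;
+ //       return offset;
+ //   }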
// This function has two modes of operation:
// 1. When lifetime is TEMPORARY_VARIABLE, we allocate memory for
- // TEMPORARY_VARIABLE source operands, skip SUBGRAPH_OUTPUT source
- // operands, and panic if we see a source operand of another lifetime.
+ // TEMPORARY_VARIABLE source operands that are not dynamic temporaries,
+ // skip TEMPORARY_VARIABLE source operands that are dynamic temporaries,
+ // skip SUBGRAPH_OUTPUT source operands, and panic if we see a source
+ // operand of another lifetime.
// 2. When lifetime is SUBGRAPH_OUTPUT, we allocate memory for
// SUBGRAPH_OUTPUT source operands and panic if we see a source operand
// of another lifetime.
auto mapTemporary =
- [executionBuilder, addTemporaryOfSize](
+ [executionBuilder, &totalSizeOfTemporaries](
const SourceOperandIndex& sourceOperandIndex,
std::map<SourceOperandIndex, uint32_t>* sourceOperandToOffsetOfTemporary,
OperandLifeTime lifetime = OperandLifeTime::TEMPORARY_VARIABLE) {
@@ -873,13 +1090,19 @@ std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
}
CHECK(sourceOperand.lifetime == lifetime);
const uint32_t size = TypeManager::get()->getSizeOfData(sourceOperand);
- CHECK_NE(size, 0u);
- const uint32_t offset = addTemporaryOfSize(size);
- auto [_, isNew] =
- sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset);
- CHECK(isNew);
- VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
- << " offset = " << offset;
+ if (size != 0u) {
+ const uint32_t offset = addTemporaryOfSize(&totalSizeOfTemporaries, size);
+ auto [_, isNew] =
+ sourceOperandToOffsetOfTemporary->emplace(sourceOperandIndex, offset);
+ CHECK(isNew);
+ VLOG(EXECUTION) << "temp: operand " << toString(sourceOperandIndex)
+ << " offset = " << offset;
+ } else {
+ // Unknown size, hence dynamic temporary. The mapping will
+ // be established elsewhere (DynamicTemporaries::allocate()).
+ CHECK(lifetime == OperandLifeTime::TEMPORARY_VARIABLE);
+ CHECK(sourceOperand.lifetime == OperandLifeTime::TEMPORARY_VARIABLE);
+ }
};
std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary;
std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2;
@@ -963,24 +1186,46 @@ std::shared_ptr<ExecutionPlan::Controller> ExecutionPlan::makeController(
// Allocate temporary memory for boundary CONSTANT_COPY operands.
for (const auto& [sourceOperandIndex, location] :
compound()->mSourceOperandToBoundaryConstantCopy) {
- const uint32_t offset = addTemporaryOfSize(location.length);
+ const uint32_t offset = addTemporaryOfSize(&totalSizeOfTemporaries, location.length);
sourceOperandToOffsetOfTemporary.emplace(sourceOperandIndex, offset);
VLOG(EXECUTION) << "temp (boundary constant): operand " << toString(sourceOperandIndex)
<< " offset = " << offset;
}
+ // Collect dynamic temporaries.
+ // TODO(b/157236079): Move some or all of this work to compilation time?
+ DynamicTemporaries dynamicTemporaries;
+ const TypeManager* typeManager = TypeManager::get();
+ forEachDynamicTemporary([typeManager, &dynamicTemporaries](
+ SourceOperandIndex sourceOperandIndex,
+ const Operand& sourceOperand, uint32_t definingStepIndex) {
+ CHECK(typeManager->isTensorType(sourceOperand.type));
+ // TODO: For now we guess an initial size equal to element
+ // size, which is overly conservative.
+ const uint32_t size = typeManager->getSizeOfData(sourceOperand.type, {1});
+ dynamicTemporaries.declare(sourceOperandIndex, definingStepIndex, sourceOperand.dimensions,
+ size);
+ });
+ dynamicTemporaries.endDeclarations();
+ dynamicTemporaries.vlogDump("finished declarations");
+
return std::shared_ptr<Controller>(new Controller(
this, executionBuilder, burstBuilder, totalSizeOfTemporaries,
std::move(sourceOperandToOffsetOfTemporary),
std::move(sourceOperandToOffsetOfTemporary2), compound()->mSourceOperandToInputIndex,
compound()->mSourceOperandToOutputIndex,
compound()->mSourceOperandToBoundaryConstantCopy,
- compound()->mSourceOperandToBoundaryConstantReference));
+ compound()->mSourceOperandToBoundaryConstantReference, std::move(dynamicTemporaries)));
}
// TODO: Find a better way to provide this functionality.
int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
- std::shared_ptr<StepExecutor>* executor) const {
+ std::shared_ptr<StepExecutor>* executor,
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<OutputShape>* mainModelOutputShapes) const {
*executor = nullptr;
+ if (burstController != nullptr) {
+ *burstController = nullptr;
+ }
VLOG(EXECUTION) << "ExecutionPlan::fallback(" << SHOW_IF_DEBUG(controller << ", " << executor)
<< "): mFallbackNextStepIndex = " << controller->mFallbackNextStepIndex;
@@ -996,7 +1241,7 @@ int ExecutionPlan::fallback(std::shared_ptr<Controller> controller,
}
controller->mNextStepIndex = controller->mFallbackNextStepIndex;
- return next(controller, executor);
+ return next(controller, executor, burstController, mainModelOutputShapes);
}
ExecutionPlan::Buffer::Buffer(void* pointer, uint32_t size)
@@ -1092,6 +1337,7 @@ int ExecutionPlan::readConditionValue(std::shared_ptr<Controller> controller,
int ExecutionPlan::next(std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<OutputShape>* mainModelOutputShapes,
int syncFdOfLastStep) const {
controller->mLastStepSyncFd = syncFdOfLastStep;
*executor = nullptr;
@@ -1133,12 +1379,13 @@ int ExecutionPlan::next(std::shared_ptr<Controller> controller,
return ANEURALNETWORKS_NO_ERROR;
}
- return nextCompound(controller, executor, burstController);
+ return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
- std::shared_ptr<ExecutionBurstController>* burstController) const {
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<OutputShape>* mainModelOutputShapes) const {
if (controller->mNextStepIndex == Controller::kBadStepIndex) {
return ANEURALNETWORKS_OP_FAILED;
}
@@ -1151,13 +1398,13 @@ int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
const auto& logicalStep = compoundBody->mSteps[controller->mNextStepIndex];
if (const IfStep* step = logicalStep->tryIfStep()) {
- return nextCompound(step, controller, executor, burstController);
+ return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
} else if (const WhileStep* step = logicalStep->tryWhileStep()) {
- return nextCompound(step, controller, executor, burstController);
+ return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
} else if (const GotoStep* step = logicalStep->tryGotoStep()) {
- return nextCompound(step, controller, executor, burstController);
+ return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
} else if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
- return nextCompound(step, controller, executor, burstController);
+ return nextCompound(step, controller, executor, burstController, mainModelOutputShapes);
} else {
CHECK(false) << "Unknown step variant";
return ANEURALNETWORKS_BAD_STATE;
@@ -1166,16 +1413,23 @@ int ExecutionPlan::nextCompound(std::shared_ptr<Controller> controller,
int ExecutionPlan::nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
- std::shared_ptr<ExecutionBurstController>* burstController) const {
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<OutputShape>* mainModelOutputShapes) const {
VLOG(EXECUTION) << "next: Step#" << controller->mNextStepIndex << ": execute on "
<< step->getDevice()->getName();
- *executor =
- std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
- step->getDevice(), step->getPreparedStepModel(), step);
+
+ NN_RETURN_IF_ERROR(controller->mDynamicTemporaries.allocate(step->getIndex()));
+ controller->mDynamicTemporaries.vlogDump("finished allocating for a step");
+
+ *executor = std::make_shared<StepExecutor>(controller->mExecutionBuilder, step->getStepModel(),
+ step->getDevice(), step->getPreparedStepModel(),
+ step, &controller->mDynamicTemporaries);
+
step->mapInputsAndOutputs(
- *executor, controller->mTemporaries.get(),
- controller->mSourceOperandToOffsetOfTemporary, controller->mSourceOperandToInputIndex,
- controller->mSourceOperandToOutputIndex, controller->mSourceOperandToConstantReference);
+ *executor, mainModelOutputShapes, controller->mTemporaries.get(),
+ controller->mSourceOperandToOffsetOfTemporary, controller->mDynamicTemporaries,
+ controller->mSourceOperandToInputIndex, controller->mSourceOperandToOutputIndex,
+ controller->mSourceOperandToConstantReference);
if (burstController != nullptr && controller->mBurstBuilder != nullptr) {
*burstController = controller->mBurstBuilder->getControllerAt(controller->mNextStepIndex);
}
@@ -1259,7 +1513,8 @@ int ExecutionPlan::Controller::waitForLastStepSyncFence() const {
int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
- std::shared_ptr<ExecutionBurstController>* burstController) const {
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<OutputShape>* mainModelOutputShapes) const {
VLOG(EXECUTION) << "next: " << toString(*step);
// If the last step has a sync fence, wait for it to signal before reading the condition value.
// This is safe because the steps are serialized when doing fenced compute.
@@ -1293,12 +1548,13 @@ int ExecutionPlan::nextCompound(const IfStep* step, std::shared_ptr<Controller>
// step->outerOutputOperands[i] to implement double buffering.
controller->setOutput(step->outerOutputOperands[i], branchOutputOperands[i]);
}
- return nextCompound(controller, executor, burstController);
+ return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
- std::shared_ptr<ExecutionBurstController>* burstController) const {
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<OutputShape>* mainModelOutputShapes) const {
WhileState& state = controller->mWhileState[controller->mNextStepIndex];
if (state.stage == WhileState::EVALUATE_CONDITION) {
state.iteration = state.iteration == WhileState::kOutsideLoop ? 0 : state.iteration + 1;
@@ -1326,7 +1582,7 @@ int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controlle
}
state.stage = WhileState::EVALUATE_BODY;
- return nextCompound(controller, executor, burstController);
+ return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
CHECK(state.stage == WhileState::EVALUATE_BODY);
@@ -1414,15 +1670,16 @@ int ExecutionPlan::nextCompound(const WhileStep* step, std::shared_ptr<Controlle
}
state.stage = WhileState::EVALUATE_CONDITION;
- return nextCompound(controller, executor, burstController);
+ return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
int ExecutionPlan::nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
- std::shared_ptr<ExecutionBurstController>* burstController) const {
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<OutputShape>* mainModelOutputShapes) const {
VLOG(EXECUTION) << "next: " << toString(*step);
controller->mNextStepIndex = step->gotoStepIndex;
- return nextCompound(controller, executor, burstController);
+ return nextCompound(controller, executor, burstController, mainModelOutputShapes);
}
void ExecutionPlan::becomeCompoundIfEmpty() {
@@ -1473,6 +1730,13 @@ void ExecutionPlan::becomeSingleStep(const std::shared_ptr<Device> device,
mState = SIMPLE;
}
+void ExecutionPlan::recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
+ auto [it, isNew] =
+ compound()->mOutputToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
+ CHECK(isNew) << "Step " << stepIndex << " redefines output operand "
+ << toString(sourceOperandIndex) << " already defined by step " << it->second;
+}
+
void ExecutionPlan::recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex) {
auto [it, isNew] =
compound()->mTemporaryToDefiningExecutionStep.emplace(sourceOperandIndex, stepIndex);
@@ -1524,8 +1788,17 @@ const std::vector<std::shared_ptr<LogicalStep>>& ExecutionPlan::forTest_compound
return compound()->mSteps;
}
-bool ExecutionPlan::forTest_hasStepModelOutputsOfUnknownSize() const {
- return mBody->hasStepModelOutputsOfUnknownSize();
+std::set<uint32_t> ExecutionPlan::forTest_flatGetDynamicTemporaries() const {
+ CHECK_EQ(getSourceModels().size(), size_t(1));
+ std::set<uint32_t> ret;
+ forEachDynamicTemporary([&ret](SourceOperandIndex dynTemp, const Operand&, uint32_t) {
+ ret.insert(dynTemp.second);
+ });
+ return ret;
+}
+
+bool ExecutionPlan::hasDynamicTemporaries() const {
+ return mBody->hasDynamicTemporaries();
}
const uint8_t* ExecutionPlan::forTest_simpleGetCacheToken() const {
@@ -1600,14 +1873,36 @@ void ExecutionPlan::CompoundBody::forEachStepRoleOfOutput(uint32_t index,
}
}
+void ExecutionPlan::forEachDynamicTemporary(
+ const std::function<void(SourceOperandIndex, const Operand&, uint32_t definingStepIndex)>&
+ fn) const {
+ if (mState != COMPOUND) {
+ return;
+ }
+
+ for (const auto& logicalStep : compound()->mSteps) {
+ if (const ExecutionStep* step = logicalStep->tryExecutionStep()) {
+ const uint32_t stepIndex = step->getIndex();
+ const uint32_t sourceModelIndex = step->getSourceModelIndex();
+ for (const auto& entry : step->getTempsAsStepModelOutputs()) {
+ const auto sourceOperandIndex = SourceOperandIndex(sourceModelIndex, entry.first);
+ const auto& sourceOperand = getSourceOperand(sourceOperandIndex);
+ if (hasUnknownSize(sourceOperand)) {
+ fn(sourceOperandIndex, sourceOperand, stepIndex);
+ }
+ }
+ }
+ }
+}
+
int ModelBuilder::partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices,
uint32_t preference, uint32_t priority,
- const std::optional<Deadline>& deadline,
- ExecutionPlan* plan) const {
+ const std::optional<Deadline>& deadline, ExecutionPlan* plan,
+ int simulateFailureResultCode) const {
uint32_t sourceModelIndex = plan->getSourceModels().addModel(this);
NN_RETURN_IF_ERROR(partitionTheWorkInternal(sourceModelIndex, devices, preference, priority,
deadline, plan));
- int n = plan->finish(preference, priority, deadline);
+ int n = plan->finish(preference, priority, deadline, simulateFailureResultCode);
if (VLOG_IS_ON(COMPILATION)) {
VLOG(COMPILATION) << "ModelBuilder::partitionTheWork: source model: ";
logModelToInfo(makeHidlModel());
@@ -1668,12 +1963,24 @@ int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex,
// (see LogicalStep).
std::vector<std::queue<uint32_t>> perDeviceQueue(deviceCount + 1);
+ // This helper function produces a device name.
+ auto deviceName = [&devices, kControlFlowInterpreter,
+ deviceCount](int deviceIndex) -> std::string {
+ if (deviceIndex == kControlFlowInterpreter) {
+ return "NNAPI";
+ } else if (deviceIndex < 0 || size_t(deviceIndex) >= deviceCount) {
+ return "{unknown}";
+ } else {
+ return devices.at(deviceIndex)->getName();
+ }
+ };
+
// This helper function enqueues the operation on the appropriate queue.
auto enqueueOnAppropriateDevice = [&](uint32_t operationIndex) {
int deviceIndex = bestDeviceForOperation[operationIndex];
perDeviceQueue[deviceIndex].push(operationIndex);
VLOG(COMPILATION) << "enqueueOnAppropriateDevice " << operationIndex << " onto "
- << deviceIndex;
+ << deviceIndex << " (" << deviceName(deviceIndex) << ")";
};
// This helper function finds a device that has operations ready to process.
@@ -1692,11 +1999,14 @@ int ModelBuilder::partitionTheWorkInternal(uint32_t sourceModelIndex,
};
OperandTracker tracker(this, enqueueOnAppropriateDevice);
- // For each iteration of this loop, we'll create an execution step.
+ // For each iteration of this loop, we'll create either an execution step or
+ // an interpreted control flow construct (including nested execution steps
+ // and interpreted control flow constructs).
while (true) {
// Find the device we'll do this step for.
int deviceIndex = findNextDeviceToProcess();
- VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex;
+ VLOG(COMPILATION) << "findNextDeviceToProcess: " << deviceIndex << " ("
+ << deviceName(deviceIndex) << ")";
if (deviceIndex < 0) {
break;
}
@@ -2050,13 +2360,14 @@ int ModelBuilder::findBestDeviceForEachOperation(
const int kControlFlowInterpreter = deviceCount;
(*bestDeviceForOperation)[operationIndex] = kControlFlowInterpreter;
VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
- << toString(operation.type) << ") = -1"
+ << toString(operation.type) << ":" << operationIndex << ") = -1"
<< " (NNAPI)";
} else {
(*bestDeviceForOperation)[operationIndex] = bestChoice;
VLOG(COMPILATION) << "ModelBuilder::findBestDeviceForEachOperation("
- << toString(operation.type) << ") = " << bestChoice << " ("
- << devices[bestChoice]->getName() << ")";
+ << toString(operation.type) << ":" << operationIndex
+ << ") = " << bestChoice << " (" << devices[bestChoice]->getName()
+ << ")";
}
}
return ANEURALNETWORKS_NO_ERROR;
diff --git a/nn/runtime/ExecutionPlan.h b/nn/runtime/ExecutionPlan.h
index d1e7d9435..740912d8e 100644
--- a/nn/runtime/ExecutionPlan.h
+++ b/nn/runtime/ExecutionPlan.h
@@ -22,7 +22,9 @@
#include <android-base/logging.h>
#include <openssl/sha.h>
+#include <algorithm>
#include <chrono>
+#include <functional>
#include <map>
#include <memory>
#include <ostream>
@@ -80,6 +82,13 @@ struct ConstantReferenceLocation;
// output of a partition. For ExecutionStep, the inputs and outputs of the
// step model are boundary operands; for IfStep and WhileStep, the inputs and
// outputs of the corresponding operation are boundary operands.
+// - A partition boundary static temporary is a partition boundary
+// operand which is of lifetime TEMPORARY_VARIABLE in the source model and
+// whose dimensions are fully specified.
+// - A partition boundary dynamic temporary is a partition boundary
+// operand which is of lifetime TEMPORARY_VARIABLE in the source model and
+// whose dimensions are not fully specified.
+// - A main execution is the execution of a main model.
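+// For example, a boundary TEMPORARY_VARIABLE tensor operand with dimensions
+// {2, 3} is a static temporary, whereas one with dimensions {2, 0} (second
+// dimension unspecified; 0 denotes "unknown" in NNAPI) or with unspecified
+// rank is a dynamic temporary. (Illustrative example only.)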
//
// Referenced models can be sources of partition boundary operands. For example,
// this happens when a referenced model is partitioned into one or more
@@ -105,6 +114,107 @@ class SourceModels {
std::vector<const ModelBuilder*> mModels;
};
+// Represents all partition boundary dynamic temporaries for a particular main
+// execution.
+//
+// Usage pattern:
+// - declare() every partition boundary dynamic temporary.
+// - endDeclarations(). After this point, lookup() is permitted.
+// - Before executing an ExecutionStep, call allocate().
+// - After executing an ExecutionStep, call redeclare() for every partition
+// boundary dynamic temporary for which we've learned or guessed more about
+// the dimensions or length.
+//
+// Each partition boundary dynamic temporary has a location assigned by allocate() for
+// its defining step (see declare() and allocate()). That location remains
+// valid until redeclare() increases the length of some temporary in its defining
+// step or allocate() is called again for its defining step.
+class DynamicTemporaries {
+ DISALLOW_COPY_AND_ASSIGN(DynamicTemporaries);
+
+ public:
+ DynamicTemporaries() = default;
+ DynamicTemporaries(DynamicTemporaries&&) = default;
+ DynamicTemporaries& operator=(DynamicTemporaries&&) = default;
+
+ // Declare a dynamic temporary. stepIndex is the step that defines the
+ // temporary (i.e., in which the temporary appears as an operation output
+ // operand). initialDimensions and initialLength indicate what we know or
+ // (in the case of length) guess about those properties.
+ void declare(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex,
+ const hal::hidl_vec<uint32_t>& initialDimensions, uint32_t initialLength);
+
+ // Indicate that we've finished declaring all dynamic temporaries.
+ void endDeclarations() {
+ CHECK(!mDeclared);
+ mDeclared = true;
+ }
+
+ // Redeclare a dynamic temporary, indicating what we've learned about it.
+ // This may invalidate the location of temporaries defined by its step.
+ // Returns true if dimensions or length changed, false otherwise.
+ bool redeclare(SourceOperandIndex sourceOperandIndex,
+ const hal::hidl_vec<uint32_t>& newDimensions, uint32_t newLength);
+
+ // Ensure that all dynamic temporaries defined by the specified step have
+ // locations. The return value is a ResultCode (e.g.,
+ // ANEURALNETWORKS_NO_ERROR).
+ //
+ // Even if dynamic temporaries have already been allocated for this step,
+ // this call may reallocate them. A reallocation is not guaranteed to
+ // preserve location (LocationAndShape.memory, LocationAndShape.offset) or
+ // contents of temporaries.
+ int allocate(uint32_t stepIndex);
+
+ // Do the dynamic temporaries defined by this step have valid allocations?
+ // (Will be true if there are no dynamic temporaries defined by this step.)
+ bool allocated(uint32_t stepIndex) const;
+
+ // Dump information to VLOG(EXECUTION).
+ void vlogDump(const char* context = nullptr) const;
+
+ // If the specified operand is a dynamic temporary, return location and
+ // shape information; otherwise, return std::nullopt.
+ //
+ // If the temporary exists but does not have a valid allocation, then:
+ // - If mustBeAllocated == true, then trigger a failed CHECK().
+ // - If mustBeAllocated == false, then memory == nullptr and offset == ~0.
+ struct LocationAndShape {
+ const Memory* memory;
+ uint32_t offset;
+ const hal::hidl_vec<uint32_t>* dimensions;
+ uint32_t length;
+ };
+ std::optional<LocationAndShape> lookup(SourceOperandIndex sourceOperandIndex,
+ bool mustBeAllocated = true) const;
+
+ // Returns true if no dynamic temporaries have been declared.
+ bool empty() const { return mSourceOperandToTemporary.empty(); }
+
+ private:
+ // The same as LocationAndShape, except the base of the location is
+ // represented not by memory but by defining stepIndex.
+ struct InternalLocationAndShape {
+ uint32_t stepIndex;
+ uint32_t offset;
+ hal::hidl_vec<uint32_t> dimensions;
+ uint32_t length;
+ };
+ std::map<SourceOperandIndex, InternalLocationAndShape> mSourceOperandToTemporary;
+
+ // Every dynamic temporary defined at a given stepIndex.
+ std::map<uint32_t, std::vector<SourceOperandIndex>> mStepIndexToSourceOperandIndexes;
+
+ std::map<uint32_t, std::unique_ptr<MemoryAshmem>> mStepIndexToMemory;
+
+ // For a given defining stepIndex, we consider either all its dynamic
+ // temporaries to be allocated (have valid locations) or none of them to be.
+ std::set<uint32_t> mAllocatedStepIndexes;
+
+ // Has endDeclarations() been called?
+ bool mDeclared = false;
+};
+
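+// A usage sketch for DynamicTemporaries (illustrative only; the operand and
+// step indexes are placeholders, and the initial length is a guess, as
+// declare() permits):
+//
+//   DynamicTemporaries dynTemps;
+//   dynTemps.declare(sourceOperandIndex, /*stepIndex=*/0,
+//                    /*initialDimensions=*/{0}, /*initialLength=*/4);
+//   dynTemps.endDeclarations();
+//   NN_RETURN_IF_ERROR(dynTemps.allocate(/*stepIndex=*/0));
+//   // ... execute the defining step, learning the actual shape ...
+//   if (dynTemps.redeclare(sourceOperandIndex, /*newDimensions=*/{12},
+//                          /*newLength=*/48)) {
+//       // Step 0's locations may now be stale; reallocate before any
+//       // consuming step runs.
+//       NN_RETURN_IF_ERROR(dynTemps.allocate(/*stepIndex=*/0));
+//   }
+//   if (auto loc = dynTemps.lookup(sourceOperandIndex)) {
+//       // Use loc->memory, loc->offset, *loc->dimensions, loc->length.
+//   }
+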
// An excerpt of a source model to be run by a specific device.
class ExecutionStep {
public:
@@ -137,8 +247,14 @@ class ExecutionStep {
return mOutputsAsStepModelInputsIndexToMainModel;
}
+ const std::set<uint32_t>& getModelOutputsThatAreDownstreamInputs() const {
+ return mModelOutputsThatAreDownstreamInputs;
+ }
+
+ uint32_t getIndex() const { return mIndex; }
uint32_t getSourceModelIndex() const { return mSourceModelIndex; }
+ void declareModelOutputIsDownstreamInput(uint32_t mainModelOutputIndex);
void recordTempAsStepModelOutput(uint32_t stepOperandIndex);
// If this step has a step model output of unknown size, sets
@@ -157,9 +273,16 @@ class ExecutionStep {
//
// This method only reads map entries for which the first element of
// SourceOperandIndex is mSourceModelIndex.
+ //
+ // mainModelOutputShapes may be nullptr if the only main model outputs that are
+ // inputs of this step are of fully specified shape.
void mapInputsAndOutputs(
- std::shared_ptr<StepExecutor> stepExecutor, const Memory* temporaryMemory,
- const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOffsetOfTemporary,
+ std::shared_ptr<StepExecutor> stepExecutor,
+ const std::vector<hal::OutputShape>* mainModelOutputShapes,
+ const Memory* temporaryMemory, // for static temporaries
+ const std::map<SourceOperandIndex, uint32_t>&
+ sourceOperandToOffsetOfTemporary, // for static temporaries
+ const DynamicTemporaries& dynamicTemporaries,
const std::map<SourceOperandIndex, uint32_t>& sourceOperandToInputIndex,
const std::map<SourceOperandIndex, uint32_t>& sourceOperandToOutputIndex,
const std::map<SourceOperandIndex, ConstantReferenceLocation>&
@@ -192,6 +315,7 @@ class ExecutionStep {
// model, the memory should be mapped using
// ExecutionPlan::CompoundBody::mSourceOperandToInputIndex,
// ExecutionPlan::Controller::mSourceOperandToOffsetOfTemporary, or
+ // ExecutionPlan::Controller::mDynamicTemporaries, or
// ExecutionPlan::CompoundBody::mSourceOperandToOutputIndex.
RemapVectorType mStepModelInputs;
// All outputs of this step model:
@@ -199,11 +323,12 @@ class ExecutionStep {
//
// Depending on whether the source operand is an output of the main model,
// the memory should be mapped using
- // ExecutionPlan::CompoundBody::mSourceOperandToOutputIndex or
- // ExecutionPlan::Controller::mSourceOperandToOffsetOfTemporary.
+ // ExecutionPlan::CompoundBody::mSourceOperandToOutputIndex,
+ // ExecutionPlan::Controller::mSourceOperandToOffsetOfTemporary, or
+ // ExecutionPlan::Controller::mDynamicTemporaries.
//
- // mOutputIndexStepModelToMainModel relies on mModelOutputs being a prefix of
- // mStepModelOutputs.
+ // mOutputIndexStepModelToMainModel and declareModelOutputIsDownstreamInput()
+ // rely on mModelOutputs being a prefix of mStepModelOutputs.
RemapVectorType mStepModelOutputs;
// Inputs of main model that are also inputs of this step model:
// (main model operand index, step model operand index)
@@ -247,6 +372,10 @@ class ExecutionStep {
// mOutputsAsStepModelInputs[i].first
std::vector<uint32_t> mOutputsAsStepModelInputsIndexToMainModel;
+ // Step model output indexes (not operand indexes) that are outputs of the
+ // main model used as inputs to some other partition.
+ std::set<uint32_t> mModelOutputsThatAreDownstreamInputs;
+
// The compilation caching token.
TokenHasher mToken;
};
@@ -417,8 +546,8 @@ class ExecutionPlan {
ExecutionPlan() {}
~ExecutionPlan() { delete mBody; }
- // Controller is part of the interface to a mechanism for performing an
- // execution in N steps.
+ // Controller is part of the interface to a mechanism for performing a
+ // main execution in N steps.
//
// The value of N may not be known beforehand if the model contains WHILE
// loops. See LogicalStep.
@@ -445,15 +574,20 @@ class ExecutionPlan {
const BurstBuilder* burstBuilder);
// A constructor for mState == COMPOUND.
Controller(const ExecutionPlan* plan, ExecutionBuilder* executionBuilder,
- const BurstBuilder* burstBuilder, uint32_t totalSizeOfTemporaries,
+ const BurstBuilder* burstBuilder,
+
+ // static temporaries
+ uint32_t totalSizeOfTemporaries,
std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary,
std::map<SourceOperandIndex, uint32_t> sourceOperandToOffsetOfTemporary2,
+
std::map<SourceOperandIndex, uint32_t> sourceOperandToInputIndex,
std::map<SourceOperandIndex, uint32_t> sourceOperandToOutputIndex,
const std::map<SourceOperandIndex, ConstantCopyLocation>&
sourceOperandToConstantCopy,
std::map<SourceOperandIndex, ConstantReferenceLocation>
- sourceOperandToConstantReference);
+ sourceOperandToConstantReference,
+ DynamicTemporaries dynamicTemporaries);
// Sets the location of innerOperand to be the same as the location of outerOperand.
void setInput(const SourceOperandIndex& outerOperand,
@@ -467,7 +601,7 @@ class ExecutionPlan {
// does not generate a sync fence.
int waitForLastStepSyncFence() const;
- const ExecutionPlan* mPlan;
+ [[maybe_unused]] const ExecutionPlan* mPlan;
ExecutionBuilder* mExecutionBuilder;
const BurstBuilder* mBurstBuilder;
// Map from source operand index to an offset into mTemporaries used
@@ -496,7 +630,12 @@ class ExecutionPlan {
// Map from source operand index to a constant reference location.
// Used for WHILE loop operand initializers that are constant references.
std::map<SourceOperandIndex, ConstantReferenceLocation> mSourceOperandToConstantReference;
+
+ // static temporaries
std::unique_ptr<MemoryAshmem> mTemporaries;
+
+ DynamicTemporaries mDynamicTemporaries;
+
// Index of the next step to be processed by ExecutionPlan::next().
size_t mNextStepIndex;
// The value to reset mNextStepIndex to for partial CPU fallback.
@@ -515,14 +654,19 @@ class ExecutionPlan {
// Sets up a new StepExecutor and burstController (if applicable) if there
// is a step to execute. See ExecutionPlan::Controller.
// Handles control flow. See LogicalStep.
+ // burstController is nullptr if we are not to do burst execution.
+ // mainModelOutputShapes may be nullptr if the only main model outputs that are step model
+ // inputs are of fully specified shape.
// syncFdOfLastStep is the sync fence fd generated by the most recently processed step.
int next(std::shared_ptr<Controller> controller, std::shared_ptr<StepExecutor>* executor,
- std::shared_ptr<ExecutionBurstController>* burstController = nullptr,
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<hal::OutputShape>* mainModelOutputShapes,
int syncFdOfLastStep = -1) const;
// Create the same executor as the last one created by next().
- int fallback(std::shared_ptr<Controller> controller,
- std::shared_ptr<StepExecutor>* executor) const;
+ int fallback(std::shared_ptr<Controller> controller, std::shared_ptr<StepExecutor>* executor,
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
ExecutionStep* createNewExecutionStep(uint32_t sourceModelIndex,
const std::shared_ptr<Device> device);
@@ -535,9 +679,11 @@ class ExecutionPlan {
void becomeSingleStep(const std::shared_ptr<Device> device, const ModelBuilder* model);
+ // simulateFailureResultCode == ANEURALNETWORKS_NO_ERROR means behave normally.
int finish(int32_t executionPreference, int32_t priority,
- const std::optional<Deadline>& deadline);
+ const std::optional<Deadline>& deadline, int simulateFailureResultCode);
+ void recordOutputDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex);
void recordTemporaryDef(SourceOperandIndex sourceOperandIndex, uint32_t stepIndex);
void dump() const;
@@ -568,6 +714,8 @@ class ExecutionPlan {
SourceModels& getSourceModels() { return mSourceModels; }
const SourceModels& getSourceModels() const { return mSourceModels; }
+ bool hasDynamicTemporaries() const;
+
// These functions are solely intended for use by unit tests of
// the partitioning algorithm.
enum class Kind {
@@ -579,14 +727,22 @@ class ExecutionPlan {
Kind forTest_getKind() const;
std::shared_ptr<const Device> forTest_simpleGetDevice() const;
const std::vector<std::shared_ptr<LogicalStep>>& forTest_compoundGetSteps() const;
- bool forTest_hasStepModelOutputsOfUnknownSize() const;
+ // The "flat" in the name signifies that this method requires that the
+ // model not contain any control flow operations.
+ std::set<uint32_t> forTest_flatGetDynamicTemporaries() const;
const uint8_t* forTest_simpleGetCacheToken() const;
private:
// Becomes a new COMPOUND step if mState == EMPTY, otherwise does nothing.
// Illegal to call for when mState == SIMPLE.
void becomeCompoundIfEmpty();
- void findTempsAsStepModelOutputs();
+
+ const hal::Operand& getSourceOperand(
+ const std::pair<uint32_t, uint32_t>& sourceOperandIndex) const {
+ return getSourceModels()
+ .getModel(sourceOperandIndex.first)
+ ->getOperand(sourceOperandIndex.second);
+ }
class Buffer {
public:
@@ -613,26 +769,32 @@ class ExecutionPlan {
// Handles control flow. See LogicalStep.
int nextCompound(std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
- std::shared_ptr<ExecutionBurstController>* burstController) const;
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
int nextCompound(const ExecutionStep* step, std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
- std::shared_ptr<ExecutionBurstController>* burstController) const;
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
int nextCompound(const IfStep* step, std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
- std::shared_ptr<ExecutionBurstController>* burstController) const;
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
int nextCompound(const WhileStep* step, std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
- std::shared_ptr<ExecutionBurstController>* burstController) const;
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
int nextCompound(const GotoStep* step, std::shared_ptr<Controller> controller,
std::shared_ptr<StepExecutor>* executor,
- std::shared_ptr<ExecutionBurstController>* burstController) const;
+ std::shared_ptr<ExecutionBurstController>* burstController,
+ const std::vector<hal::OutputShape>* mainModelOutputShapes) const;
struct Body {
virtual ~Body() {}
virtual void dump() const = 0;
virtual int finish(const SourceModels* sourceModels, int32_t executionPreference,
- int32_t priority, const std::optional<Deadline>& deadline) = 0;
- virtual bool hasStepModelOutputsOfUnknownSize() const = 0;
+ int32_t priority, const std::optional<Deadline>& deadline,
+ int simulateFailureResultCode) = 0;
+ virtual bool hasDynamicTemporaries() const = 0;
virtual void forEachStepRoleOfInput(uint32_t index,
const StepRoleCallback& callback) const = 0;
virtual void forEachStepRoleOfOutput(uint32_t index,
@@ -647,8 +809,8 @@ class ExecutionPlan {
void dump() const override;
int finish(const SourceModels* sourceModels, int32_t executionPreference, int32_t priority,
- const std::optional<Deadline>& deadline) override;
- bool hasStepModelOutputsOfUnknownSize() const override { return false; }
+ const std::optional<Deadline>& deadline, int simulateFailureResultCode) override;
+ bool hasDynamicTemporaries() const override { return false; }
void forEachStepRoleOfInput(uint32_t index,
const StepRoleCallback& callback) const override;
void forEachStepRoleOfOutput(uint32_t index,
@@ -665,10 +827,8 @@ class ExecutionPlan {
struct CompoundBody : Body {
void dump() const override;
int finish(const SourceModels* sourceModels, int32_t executionPreference, int32_t priority,
- const std::optional<Deadline>& deadline) override;
- bool hasStepModelOutputsOfUnknownSize() const override {
- return mHasStepModelOutputOfUnknownSize;
- }
+ const std::optional<Deadline>& deadline, int simulateFailureResultCode) override;
+ bool hasDynamicTemporaries() const override { return mHasDynamicTemporaries; }
void forEachStepRoleOfInput(uint32_t index,
const StepRoleCallback& callback) const override;
void forEachStepRoleOfOutput(uint32_t index,
@@ -681,6 +841,12 @@ class ExecutionPlan {
std::vector<std::shared_ptr<LogicalStep>> mSteps;
// Map from source operand index to defining ExecutionStep index.
+ // Used for all (and only) SUBGRAPH_OUTPUTs that are defined by
+ // ExecutionSteps. Those defined by IfSteps and WhileSteps are not in
+ // the map.
+ std::map<SourceOperandIndex, uint32_t> mOutputToDefiningExecutionStep;
+
+ // Map from source operand index to defining ExecutionStep index.
// Used for all (and only) TEMPORARY_VARIABLEs that are defined by
// ExecutionSteps. Those defined by IfSteps and WhileSteps are not in
// the map.
@@ -708,11 +874,13 @@ class ExecutionPlan {
std::map<SourceOperandIndex, ConstantReferenceLocation>
mSourceOperandToBoundaryConstantReference;
- bool mHasStepModelOutputOfUnknownSize = false;
+ bool mHasDynamicTemporaries = false;
private:
void findTempsAsStepModelOutputs();
+ void findModelOutputsThatAreDownstreamInputs();
+
// Constant values that are inputs to IF and WHILE operations and lie on
// a partition boundary ("control flow boundary constants") require
// special treatment. We need to be able to dynamically associate those
@@ -755,9 +923,13 @@ class ExecutionPlan {
return static_cast<const CompoundBody*>(mBody);
}
+ void forEachDynamicTemporary(const std::function<void(SourceOperandIndex, const hal::Operand&,
+ uint32_t definingStepIndex)>&) const;
+
// Pointers to compilation caching information in CompilationBuilder.
const std::string* mCacheDir = nullptr;
const uint8_t* mToken = nullptr;
+
SourceModels mSourceModels;
};
diff --git a/nn/runtime/Manager.cpp b/nn/runtime/Manager.cpp
index 6b80d208d..78d7c36a9 100644
--- a/nn/runtime/Manager.cpp
+++ b/nn/runtime/Manager.cpp
@@ -405,7 +405,7 @@ std::tuple<int, std::vector<OutputShape>, Timing> DriverPreparedModel::execute(
}
if (n != ANEURALNETWORKS_NO_ERROR) {
- VLOG(EXECUTION) << "**Execution failed**";
+ VLOG(EXECUTION) << "**Execution failed** (ResultCode = " << n << ")";
return {n, std::move(outputShapes), timing};
}
diff --git a/nn/runtime/ModelBuilder.h b/nn/runtime/ModelBuilder.h
index 94baab70b..2de68b392 100644
--- a/nn/runtime/ModelBuilder.h
+++ b/nn/runtime/ModelBuilder.h
@@ -126,9 +126,11 @@ class ModelBuilder {
return getReferencedModel(operand.location.offset);
}
+ // simulateFailureResultCode == ANEURALNETWORKS_NO_ERROR means behave normally.
int partitionTheWork(const std::vector<std::shared_ptr<Device>>& devices, uint32_t preference,
uint32_t priority, const std::optional<Deadline>& deadline,
- ExecutionPlan* plan) const;
+ ExecutionPlan* plan,
+ int simulateFailureResultCode = ANEURALNETWORKS_NO_ERROR) const;
private:
// TODO(b/132322449): move partitionTheWork, findBestDeviceForEachOperation,
diff --git a/nn/runtime/NeuralNetworks.cpp b/nn/runtime/NeuralNetworks.cpp
index 5d3dae4e2..f5206c866 100644
--- a/nn/runtime/NeuralNetworks.cpp
+++ b/nn/runtime/NeuralNetworks.cpp
@@ -1543,6 +1543,26 @@ int ANeuralNetworksExecution_startComputeWithDependencies(
waitForList.push_back(syncFenceFd);
}
}
+
+ if (r->getCompilation()->hasDynamicTemporaries()) {
+ // The current implementation of fenced execution does not support
+ // dynamic temporaries. Fall back to non-fenced execution.
+ LOG(INFO) << "ANeuralNetworksExecution_startComputeWithDependencies falling back"
+ << " to ANeuralNetworksExecution_startCompute"
+ << " because of boundary operands of unknown size";
+ for (int syncFenceFd : waitForList) {
+ if (syncFenceFd > 0) {
+ auto w = syncWait(syncFenceFd, -1);
+ if (w != FenceState::SIGNALED) {
+ VLOG(EXECUTION) << "syncWait failed, fd: " << syncFenceFd;
+ *event = nullptr;
+ return ANEURALNETWORKS_OP_FAILED;
+ }
+ }
+ }
+ return ANeuralNetworksExecution_startCompute(execution, event);
+ }
+
int syncFenceToSignal = -1;
int n = r->computeFenced(waitForList, duration, &syncFenceToSignal);
std::unique_ptr<SyncFenceEvent> e =
diff --git a/nn/runtime/VersionedInterfaces.cpp b/nn/runtime/VersionedInterfaces.cpp
index 7139b83f1..ccb29dc22 100644
--- a/nn/runtime/VersionedInterfaces.cpp
+++ b/nn/runtime/VersionedInterfaces.cpp
@@ -638,6 +638,7 @@ static std::optional<InitialData> initializeFunction(Device* device) {
LOG(ERROR) << "IDevice::getVersionString returned the error " << toString(versionStatus);
return std::nullopt;
}
+ VLOG(MANAGER) << "Version " << versionString;
const int32_t type = getTypeFunction(device);
if (type == -1) {
diff --git a/nn/runtime/include/NeuralNetworksOEM.h b/nn/runtime/include/NeuralNetworksOEM.h
index 54a5dfe83..e184d5223 100644
--- a/nn/runtime/include/NeuralNetworksOEM.h
+++ b/nn/runtime/include/NeuralNetworksOEM.h
@@ -55,9 +55,10 @@ enum {
}; // extends OperandCode
/**
- * If a model contains an {@link ANEURALNETWORKS_OEM_OPERATION}, then
- * either the model must contain only a single operation, or every
- * tensor operand type in the model must be fully specified.
+ * Before API level 30, if a model contains an
+ * {@link ANEURALNETWORKS_OEM_OPERATION}, then either the model must contain
+ * only a single operation, or every tensor operand type in the model must be
+ * fully specified.
*/
enum {
/**
diff --git a/nn/runtime/test/TestExecution.cpp b/nn/runtime/test/TestExecution.cpp
index 66bef6b2a..3441f9fc4 100644
--- a/nn/runtime/test/TestExecution.cpp
+++ b/nn/runtime/test/TestExecution.cpp
@@ -16,8 +16,6 @@
#include <gtest/gtest.h>
-#include <android-base/scopeguard.h>
-
#include <algorithm>
#include <atomic>
#include <cassert>
@@ -576,7 +574,7 @@ class TestCompilation : public WrapperCompilation {
// fall back to CPU. (If we allow CPU fallback, then when our
// TestDriver reports an execution failure, we'll re-execute
// on CPU, and will not see the failure.)
- c->setPartitioning(DeviceManager::kPartitioningWithoutFallback);
+ c->forTest_setPartitioning(DeviceManager::kPartitioningWithoutFallback);
mCompilation = reinterpret_cast<ANeuralNetworksCompilation*>(c);
}
};
@@ -757,19 +755,15 @@ void ExecutionTestTemplate<DriverClass>::TestWait() {
SCOPED_TRACE("burstCompute");
// TODO: If a burst API is added to nn::test_wrapper (e.g.,
- // Execution::burstCompute()), then use that, rather than using
- // Execution::setComputeMode() to make Execution::compute() use burst
- // functionality.
-
- auto oldComputeMode =
- WrapperExecution::setComputeMode(WrapperExecution::ComputeMode::BURST);
- base::ScopeGuard restore(
- [oldComputeMode] { WrapperExecution::setComputeMode(oldComputeMode); });
+ // Execution::burstCompute()), then use that, rather than
+ // Execution::compute(WrapperExecution::ComputeMode::BURST).
WrapperExecution execution(&mCompilation);
ASSERT_NO_FATAL_FAILURE(setInputOutput(&execution));
TestPreparedModelLatest::pauseExecutions(true);
- std::thread run([this, &execution] { EXPECT_EQ(execution.compute(), kExpectResult); });
+ std::thread run([this, &execution] {
+ EXPECT_EQ(execution.compute(WrapperExecution::ComputeMode::BURST), kExpectResult);
+ });
getDimensionsWhileRunning(execution);
TestPreparedModelLatest::pauseExecutions(false);
run.join();
diff --git a/nn/runtime/test/TestGenerated.cpp b/nn/runtime/test/TestGenerated.cpp
index 70b0e6f72..d7e4d75ad 100644
--- a/nn/runtime/test/TestGenerated.cpp
+++ b/nn/runtime/test/TestGenerated.cpp
@@ -82,6 +82,7 @@ class GeneratedTests : public GeneratedTestBase {
bool mExpectFailure = false;
bool mTestQuantizationCoupling = false;
bool mTestDeviceMemory = false;
+ Execution::ComputeMode mComputeMode = Execution::getComputeMode();
};
int GeneratedTests::mVndkVersion = __ANDROID_API_FUTURE__;
@@ -138,13 +139,14 @@ std::optional<Compilation> GeneratedTests::compileModel(const Model& model) {
}
}
-static void computeWithPtrs(const TestModel& testModel, Execution* execution, Result* result,
+static void computeWithPtrs(const TestModel& testModel, Execution* execution,
+ Execution::ComputeMode computeMode, Result* result,
std::vector<TestBuffer>* outputs) {
{
NNTRACE_APP(NNTRACE_PHASE_INPUTS_AND_OUTPUTS, "computeWithPtrs example");
createRequest(testModel, execution, outputs);
}
- *result = execution->compute();
+ *result = execution->compute(computeMode);
}
static ANeuralNetworksMemory* createDeviceMemoryForInput(const Compilation& compilation,
@@ -175,8 +177,8 @@ static ANeuralNetworksMemory* createDeviceMemoryForOutput(const Compilation& com
// Set result = Result::NO_ERROR and outputs = {} if the test should be skipped.
static void computeWithDeviceMemories(const Compilation& compilation, const TestModel& testModel,
- Execution* execution, Result* result,
- std::vector<TestBuffer>* outputs) {
+ Execution* execution, Execution::ComputeMode computeMode,
+ Result* result, std::vector<TestBuffer>* outputs) {
ASSERT_NE(execution, nullptr);
ASSERT_NE(result, nullptr);
ASSERT_NE(outputs, nullptr);
@@ -218,7 +220,7 @@ static void computeWithDeviceMemories(const Compilation& compilation, const Test
}
}
- *result = execution->compute();
+ *result = execution->compute(computeMode);
// Copy out output results.
for (uint32_t i = 0; i < testModel.main.outputIndexes.size(); i++) {
@@ -245,9 +247,10 @@ void GeneratedTests::executeWithCompilation(const Compilation& compilation,
std::vector<TestBuffer> outputs;
if (mTestDeviceMemory) {
- computeWithDeviceMemories(compilation, testModel, &execution, &result, &outputs);
+ computeWithDeviceMemories(compilation, testModel, &execution, mComputeMode, &result,
+ &outputs);
} else {
- computeWithPtrs(testModel, &execution, &result, &outputs);
+ computeWithPtrs(testModel, &execution, mComputeMode, &result, &outputs);
}
if (result == Result::NO_ERROR && outputs.empty()) {
@@ -265,6 +268,7 @@ void GeneratedTests::executeWithCompilation(const Compilation& compilation,
// Check output dimensions.
for (uint32_t i = 0; i < testModel.main.outputIndexes.size(); i++) {
+ SCOPED_TRACE("Output index: " + std::to_string(i));
const auto& output = testModel.main.operands[testModel.main.outputIndexes[i]];
if (output.isIgnored) continue;
std::vector<uint32_t> actualDimensions;
@@ -387,21 +391,18 @@ void GeneratedTests::TearDown() {
#ifdef NNTEST_COMPUTE_MODE
TEST_P(GeneratedTests, Sync) {
- const auto oldComputeMode = Execution::setComputeMode(Execution::ComputeMode::SYNC);
+ mComputeMode = Execution::ComputeMode::SYNC;
execute(testModel);
- Execution::setComputeMode(oldComputeMode);
}
TEST_P(GeneratedTests, Async) {
- const auto oldComputeMode = Execution::setComputeMode(Execution::ComputeMode::ASYNC);
+ mComputeMode = Execution::ComputeMode::ASYNC;
execute(testModel);
- Execution::setComputeMode(oldComputeMode);
}
TEST_P(GeneratedTests, Burst) {
- const auto oldComputeMode = Execution::setComputeMode(Execution::ComputeMode::BURST);
+ mComputeMode = Execution::ComputeMode::BURST;
execute(testModel);
- Execution::setComputeMode(oldComputeMode);
}
#else
TEST_P(GeneratedTests, Test) {
@@ -426,9 +427,8 @@ TEST_P(DeviceMemoryTest, Test) {
}
TEST_P(FencedComputeTest, Test) {
- const auto oldComputeMode = Execution::setComputeMode(Execution::ComputeMode::FENCED);
+ mComputeMode = Execution::ComputeMode::FENCED;
execute(testModel);
- Execution::setComputeMode(oldComputeMode);
}
INSTANTIATE_GENERATED_TEST(GeneratedTests,
diff --git a/nn/runtime/test/TestNeuralNetworksWrapper.h b/nn/runtime/test/TestNeuralNetworksWrapper.h
index ae40121c7..d89854b1a 100644
--- a/nn/runtime/test/TestNeuralNetworksWrapper.h
+++ b/nn/runtime/test/TestNeuralNetworksWrapper.h
@@ -409,8 +409,23 @@ class Execution {
return result;
}
- Result compute() {
- switch (mComputeMode) {
+ // By default, compute() uses the synchronous API. Either an argument or
+ // setComputeMode() can be used to change the behavior of compute() to
+ // either:
+ // - use the asynchronous or fenced API and then wait for computation to complete
+ // or
+ // - use the burst API
+ // setComputeMode() returns the previous ComputeMode.
+ enum class ComputeMode { SYNC, ASYNC, BURST, FENCED };
+ static ComputeMode setComputeMode(ComputeMode mode) {
+ ComputeMode oldComputeMode = mComputeMode;
+ mComputeMode = mode;
+ return oldComputeMode;
+ }
+ static ComputeMode getComputeMode() { return mComputeMode; }
+
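+ // For example, a test can force a single burst-mode computation without
+ // changing the process-wide default (a sketch based on this wrapper's API):
+ //
+ //   Execution execution(&compilation);
+ //   EXPECT_EQ(execution.compute(Execution::ComputeMode::BURST),
+ //             Result::NO_ERROR);
+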
+ Result compute(ComputeMode computeMode = mComputeMode) {
+ switch (computeMode) {
case ComputeMode::SYNC: {
return static_cast<Result>(ANeuralNetworksExecution_compute(mExecution));
}
@@ -455,19 +470,6 @@ class Execution {
return Result::BAD_DATA;
}
- // By default, compute() uses the synchronous API. setComputeMode() can be
- // used to change the behavior of compute() to either:
- // - use the asynchronous API and then wait for computation to complete
- // or
- // - use the burst API
- // Returns the previous ComputeMode.
- enum class ComputeMode { SYNC, ASYNC, BURST, FENCED };
- static ComputeMode setComputeMode(ComputeMode mode) {
- ComputeMode oldComputeMode = mComputeMode;
- mComputeMode = mode;
- return oldComputeMode;
- }
-
Result getOutputOperandDimensions(uint32_t index, std::vector<uint32_t>* dimensions) {
uint32_t rank = 0;
Result result = static_cast<Result>(
diff --git a/nn/runtime/test/TestPartitioning.cpp b/nn/runtime/test/TestPartitioning.cpp
index 45dabe37c..d85717ce7 100644
--- a/nn/runtime/test/TestPartitioning.cpp
+++ b/nn/runtime/test/TestPartitioning.cpp
@@ -19,6 +19,7 @@
#include <algorithm>
#include <filesystem>
#include <functional>
+#include <iostream>
#include <map>
#include <memory>
#include <queue>
@@ -160,16 +161,12 @@ using ModelBuilder = ::android::nn::ModelBuilder;
using Result = ::android::nn::test_wrapper::Result;
using SampleDriver = ::android::nn::sample_driver::SampleDriver;
using WrapperCompilation = ::android::nn::test_wrapper::Compilation;
+using WrapperExecution = ::android::nn::test_wrapper::Execution;
using WrapperModel = ::android::nn::test_wrapper::Model;
using WrapperOperandType = ::android::nn::test_wrapper::OperandType;
using WrapperSymmPerChannelQuantParams = ::android::nn::test_wrapper::SymmPerChannelQuantParams;
using WrapperType = ::android::nn::test_wrapper::Type;
-template <typename T>
-using MQDescriptorSync = ::android::hardware::MQDescriptorSync<T>;
-
-constexpr Timing kBadTiming = {.timeOnDevice = UINT64_MAX, .timeInDriver = UINT64_MAX};
-
Capabilities makeCapabilities(float perf) {
PerformanceInfo perfInfo = {.execTime = perf, .powerUsage = perf};
return {.relaxedFloat32toFloat16PerformanceScalar = perfInfo,
@@ -309,52 +306,6 @@ void dump(const char* name, const ModelBuilder* model) {
// operation kind K corresponds to the bit (1 << K). The other operations are
// represented by a set of OperationType.
class PartitioningDriver : public SampleDriver {
- private:
- // Placeholder class -- a prepared model must not be nullptr.
- class PartitioningPreparedModel : public IPreparedModel {
- public:
- Return<V1_0::ErrorStatus> execute(const V1_0::Request&,
- const sp<V1_0::IExecutionCallback>&) override {
- return V1_0::ErrorStatus::DEVICE_UNAVAILABLE;
- }
- Return<V1_0::ErrorStatus> execute_1_2(const V1_0::Request&, MeasureTiming,
- const sp<V1_2::IExecutionCallback>&) override {
- return V1_0::ErrorStatus::DEVICE_UNAVAILABLE;
- }
- Return<V1_3::ErrorStatus> execute_1_3(const V1_3::Request&, MeasureTiming,
- const OptionalTimePoint&,
- const OptionalTimeoutDuration&,
- const sp<V1_3::IExecutionCallback>&) override {
- return V1_3::ErrorStatus::DEVICE_UNAVAILABLE;
- }
- Return<void> executeSynchronously(const V1_0::Request&, MeasureTiming,
- executeSynchronously_cb cb) override {
- cb(V1_0::ErrorStatus::DEVICE_UNAVAILABLE, {}, kBadTiming);
- return Void();
- }
- Return<void> executeSynchronously_1_3(const V1_3::Request&, MeasureTiming,
- const OptionalTimePoint&,
- const OptionalTimeoutDuration&,
- executeSynchronously_1_3_cb cb) override {
- cb(V1_3::ErrorStatus::DEVICE_UNAVAILABLE, {}, kBadTiming);
- return Void();
- }
- Return<void> configureExecutionBurst(
- const sp<V1_2::IBurstCallback>& /*callback*/,
- const MQDescriptorSync<V1_2::FmqRequestDatum>& /*requestChannel*/,
- const MQDescriptorSync<V1_2::FmqResultDatum>& /*resultChannel*/,
- configureExecutionBurst_cb cb) override {
- cb(V1_0::ErrorStatus::DEVICE_UNAVAILABLE, nullptr);
- return Void();
- }
- Return<void> executeFenced(const Request&, const hidl_vec<hidl_handle>&, MeasureTiming,
- const OptionalTimePoint&, const OptionalTimeoutDuration&,
- const OptionalTimeoutDuration&, executeFenced_cb cb) {
- cb(ErrorStatus::DEVICE_UNAVAILABLE, hidl_handle(nullptr), nullptr);
- return Void();
- }
- };
-
public:
enum OEM {
OEMNo, // rejected by getSupportedOperations and prepareModel
@@ -372,9 +323,11 @@ class PartitioningDriver : public SampleDriver {
mOEM(oem),
mOperationTypes(std::move(operationTypes)) {
CHECK_EQ(mOperationTypes.count(OperationType::OEM_OPERATION), size_t(0));
- std::for_each(mOperationTypes.begin(), mOperationTypes.end(), [](OperationType type) {
- CHECK_EQ(operationToFirstEncoding.count(type), size_t(0));
- });
+ if (operationMask) {
+ std::for_each(mOperationTypes.begin(), mOperationTypes.end(), [](OperationType type) {
+ CHECK_EQ(operationToFirstEncoding.count(type), size_t(0));
+ });
+ }
}
~PartitioningDriver() override {}
@@ -384,20 +337,38 @@ class PartitioningDriver : public SampleDriver {
}
Return<V1_3::ErrorStatus> prepareModel_1_3(
- const Model& model, ExecutionPreference, Priority, const OptionalTimePoint&,
- const hidl_vec<hidl_handle>&, const hidl_vec<hidl_handle>&, const CacheToken&,
- const sp<V1_3::IPreparedModelCallback>& cb) override {
- V1_3::ErrorStatus status = V1_3::ErrorStatus::NONE;
- if (mOEM != OEMYes) {
+ const Model& model, ExecutionPreference preference, Priority priority,
+ const OptionalTimePoint& deadline, const hidl_vec<hidl_handle>& modelCache,
+ const hidl_vec<hidl_handle>& dataCache, const CacheToken& token,
+ const sp<V1_3::IPreparedModelCallback>& callback) override {
+ if (mOEM == OEMIndecisive) {
for (const auto& operation : model.main.operations) {
if (operation.type == OperationType::OEM_OPERATION) {
- status = V1_3::ErrorStatus::INVALID_ARGUMENT;
- break;
+ callback->notify_1_3(V1_3::ErrorStatus::INVALID_ARGUMENT, nullptr);
+ return V1_3::ErrorStatus::INVALID_ARGUMENT;
}
}
}
- cb->notify_1_3(status, new PartitioningPreparedModel);
- return status;
+
+        // NOTE: We prepare the model only if all of its operations are supported.
+ V1_3::ErrorStatus outStatus = V1_3::ErrorStatus::INVALID_ARGUMENT;
+ auto ret = getSupportedOperations_1_3(
+ model, [&outStatus](V1_3::ErrorStatus inStatus,
+ const hidl_vec<bool>& supportedOperations) {
+ if (inStatus == V1_3::ErrorStatus::NONE) {
+ if (std::all_of(supportedOperations.begin(), supportedOperations.end(),
+ [](bool v) { return v; })) {
+ outStatus = V1_3::ErrorStatus::NONE;
+ }
+ }
+ });
+ if (ret.isOk() && (outStatus == V1_3::ErrorStatus::NONE)) {
+ return SampleDriver::prepareModel_1_3(model, preference, priority, deadline, modelCache,
+ dataCache, token, callback);
+ } else {
+ callback->notify_1_3(V1_3::ErrorStatus::INVALID_ARGUMENT, nullptr);
+ return V1_3::ErrorStatus::INVALID_ARGUMENT;
+ }
}
Return<DeviceStatus> getStatus() override { return DeviceStatus::AVAILABLE; }
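
The rewritten prepareModel_1_3() above delegates the real work to SampleDriver and keys
off the V1.3 HAL's getSupportedOperations_1_3() callback. As a minimal sketch of that
callback contract (names as in the V1_3 HAL; the lambda body is illustrative only):

    // 'supported[i]' corresponds positionally to model.main.operations[i];
    // the driver above prepares the model only if every entry is true.
    device->getSupportedOperations_1_3(
            model, [](V1_3::ErrorStatus status, const hidl_vec<bool>& supported) {
                if (status != V1_3::ErrorStatus::NONE) return;
                // ... inspect 'supported' ...
            });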
@@ -422,13 +393,6 @@ class PartitioningDriver : public SampleDriver {
return Void();
}
- Return<V1_0::ErrorStatus> prepareModelFromCache(
- const hidl_vec<hidl_handle>&, const hidl_vec<hidl_handle>&, const CacheToken&,
- const sp<V1_2::IPreparedModelCallback>& callback) override {
- callback->notify_1_2(V1_0::ErrorStatus::NONE, new PartitioningPreparedModel);
- return V1_0::ErrorStatus::NONE;
- }
-
private:
std::vector<bool> getSupportedOperationsForSubgraph(const Model& model,
const Subgraph& subgraph) {
@@ -624,10 +588,40 @@ class PartitioningDriverV1_0 : public V1_0::IDevice {
const sp<V1_3::IDevice> mLatestDriver;
};
-enum class Dimensioned { NO, YES };
+enum class Dimensioned {
+    NO,     // either a scalar, or a tensor of unspecified rank (the usual
+            // case) or of specified rank but with no specified dimensions
+            // (only where specifically stated)
+ YES_1, // tensor of shape { 1 }
+ YES_2, // tensor of shape { 2 }
+ YES = YES_1
+};
+
+std::vector<uint32_t> dimensions(Dimensioned dimensioned) {
+ switch (dimensioned) {
+ default:
+ EXPECT_TRUE(false) << "Unknown value";
+ FALLTHROUGH_INTENDED;
+ case Dimensioned::NO:
+ return {};
+ case Dimensioned::YES_1:
+ return {1};
+ case Dimensioned::YES_2:
+ return {2};
+ }
+}
std::string toString(Dimensioned dimensioned) {
- return dimensioned == Dimensioned::NO ? "NO" : "YES";
+ switch (dimensioned) {
+ default:
+ return "<Unknown value>";
+ case Dimensioned::NO:
+ return "NO";
+ case Dimensioned::YES_1:
+ return "YES_1";
+ case Dimensioned::YES_2:
+ return "YES_2";
+ }
}
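
A short sketch (assuming only the test_wrapper types already in scope) of how these
helpers feed operand construction in the tests below:

    // Fully specified shape: dimensions(Dimensioned::YES_2) == {2}.
    WrapperOperandType specified(WrapperType::TENSOR_FLOAT32, dimensions(Dimensioned::YES_2));
    // Unspecified rank (and hence unspecified dimensions): dimensions(Dimensioned::NO) == {}.
    WrapperOperandType unspecified(WrapperType::TENSOR_FLOAT32, dimensions(Dimensioned::NO));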
// This class adds some simple abstractions and utilities on top of
@@ -642,12 +636,24 @@ class PartitioningModel : private WrapperModel {
using WrapperModel::identifyInputsAndOutputs;
using WrapperModel::isValid;
using WrapperModel::relaxComputationFloat32toFloat16;
+ using WrapperModel::setOperandValue;
// Create a tensor operand of the specified type, and return the
// corresponding operand index.
+ uint32_t addIntOperand(Dimensioned dimensioned = Dimensioned::YES) {
+ return addOperand(WrapperType::TENSOR_INT32, dimensioned);
+ }
+ uint32_t addIntScalarOperand(std::optional<int> v = std::nullopt) {
+ uint32_t opnd = addOperand(WrapperType::INT32);
+ if (v.has_value()) {
+ setOperandValue(opnd, &v.value());
+ }
+ return opnd;
+ }
uint32_t addFloatOperand(Dimensioned dimensioned = Dimensioned::YES) {
return addOperand(WrapperType::TENSOR_FLOAT32, dimensioned);
}
+ uint32_t addFloatScalarOperand() { return addOperand(WrapperType::FLOAT32); }
uint32_t addQuantOperand(Dimensioned dimensioned = Dimensioned::YES) {
return addOperand(WrapperType::TENSOR_QUANT8_ASYMM, dimensioned);
}
@@ -658,14 +664,6 @@ class PartitioningModel : private WrapperModel {
// Create an operand of the specified type, and return the corresponding
// operand index.
uint32_t addOperand(WrapperType wrapperType, Dimensioned dimensioned = Dimensioned::YES) {
- auto dimensions = [dimensioned]() -> std::vector<uint32_t> {
- if (dimensioned == Dimensioned::YES) {
- return {1};
- } else {
- return {};
- }
- };
-
switch (static_cast<int>(wrapperType)) {
case ANEURALNETWORKS_BOOL:
case ANEURALNETWORKS_FLOAT16:
@@ -680,7 +678,7 @@ class PartitioningModel : private WrapperModel {
case ANEURALNETWORKS_TENSOR_FLOAT16:
case ANEURALNETWORKS_TENSOR_FLOAT32:
case ANEURALNETWORKS_TENSOR_OEM_BYTE:
- return addOperand(WrapperOperandType{wrapperType, dimensions()});
+ return addOperand(WrapperOperandType{wrapperType, dimensions(dimensioned)});
case ANEURALNETWORKS_TENSOR_INT32:
case ANEURALNETWORKS_TENSOR_QUANT8_ASYMM:
@@ -688,10 +686,10 @@ class PartitioningModel : private WrapperModel {
case ANEURALNETWORKS_TENSOR_QUANT8_SYMM:
case ANEURALNETWORKS_TENSOR_QUANT16_ASYMM:
case ANEURALNETWORKS_TENSOR_QUANT16_SYMM:
- return addOperand(WrapperOperandType{wrapperType, dimensions(), 1.0f});
+ return addOperand(WrapperOperandType{wrapperType, dimensions(dimensioned), 1.0f});
case ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL:
- return addOperand(WrapperOperandType{wrapperType, dimensions(),
+ return addOperand(WrapperOperandType{wrapperType, dimensions(dimensioned),
WrapperSymmPerChannelQuantParams({1.0f}, 0)});
default:
@@ -862,11 +860,21 @@ class PartitioningModel : private WrapperModel {
// Create an operand of the same type as the specified operand,
// and return the operand index of the new operand.
+ //
+ // If a tensor, the new operand will have the same rank as the specified
+ // operand. If dimensioned == Dimensioned::NO, then all dimensions of a new
+ // tensor operand will be unspecified. If dimensioned != Dimensioned::NO,
+ // then all dimensions of a new tensor operand will have the implied value
+ // (e.g., YES_1 means each dimension will have the value "1").
uint32_t addOperandOfSameType(uint32_t operand, Dimensioned dimensioned = Dimensioned::YES) {
WrapperOperandType type = mWrapperOperandType.at(operand);
+
+ const auto d = dimensions(dimensioned);
+ EXPECT_TRUE(d.size() <= 1);
for (auto& dimension : type.dimensions) {
- dimension = (dimensioned == Dimensioned::YES);
+ dimension = (dimensioned == Dimensioned::NO ? 0 : d[0]);
}
+
mWrapperOperandType.push_back(type);
return WrapperModel::addOperand(&type);
}
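
A worked illustration (hypothetical operand, not part of the patch) of the
rank-preserving behavior documented above:

    // Suppose 'operand' is a TENSOR_FLOAT32 of dimensions {3, 4}. Then:
    //   addOperandOfSameType(operand, Dimensioned::YES_1) -> dimensions {1, 1}
    //   addOperandOfSameType(operand, Dimensioned::YES_2) -> dimensions {2, 2}
    //   addOperandOfSameType(operand, Dimensioned::NO)    -> dimensions {0, 0}
    // (In NNAPI, a dimension of 0 means "unspecified".)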
@@ -888,7 +896,13 @@ class PartitioningCompilation : public WrapperCompilation {
}
Result setPartitioning(uint32_t partitioning) {
- return static_cast<Result>(builder()->setPartitioning(partitioning));
+ return static_cast<Result>(builder()->forTest_setPartitioning(partitioning));
+ }
+
+ // Simulate recoverable partitioning failure.
+ Result failPartitioning() {
+ return static_cast<Result>(
+ builder()->forTest_failPartitioning(static_cast<int>(Result::OP_FAILED)));
}
using WrapperCompilation::finish;
@@ -926,6 +940,7 @@ class PartitioningCompilation : public WrapperCompilation {
class PartitioningTest : public ::testing::Test {
protected:
+ using DynamicTemporariesType = decltype(ExecutionPlan().forTest_flatGetDynamicTemporaries());
using RemapVectorType = ExecutionStep::RemapVectorType;
using StepModelOutputSetType = ExecutionStep::StepModelOutputSetType;
@@ -1270,6 +1285,12 @@ class PartitioningTest : public ::testing::Test {
uint32_t outputA = modelA->getOutputOperandIndex(i);
uint32_t outputB = modelB->getOutputOperandIndex(i);
if (!compare(modelA->getOperand(outputA), modelB->getOperand(outputB))) {
+#ifdef VERBOSE
+ std::cout << "modelA.output[" << i << "] = operand[" << outputA
+ << "] = " << toString(modelA->getOperand(outputA)) << std::endl;
+ std::cout << "modelB.output[" << i << "] = operand[" << outputB
+ << "] = " << toString(modelB->getOperand(outputB)) << std::endl;
+#endif
RETURN_FALSE();
}
equivalentOperandsAToB[outputA] = outputB;
@@ -1347,6 +1368,12 @@ class PartitioningTest : public ::testing::Test {
}
// We haven't identified an equivalent operand for inputA.
if (!compare(modelA->getOperand(inputA), modelB->getOperand(inputB))) {
+#ifdef VERBOSE
+ std::cout << "modelA.input[" << i << "] = operand[" << inputA
+ << "] = " << toString(modelA->getOperand(inputA)) << std::endl;
+ std::cout << "modelB.input[" << i << "] = operand[" << inputB
+ << "] = " << toString(modelB->getOperand(inputB)) << std::endl;
+#endif
RETURN_FALSE();
}
equivalentOperandsAToB[inputA] = inputB;
@@ -1392,7 +1419,8 @@ class PartitioningTest : public ::testing::Test {
std::shared_ptr<Device> device, const RemapVectorType& modelInputs,
const RemapVectorType& modelOutputs, const RemapVectorType& tempsAsStepModelInputs,
const StepModelOutputSetType& tempsAsStepModelOutputs,
- const RemapVectorType& outputsAsStepModelInputs) {
+ const RemapVectorType& outputsAsStepModelInputs,
+ const std::set<uint32_t>& modelOutputsThatAreDownstreamInputs) {
ASSERT_TRUE(logicalStep->isExecution());
const ExecutionStep* step = logicalStep->executionStep();
std::map<uint32_t, uint32_t> inputsAndOutputsModelToStep;
@@ -1410,6 +1438,8 @@ class PartitioningTest : public ::testing::Test {
ASSERT_TRUE(compareRemapVectors(inputsAndOutputsModelToStep,
step->getOutputsAsStepModelInputs(),
outputsAsStepModelInputs));
+ ASSERT_TRUE(modelOutputsThatAreDownstreamInputs ==
+ step->getModelOutputsThatAreDownstreamInputs());
}
private:
@@ -1455,6 +1485,7 @@ TEST_F(PartitioningTest, SimpleModel) {
ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER,
ExecutePriority::DEFAULT, {}, &planA),
ANEURALNETWORKS_NO_ERROR);
+ EXPECT_TRUE(planA.forTest_flatGetDynamicTemporaries().empty());
ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
ASSERT_EQ(planA.forTest_simpleGetDevice()->getName(), "good");
@@ -1467,6 +1498,7 @@ TEST_F(PartitioningTest, SimpleModel) {
ASSERT_EQ(model.partitionTheWork(devicesC, ExecutePreference::PREFER_LOW_POWER,
ExecutePriority::DEFAULT, {}, &planC),
ANEURALNETWORKS_NO_ERROR);
+ EXPECT_TRUE(planC.forTest_flatGetDynamicTemporaries().empty());
ASSERT_EQ(planC.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
ASSERT_EQ(planC.forTest_simpleGetDevice(), DeviceManager::getCpuDevice());
@@ -1479,6 +1511,7 @@ TEST_F(PartitioningTest, SimpleModel) {
ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER,
ExecutePriority::DEFAULT, {}, &planB),
ANEURALNETWORKS_NO_ERROR);
+ EXPECT_TRUE(planB.forTest_flatGetDynamicTemporaries().empty());
ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
const auto& stepsB = planB.forTest_compoundGetSteps();
ASSERT_EQ(stepsB.size(), size_t(2));
@@ -1498,7 +1531,8 @@ TEST_F(PartitioningTest, SimpleModel) {
RemapVectorType{}, // modelOutputs
RemapVectorType{}, // tempsAsStepModelInputs
StepModelOutputSetType{{opnd2, b0Opnd2}}, // tempsAsStepModelOutputs
- RemapVectorType{})); // outputsAsStepModelInputs;
+ RemapVectorType{}, // outputsAsStepModelInputs
+ {})); // modelOutputsThatAreDownstreamInputs
}
{
// Build a model to compare against the step model from stepsB[1].
@@ -1520,7 +1554,8 @@ TEST_F(PartitioningTest, SimpleModel) {
RemapVectorType{{opnd4, b1Opnd4}}, // modelOutputs
RemapVectorType{{opnd2, b1Opnd2}}, // tempsAsStepModelInputs
StepModelOutputSetType{}, // tempsAsStepModelOutputs
- RemapVectorType{})); // outputsAsStepModelInputs
+ RemapVectorType{}, // outputsAsStepModelInputs
+ {})); // modelOutputsThatAreDownstreamInputs
}
}
@@ -1548,6 +1583,7 @@ TEST_F(PartitioningTest, SliceModel) {
ASSERT_EQ(model.partitionTheWork(devicesA, ExecutePreference::PREFER_LOW_POWER,
ExecutePriority::DEFAULT, {}, &planA),
ANEURALNETWORKS_NO_ERROR);
+ EXPECT_TRUE(planA.forTest_flatGetDynamicTemporaries().empty());
ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
ASSERT_NE(planA.forTest_simpleGetDevice().get(), nullptr);
ASSERT_EQ(planA.forTest_simpleGetDevice()->getName(), "V1_3");
@@ -1562,6 +1598,7 @@ TEST_F(PartitioningTest, SliceModel) {
ASSERT_EQ(model.partitionTheWork(devicesB, ExecutePreference::PREFER_LOW_POWER,
ExecutePriority::DEFAULT, {}, &planB),
ANEURALNETWORKS_NO_ERROR);
+ EXPECT_TRUE(planB.forTest_flatGetDynamicTemporaries().empty());
ASSERT_EQ(planB.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
const auto& stepsB = planB.forTest_compoundGetSteps();
ASSERT_EQ(stepsB.size(), size_t(4));
@@ -1581,7 +1618,8 @@ TEST_F(PartitioningTest, SliceModel) {
RemapVectorType{{opnd4, b0Opnd2}}, // modelOutputs
RemapVectorType{}, // tempsAsStepModelInputs
StepModelOutputSetType{}, // tempsAsStepModelOutputs
- RemapVectorType{})); // outputsAsStepModelInputs
+ RemapVectorType{}, // outputsAsStepModelInputs
+ {})); // modelOutputsThatAreDownstreamInputs
}
{
// Build a model to compare against the step model from stepsB[1].
@@ -1594,13 +1632,16 @@ TEST_F(PartitioningTest, SliceModel) {
modelB1.finish();
ASSERT_TRUE(modelB1.isValid());
+        // Note that this is also an important test of our ability to detect
+        // modelOutputsThatAreDownstreamInputs.
ASSERT_NO_FATAL_FAILURE(
compare(stepsB[1], &modelB1, devicesB[0],
RemapVectorType{{opnd0, b1Opnd0}, {opnd1, b1Opnd1}}, // modelInputs
RemapVectorType{{opnd2, b1Opnd2}}, // modelOutputs
RemapVectorType{}, // tempsAsStepModelInputs
StepModelOutputSetType{{opnd3, b1Opnd3}}, // tempsAsStepModelOutputs
- RemapVectorType{})); // outputsAsStepModelInputs
+ RemapVectorType{}, // outputsAsStepModelInputs
+ {0u})); // modelOutputsThatAreDownstreamInputs
}
{
// Build a model to compare against the step model from stepsB[2].
@@ -1617,9 +1658,10 @@ TEST_F(PartitioningTest, SliceModel) {
ASSERT_NO_FATAL_FAILURE(
compare(stepsB[2], &modelB2, devicesB[3], RemapVectorType{}, // modelInputs
RemapVectorType{{opnd6, b2Opnd1}}, // modelOutputs
- RemapVectorType{}, // tempsAsStepModelInputs
- StepModelOutputSetType{}, // tempsAsStepModelOutputs
- RemapVectorType{{opnd2, b2Opnd0}})); // outputsAsStepModelInputs
+ RemapVectorType{}, // tempsAsStepModelInputs
+ StepModelOutputSetType{}, // tempsAsStepModelOutputs
+ RemapVectorType{{opnd2, b2Opnd0}}, // outputsAsStepModelInputs
+ {})); // modelOutputsThatAreDownstreamInputs
}
{
// Build a model to compare against the step model from stepsB[3].
@@ -1640,9 +1682,10 @@ TEST_F(PartitioningTest, SliceModel) {
ASSERT_NO_FATAL_FAILURE(
compare(stepsB[3], &modelB3, devicesB[2], RemapVectorType{}, // modelInputs
RemapVectorType{{opnd5, b3Opnd2}}, // modelOutputs
- RemapVectorType{{opnd3, b3Opnd1}}, // tempsAsStepModelInputs
- StepModelOutputSetType{}, // tempsAsStepModelOutputs
- RemapVectorType{{opnd2, b3Opnd0}})); // outputsAsStepModelInputs
+ RemapVectorType{{opnd3, b3Opnd1}}, // tempsAsStepModelInputs
+ StepModelOutputSetType{}, // tempsAsStepModelOutputs
+ RemapVectorType{{opnd2, b3Opnd0}}, // outputsAsStepModelInputs
+ {})); // modelOutputsThatAreDownstreamInputs
}
// TODO: Make sure this still works when we have multiple devices
@@ -1670,6 +1713,7 @@ TEST_F(PartitioningTest, SliceModelToEmpty) {
ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
ExecutePriority::DEFAULT, {}, &plan),
ANEURALNETWORKS_NO_ERROR);
+ EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
ASSERT_NE(plan.forTest_simpleGetDevice().get(), nullptr);
ASSERT_EQ(plan.forTest_simpleGetDevice()->getName(), "V1_3");
@@ -1709,6 +1753,7 @@ TEST_F(PartitioningTest, Cpu) {
ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
ExecutePriority::DEFAULT, {}, &plan),
ANEURALNETWORKS_NO_ERROR);
+ EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
const auto& steps = plan.forTest_compoundGetSteps();
ASSERT_EQ(steps.size(), size_t(3));
@@ -1732,7 +1777,8 @@ TEST_F(PartitioningTest, Cpu) {
RemapVectorType{}, // tempsAsStepModelInputs
StepModelOutputSetType{{opnd2, m0Opnd2},
{opnd3, m0Opnd3}}, // tempsAsStepModelOutputs
- RemapVectorType{})); // outputsAsStepModelInputs
+ RemapVectorType{}, // outputsAsStepModelInputs
+ {})); // modelOutputsThatAreDownstreamInputs
}
{
const auto& step1 = steps[1];
@@ -1754,7 +1800,8 @@ TEST_F(PartitioningTest, Cpu) {
RemapVectorType{{opnd4, m1Opnd4}}, // modelOutputs
RemapVectorType{{opnd3, m1Opnd3}, {opnd2, m1Opnd2}}, // tempsAsStepModelInputs
StepModelOutputSetType{{opnd5, m1Opnd5}}, // tempsAsStepModelOutputs
- RemapVectorType{})); // outputsAsStepModelInputs
+ RemapVectorType{}, // outputsAsStepModelInputs
+ {})); // modelOutputsThatAreDownstreamInputs
}
{
const auto& step2 = steps[2];
@@ -1775,7 +1822,8 @@ TEST_F(PartitioningTest, Cpu) {
RemapVectorType{{opnd8, m2Opnd8}}, // modelOutputs
RemapVectorType{{opnd3, m2Opnd3}, {opnd5, m2Opnd5}}, // tempsAsStepModelInputs
StepModelOutputSetType{}, // tempsAsStepModelOutputs
- RemapVectorType{})); // outputsAsStepModelInputs
+ RemapVectorType{}, // outputsAsStepModelInputs
+ {})); // modelOutputsThatAreDownstreamInputs
}
}
@@ -1790,10 +1838,6 @@ TEST_F(PartitioningTest, SetPartitioning) {
model.finish();
ASSERT_TRUE(model.isValid());
- // We expect that we cannot successfully partition, because we
- // have an intermediate operand (opnd2) without dimensions, and
- // this is not currently handled.
-
// One device that can and should execute operation 0.
const auto devices = makeDevices({{"hw", 0.5, (1 << 0)}});
@@ -1803,32 +1847,31 @@ TEST_F(PartitioningTest, SetPartitioning) {
// didn't actually do any partitioning.
PartitioningCompilation cPNo(&model, devices);
ASSERT_EQ(cPNo.setPartitioning(DeviceManager::kPartitioningNo), Result::NO_ERROR);
+ ASSERT_EQ(cPNo.failPartitioning(), Result::NO_ERROR);
ASSERT_EQ(cPNo.finish(), Result::NO_ERROR);
ASSERT_EQ(cPNo.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
ASSERT_EQ(cPNo.getExecutionPlan().forTest_simpleGetDevice(), DeviceManager::getCpuDevice());
- // Test kPartitioningWithFallback. We should attempt
- // partitioning, reach the end of the partitioning process (so we
- // have an unsuccessful execution plan), discover the dimensionless
- // intermediate operand, then fallback to CPU with a SIMPLE plan, and
- // finally return success.
- // No need to compare the original model to the model from the plan -- we
- // didn't actually do any partitioning.
+ // Test kPartitioningWithFallback. We should attempt partitioning, simulate
+ // a recoverable failure, then fallback to CPU with a SIMPLE plan, and
+ // finally return success. No need to compare the original model to the
+ // model from the plan -- we didn't actually do any partitioning.
PartitioningCompilation cPWithFallback(&model, devices);
ASSERT_EQ(cPWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback),
Result::NO_ERROR);
+ ASSERT_EQ(cPWithFallback.failPartitioning(), Result::NO_ERROR);
ASSERT_EQ(cPWithFallback.finish(), Result::NO_ERROR);
ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
ASSERT_EQ(cPWithFallback.getExecutionPlan().forTest_simpleGetDevice(),
DeviceManager::getCpuDevice());
- // Test kPartitioningWithoutFallback. We should attempt
- // partitioning, and fail.
+ // Test kPartitioningWithoutFallback. We should attempt partitioning,
+ // simulate a recoverable failure, and fail.
PartitioningCompilation cPWithoutFallback(&model, devices);
ASSERT_EQ(cPWithoutFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback),
Result::NO_ERROR);
+ ASSERT_EQ(cPWithoutFallback.failPartitioning(), Result::NO_ERROR);
ASSERT_EQ(cPWithoutFallback.finish(), Result::OP_FAILED);
- ASSERT_TRUE(cPWithoutFallback.getExecutionPlan().forTest_hasStepModelOutputsOfUnknownSize());
ASSERT_EQ(cPWithoutFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::ERROR);
}
@@ -1854,6 +1897,7 @@ TEST_F(PartitioningTest, ModelOutputAsStepModelInput) {
ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
ExecutePriority::DEFAULT, {}, &plan),
ANEURALNETWORKS_NO_ERROR);
+ EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
const auto& steps = plan.forTest_compoundGetSteps();
ASSERT_EQ(steps.size(), size_t(2));
@@ -1872,7 +1916,8 @@ TEST_F(PartitioningTest, ModelOutputAsStepModelInput) {
RemapVectorType{{opnd2, m0Opnd2}}, // modelOutputs
RemapVectorType{}, // tempsAsStepModelInputs
StepModelOutputSetType{}, // tempsAsStepModelOutputs
- RemapVectorType{})); // outputsAsStepModelInputs
+ RemapVectorType{}, // outputsAsStepModelInputs
+ {0u})); // modelOutputsThatAreDownstreamInputs
}
{
// Build a model to compare against the step model from steps[1].
@@ -1887,8 +1932,9 @@ TEST_F(PartitioningTest, ModelOutputAsStepModelInput) {
compare(steps[1], &model1, devices[1], RemapVectorType{}, // modelInputs
RemapVectorType{{opnd3, m1Opnd3}}, // modelOutputs
RemapVectorType{}, // tempsAsStepModelInputs
- StepModelOutputSetType{}, // tempsAsStepModelOutputs
- RemapVectorType{{opnd2, m1Opnd2}})); // outputsAsStepModelInputs
+ StepModelOutputSetType{}, // tempsAsStepModelOutputs
+ RemapVectorType{{opnd2, m1Opnd2}}, // outputsAsStepModelInputs
+ {})); // modelOutputsThatAreDownstreamInputs
}
}
@@ -1955,6 +2001,7 @@ TEST_F(PartitioningTest, RelaxedFP) {
ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
ExecutePriority::DEFAULT, {}, &plan),
ANEURALNETWORKS_NO_ERROR);
+ EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
ASSERT_EQ(plan.forTest_simpleGetDevice()->getName(), expectDevice);
};
@@ -2008,6 +2055,7 @@ TEST_F(PartitioningTest, Perf) {
ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
ExecutePriority::DEFAULT, {}, &plan),
ANEURALNETWORKS_NO_ERROR);
+ EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
ASSERT_EQ(plan.forTest_simpleGetDevice()->getName(), "good");
}
@@ -2027,6 +2075,7 @@ TEST_F(PartitioningTest, Perf) {
ASSERT_EQ(model.partitionTheWork(devices, ExecutePreference::PREFER_LOW_POWER,
ExecutePriority::DEFAULT, {}, &plan),
ANEURALNETWORKS_NO_ERROR);
+ EXPECT_TRUE(plan.forTest_flatGetDynamicTemporaries().empty());
ASSERT_EQ(plan.forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
ASSERT_EQ(plan.forTest_simpleGetDevice()->getName(), "base");
}
@@ -2042,6 +2091,262 @@ TEST_F(PartitioningTest, Perf) {
}
}
+// Test dynamic temporaries and related parts of the partitioning implementation.
+//
+// opnd0 = model input // fill shape
+// opnd1 = constant // fill value
+// opnd2 = FILL(opnd0, opnd1) // model output
+// opnd3 = FILL(opnd0, opnd1)
+// opnd4 = ADD(opnd2, opnd3, FUSED_NONE) // model output
+class DynamicTemporariesTest : public PartitioningTest {
+ protected:
+ // Call these functions in sequence in order to perform the test.
+ // Call to declareOutputDimensions() can be omitted (see the default values below).
+ void declareOutputDimensions(bool opnd2ModelAndPartitionOutputSpecified,
+ bool opnd3PartitionOutputSpecified,
+ bool opnd4ModelOutputSpecified);
+ void makeModelAndValidate();
+ void compileModelAndComparePlan();
+ void executeCompilationAndCompareOutput(bool opnd2ModelOutputBigEnough,
+ bool opnd4ModelOutputBigEnough);
+
+ // set by declareOutputDimensions()
+ bool mOpnd2ModelAndPartitionOutputSpecified = false;
+ bool mOpnd3PartitionOutputSpecified = false;
+ bool mOpnd4ModelOutputSpecified = false;
+
+ // created by makeModelAndValidate()
+ std::optional<PartitioningModel> mModel;
+ std::vector<uint32_t> mOpnds;
+
+    // created by compileModelAndComparePlan()
+ std::optional<PartitioningCompilation> mCompilation;
+
+ static Dimensioned dimensioned(bool specified) {
+ return specified ? Dimensioned::YES_2 : Dimensioned::NO;
+ }
+
+ static constexpr float kFillValue = 3.0f;
+};
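
Each TEST_F below drives the fixture through the sequence prescribed above; in outline
(a sketch mirroring the actual test bodies, which additionally wrap each call in
ASSERT_NO_FATAL_FAILURE):

    TEST_F(DynamicTemporariesTest, Sketch) {  // hypothetical test name
        declareOutputDimensions(false, true, false);  // optional; defaults are all false
        makeModelAndValidate();
        compileModelAndComparePlan();
        executeCompilationAndCompareOutput(true, true);
    }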
+
+void DynamicTemporariesTest::declareOutputDimensions(bool opnd2ModelAndPartitionOutputSpecified,
+ bool opnd3PartitionOutputSpecified,
+ bool opnd4ModelOutputSpecified) {
+ ASSERT_FALSE(mModel.has_value());
+ mOpnd2ModelAndPartitionOutputSpecified = opnd2ModelAndPartitionOutputSpecified;
+ mOpnd3PartitionOutputSpecified = opnd3PartitionOutputSpecified;
+ mOpnd4ModelOutputSpecified = opnd4ModelOutputSpecified;
+}
+
+void DynamicTemporariesTest::makeModelAndValidate() {
+ ASSERT_FALSE(mModel.has_value());
+ mModel = PartitioningModel();
+
+ uint32_t opndActivation = mModel->addIntScalarOperand(ANEURALNETWORKS_FUSED_NONE);
+
+ uint32_t opnd0 = mModel->addIntOperand(Dimensioned::NO); // desired output tensor shape
+ uint32_t opnd1 = mModel->addFloatScalarOperand(); // fill value
+ mModel->setOperandValue(opnd1, &kFillValue, sizeof(kFillValue));
+ uint32_t opnd2 = mModel->addExplicitOperationXTo1(
+ ANEURALNETWORKS_FILL, {opnd0, opnd1}, WrapperType::TENSOR_FLOAT32,
+ dimensioned(mOpnd2ModelAndPartitionOutputSpecified));
+ uint32_t opnd3 = mModel->addExplicitOperationXTo1(ANEURALNETWORKS_FILL, {opnd0, opnd1},
+ WrapperType::TENSOR_FLOAT32,
+ dimensioned(mOpnd3PartitionOutputSpecified));
+ uint32_t opnd4 = mModel->addExplicitOperationXTo1(
+ ANEURALNETWORKS_ADD, {opnd2, opnd3, opndActivation}, WrapperType::TENSOR_FLOAT32,
+ dimensioned(mOpnd4ModelOutputSpecified));
+ mModel->identifyInputsAndOutputs({opnd0}, {opnd2, opnd4});
+ mModel->finish();
+ ASSERT_TRUE(mModel->isValid());
+
+ mOpnds = {opnd0, opnd1, opnd2, opnd3, opnd4};
+}
+
+void DynamicTemporariesTest::compileModelAndComparePlan() {
+ ASSERT_TRUE(mModel.has_value());
+ ASSERT_TRUE(!mCompilation.has_value());
+
+ auto devices = makeDevices({{"fill", 0.9, 0U, PartitioningDriver::OEMNo, {OperationType::FILL}},
+ {"add", 0.9, 0U, PartitioningDriver::OEMNo, {OperationType::ADD}}});
+
+ mCompilation = PartitioningCompilation(&mModel.value(), devices);
+ ASSERT_EQ(mCompilation->setPartitioning(DeviceManager::kPartitioningWithoutFallback),
+ Result::NO_ERROR);
+ ASSERT_EQ(mCompilation->finish(), Result::NO_ERROR);
+ const ExecutionPlan& planA = mCompilation->getExecutionPlan();
+ EXPECT_TRUE(planA.forTest_flatGetDynamicTemporaries() ==
+ (mOpnd3PartitionOutputSpecified ? DynamicTemporariesType{}
+ : DynamicTemporariesType{mOpnds[3]}));
+ ASSERT_EQ(planA.forTest_getKind(), ExecutionPlan::Kind::COMPOUND);
+ const auto& stepsA = planA.forTest_compoundGetSteps();
+ ASSERT_EQ(stepsA.size(), size_t(2));
+ {
+ // Build a model to compare against the step model from stepsA[0].
+ PartitioningModel modelA0;
+ uint32_t a0Opnd0 = modelA0.addIntOperand(Dimensioned::NO);
+ uint32_t a0Opnd1 = modelA0.addFloatScalarOperand();
+ modelA0.setOperandValue(a0Opnd1, &kFillValue, sizeof(kFillValue));
+ uint32_t a0Opnd2 = modelA0.addExplicitOperationXTo1(
+ ANEURALNETWORKS_FILL, {a0Opnd0, a0Opnd1}, WrapperType::TENSOR_FLOAT32,
+ dimensioned(mOpnd3PartitionOutputSpecified));
+ uint32_t a0Opnd3 = modelA0.addExplicitOperationXTo1(
+ ANEURALNETWORKS_FILL, {a0Opnd0, a0Opnd1}, WrapperType::TENSOR_FLOAT32,
+ dimensioned(mOpnd2ModelAndPartitionOutputSpecified));
+ modelA0.identifyInputsAndOutputs({a0Opnd0}, {a0Opnd3, a0Opnd2});
+ modelA0.finish();
+ ASSERT_TRUE(modelA0.isValid());
+
+ ASSERT_NO_FATAL_FAILURE(
+ compare(stepsA[0], &modelA0, devices[0],
+ RemapVectorType{{mOpnds[0], a0Opnd0}}, // modelInputs
+ RemapVectorType{{mOpnds[2], a0Opnd3}}, // modelOutputs
+ RemapVectorType{}, // tempsAsStepModelInputs
+ StepModelOutputSetType{{mOpnds[3], a0Opnd2}}, // tempsAsStepModelOutputs
+ RemapVectorType{}, // outputsAsStepModelInputs
+ {0u})); // modelOutputsThatAreDownstreamInputs
+ }
+ {
+ // Build a model to compare against the step model from stepsA[1].
+ PartitioningModel modelA1;
+ uint32_t a1Opnd2 =
+ modelA1.addFloatOperand(dimensioned(mOpnd2ModelAndPartitionOutputSpecified));
+ uint32_t a1Opnd3 = modelA1.addFloatOperand(dimensioned(mOpnd3PartitionOutputSpecified));
+ uint32_t a1Opnd4 = modelA1.addOperation2To1V1_0(0, a1Opnd2, a1Opnd3,
+ dimensioned(mOpnd4ModelOutputSpecified));
+ modelA1.identifyInputsAndOutputs({a1Opnd3, a1Opnd2}, {a1Opnd4});
+ modelA1.finish();
+ ASSERT_TRUE(modelA1.isValid());
+
+ ASSERT_NO_FATAL_FAILURE(
+ compare(stepsA[1], &modelA1, devices[1], RemapVectorType{}, // modelInputs
+ RemapVectorType{{mOpnds[4], a1Opnd4}}, // modelOutputs
+ RemapVectorType{{mOpnds[3], a1Opnd3}}, // tempsAsStepModelInputs
+ StepModelOutputSetType{}, // tempsAsStepModelOutputs
+ RemapVectorType{{mOpnds[2], a1Opnd2}}, // outputsAsStepModelInputs
+ {})); // modelOutputsThatAreDownstreamInputs
+ }
+}
+
+void DynamicTemporariesTest::executeCompilationAndCompareOutput(bool opnd2ModelOutputBigEnough,
+ bool opnd4ModelOutputBigEnough) {
+ ASSERT_TRUE(opnd2ModelOutputBigEnough || !mOpnd2ModelAndPartitionOutputSpecified);
+ ASSERT_TRUE(opnd4ModelOutputBigEnough || !mOpnd4ModelOutputSpecified);
+
+ ASSERT_TRUE(mCompilation.has_value());
+ WrapperExecution e(&mCompilation.value());
+
+ WrapperOperandType shapeType(WrapperType::TENSOR_INT32, {1});
+ const int shape[1] = {2};
+ e.setInput(0, &shape, &shapeType.operandType);
+
+ auto setOutput = [&e](uint32_t index, float* buffer, bool bigEnough, bool specified) {
+ const uint32_t elts = bigEnough ? 2 : 1;
+ std::fill(buffer, buffer + elts, 0.0f);
+ using DimsType = std::vector<uint32_t>;
+ WrapperOperandType outputType(WrapperType::TENSOR_FLOAT32,
+ specified ? DimsType{elts} : DimsType{});
+ e.setOutput(index, buffer, elts * sizeof(float), &outputType.operandType);
+ };
+ float opnd2ModelOutput[2], opnd4ModelOutput[2];
+ setOutput(0, opnd2ModelOutput, opnd2ModelOutputBigEnough,
+ mOpnd2ModelAndPartitionOutputSpecified);
+ setOutput(1, opnd4ModelOutput, opnd4ModelOutputBigEnough, mOpnd4ModelOutputSpecified);
+
+ const Result expectResult = opnd2ModelOutputBigEnough && opnd4ModelOutputBigEnough
+ ? Result::NO_ERROR
+ : Result::OUTPUT_INSUFFICIENT_SIZE;
+ ASSERT_EQ(e.compute(), expectResult);
+ if (expectResult == Result::NO_ERROR) {
+ ASSERT_TRUE(std::all_of(std::begin(opnd2ModelOutput), std::end(opnd2ModelOutput),
+ [](float v) { return v == kFillValue; }));
+ ASSERT_TRUE(std::all_of(std::begin(opnd4ModelOutput), std::end(opnd4ModelOutput),
+ [](float v) { return v == kFillValue * 2; }));
+ }
+}
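
The helper above relies on the runtime reporting actual output shapes even when a
buffer is too small. A sketch of the usual caller-side recovery, assuming the wrapper
exposes getOutputOperandDimensions() mirroring
ANeuralNetworksExecution_getOutputOperandDimensions():

    // 'e' is a WrapperExecution that has just completed.
    if (e.compute() == Result::OUTPUT_INSUFFICIENT_SIZE) {
        std::vector<uint32_t> dims;
        e.getOutputOperandDimensions(0, &dims);  // reports the true shape of output 0
        // Resize the output buffer from 'dims' and retry on a fresh Execution.
    }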
+
+TEST_F(DynamicTemporariesTest, ModelOutputsSufficientSize) {
+ // The purpose of this test is to confirm that the partitioner and the
+ // runtime can handle a model output of unspecified dimensions but
+ // sufficient size that is written by one partition and read by another.
+
+ ASSERT_NO_FATAL_FAILURE(declareOutputDimensions(/*opnd2ModelAndPartitionOutputSpecified=*/false,
+ /*opnd3PartitionOutputSpecified=*/true,
+ /*opnd4ModelOutputSpecified=*/false));
+ ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+ ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+ ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(true, true));
+}
+
+TEST_F(DynamicTemporariesTest, DynamicTemporariesUnspecifiedOutputs) {
+ // The purpose of this test is to confirm that the partitioner can produce
+ // dynamic temporaries and that the runtime can handle them properly. Note
+ // that all model outputs are of unspecified dimensions but sufficient size.
+
+ ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+ ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+ ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(true, true));
+}
+
+TEST_F(DynamicTemporariesTest, DynamicTemporariesSpecifiedOutputs) {
+ // The purpose of this test is to confirm that the partitioner can produce
+ // dynamic temporaries and that the runtime can handle them properly. Note
+ // that all model outputs are of specified dimensions.
+
+ ASSERT_NO_FATAL_FAILURE(declareOutputDimensions(/*opnd2ModelAndPartitionOutputSpecified=*/true,
+ /*opnd3PartitionOutputSpecified=*/false,
+ /*opnd4ModelOutputSpecified=*/true));
+ ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+ ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+ ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(true, true));
+}
+
+TEST_F(DynamicTemporariesTest, ModelOutputsInsufficientSizeWithDynamicTemporary) {
+    // The purpose of this test is to confirm that the runtime can detect
+    // model outputs of insufficient size in the presence of a dynamic temporary.
+
+ ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+ ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+ ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(false, false));
+}
+
+TEST_F(DynamicTemporariesTest, ModelOutputsInsufficientSizeWithoutDynamicTemporary) {
+    // The purpose of this test is to confirm that the runtime can detect
+    // model outputs of insufficient size in the absence of a dynamic temporary.
+
+ ASSERT_NO_FATAL_FAILURE(declareOutputDimensions(/*opnd2ModelAndPartitionOutputSpecified=*/false,
+ /*opnd3PartitionOutputSpecified=*/true,
+ /*opnd4ModelOutputSpecified=*/false));
+ ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+ ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+ ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(false, false));
+}
+
+TEST_F(DynamicTemporariesTest, ModelOutput2InsufficientSizeWithoutDynamicTemporary) {
+    // The purpose of this test is to confirm that the runtime can detect a
+    // model output (here, only opnd2) of insufficient size in the absence of
+    // a dynamic temporary.
+
+ ASSERT_NO_FATAL_FAILURE(declareOutputDimensions(/*opnd2ModelAndPartitionOutputSpecified=*/false,
+ /*opnd3PartitionOutputSpecified=*/true,
+ /*opnd4ModelOutputSpecified=*/false));
+ ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+ ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+ ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(false, true));
+}
+
+// TODO: enable this test once b/168657259 is fixed
+TEST_F(DynamicTemporariesTest, ModelOutput4InsufficientSizeWithoutDynamicTemporary) {
+    // The purpose of this test is to confirm that the runtime can detect a
+    // model output (here, only opnd4) of insufficient size in the absence of
+    // a dynamic temporary.
+
+ ASSERT_NO_FATAL_FAILURE(declareOutputDimensions(/*opnd2ModelAndPartitionOutputSpecified=*/false,
+ /*opnd3PartitionOutputSpecified=*/true,
+ /*opnd4ModelOutputSpecified=*/false));
+ ASSERT_NO_FATAL_FAILURE(makeModelAndValidate());
+ ASSERT_NO_FATAL_FAILURE(compileModelAndComparePlan());
+ ASSERT_NO_FATAL_FAILURE(executeCompilationAndCompareOutput(true, false));
+}
+
// Test token rehashing during the compilation step.
class CacheTest : public PartitioningTest {
protected:
@@ -2731,8 +3036,8 @@ TEST_F(ControlFlowPartitioningTest, WHILE_SimplePlan) {
void ControlFlowPartitioningTest::testIfUnknownSize(Dimensioned dimensionedMain,
Dimensioned dimensionedThen,
Dimensioned dimensionedElse) {
- if (dimensionedMain == Dimensioned::YES && dimensionedThen == Dimensioned::YES &&
- dimensionedElse == Dimensioned::YES) {
+ if (dimensionedMain != Dimensioned::NO && dimensionedThen != Dimensioned::NO &&
+ dimensionedElse != Dimensioned::NO) {
// No unknown size.
return;
}
@@ -2771,8 +3076,8 @@ TEST_F(ControlFlowPartitioningTest, IF_UnknownSize) {
void ControlFlowPartitioningTest::testWhileUnknownSize(Dimensioned dimensionedMain,
Dimensioned dimensionedCond,
Dimensioned dimensionedBody) {
- if (dimensionedMain == Dimensioned::YES && dimensionedCond == Dimensioned::YES &&
- dimensionedBody == Dimensioned::YES) {
+ if (dimensionedMain != Dimensioned::NO && dimensionedCond != Dimensioned::NO &&
+ dimensionedBody != Dimensioned::NO) {
// No unknown size.
return;
}
diff --git a/nn/runtime/test/TestPartitioningRandom.cpp b/nn/runtime/test/TestPartitioningRandom.cpp
index 968625e48..51d7910cc 100644
--- a/nn/runtime/test/TestPartitioningRandom.cpp
+++ b/nn/runtime/test/TestPartitioningRandom.cpp
@@ -46,6 +46,12 @@
//
// #define VERBOSE VERBOSE
+// Uncomment the following line to generate some debugging output that
+// may be useful to determine test coverage for support of dynamic
+// temporaries (http://b/132458982):
+//
+// #define TRACE_DYNTEMP TRACE_DYNTEMP
+
// We randomly generate tests (model + input data) at runtime, and verify
// that we get the same results whether we do partitioned compilation/execution
// or non-partitioned compilation/execution. We perform a test as follows:
@@ -220,7 +226,7 @@ class TestCompilation : public WrapperCompilation {
using WrapperCompilation::finish;
Result setPartitioning(uint32_t partitioning) {
- return static_cast<Result>(builder()->setPartitioning(partitioning));
+ return static_cast<Result>(builder()->forTest_setPartitioning(partitioning));
}
const ExecutionPlan& getExecutionPlan() const { return builder()->forTest_getExecutionPlan(); }
@@ -751,7 +757,14 @@ TEST_P(RandomPartitioningTest, Test) {
const unsigned problemSize = 1 + randUInt(kMaxProblemSize);
const WrapperOperandType problemType(WrapperType::TENSOR_FLOAT32, {problemSize, problemSize});
- const WrapperOperandType unknownDimensionsType(WrapperType::TENSOR_FLOAT32, {0, 0});
+ const WrapperOperandType unknownDimensionsTypes[] = {
+ {WrapperType::TENSOR_FLOAT32, {}},
+ {WrapperType::TENSOR_FLOAT32, {0, 0}},
+ {WrapperType::TENSOR_FLOAT32, {0, problemSize}},
+ {WrapperType::TENSOR_FLOAT32, {problemSize, 0}},
+ };
+ const unsigned kUnknownDimensionsTypesCount =
+ sizeof(unknownDimensionsTypes) / sizeof(unknownDimensionsTypes[0]);
static const WrapperOperandType activationFunctionType(WrapperType::INT32, {});
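
For reference, the four flavors above decode as follows (in NNAPI, a dimension of 0
means "unspecified"):

    // {}                -> unspecified rank
    // {0, 0}            -> rank 2, both dimensions unspecified
    // {0, problemSize}  -> rank 2, first dimension unspecified
    // {problemSize, 0}  -> rank 2, second dimension unspecified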
@@ -772,10 +785,21 @@ TEST_P(RandomPartitioningTest, Test) {
// joining disjoint subgraphs rather than by forcing a root.
const bool forceCommonRoot = (randFrac() < 0.75);
+ auto computeMode = WrapperExecution::getComputeMode();
+    // We call randFrac() regardless of compute mode, because we don't want
+    // the random number sequence to depend on the compute mode: Compute mode
+    // should only affect how we perform the inference, not how we build the
+    // Model, the Compilation, or the Execution.
+ if (randFrac() < 0.5 && computeMode == WrapperExecution::ComputeMode::ASYNC) {
+ computeMode = WrapperExecution::ComputeMode::FENCED;
+ }
+
TestModel model;
std::vector<uint32_t> modelInputs;
std::vector<uint32_t> modelOutputs;
+ std::set<uint32_t> operandsWithUnknownDimensions;
+
// Each region in weights is a problem-sized 2-D TENSOR_FLOAT32.
TestMemories weights;
@@ -803,11 +827,6 @@ TEST_P(RandomPartitioningTest, Test) {
// operations).
unsigned rootOperationCount = 0;
- // Track if we added operands with unknown dimensions. In this case,
- // partitioned compilation will fail if such an operand is read in a
- // different partition than it is written.
- bool hasUnknownDimensions = false;
-
// Generate operations.
for (unsigned i = 0; i < numOperations; i++) {
const unsigned operationPatternIndex = randUInt(std::size(kOperationPatterns));
@@ -995,19 +1014,21 @@ TEST_P(RandomPartitioningTest, Test) {
// OUTPUTS /////////////////////////////////////////////////////////////////////////////////
std::vector<uint32_t> operationOutputs(operationPattern.mNumOutputs);
- std::generate(operationOutputs.begin(), operationOutputs.end(),
- [&model, &problemType, &unknownDimensionsType, &hasUnknownDimensions,
- allowUnknownDimensions, this] {
- // 3% unknowns causes ~35% of partitionings to fail
- // (determined by commenting out the fallback code,
- // running tests and noting number of failures).
- if (allowUnknownDimensions && randFrac() < 0.03) {
- hasUnknownDimensions = true;
- return model.addOperand(&unknownDimensionsType);
- } else {
- return model.addOperand(&problemType);
- }
- });
+ std::generate(
+ operationOutputs.begin(), operationOutputs.end(),
+ [&operandsWithUnknownDimensions, &model, &problemType, &unknownDimensionsTypes,
+ allowUnknownDimensions, this] {
+ // Before the fix for http://b/132458982, 3% unknowns
+                    // caused ~35% of partitionings to fail.
+ if (allowUnknownDimensions && randFrac() < 0.03) {
+ uint32_t opndIdx = model.addOperand(
+ &unknownDimensionsTypes[randUInt(kUnknownDimensionsTypesCount)]);
+ operandsWithUnknownDimensions.insert(opndIdx);
+ return opndIdx;
+ } else {
+ return model.addOperand(&problemType);
+ }
+ });
// OPERATION ///////////////////////////////////////////////////////////////////////////////
@@ -1090,6 +1111,21 @@ TEST_P(RandomPartitioningTest, Test) {
const auto& outputs = model.getOperationOutputs(randUInt(model.operationCount()));
modelOutputs.push_back(outputs[randUInt(outputs.size())]);
}
+ if (computeMode == WrapperExecution::ComputeMode::FENCED) {
+ if (std::any_of(modelOutputs.begin(), modelOutputs.end(),
+ [&operandsWithUnknownDimensions](uint32_t opndIdx) {
+ return operandsWithUnknownDimensions.count(opndIdx) != 0;
+ })) {
+ // Workaround for http://b/162980246: Fenced execution is documented
+ // as requiring model outputs to have fully specified dimensions,
+ // either from Model or from Execution, but its implementation
+ // requires this to come from Model. This test only guarantees that
+ // they have fully specified dimensions from Execution. So in the
+ // case of a Model where some output does not have fully specified
+ // dimensions, perform asynchronous execution instead.
+ computeMode = WrapperExecution::ComputeMode::ASYNC;
+ }
+ }
model.identifyInputsAndOutputs(modelInputs, modelOutputs);
#ifdef VERBOSE
@@ -1157,37 +1193,49 @@ TEST_P(RandomPartitioningTest, Test) {
// CPU fallback device
devices.push_back(DeviceManager::getCpuDevice());
- // Partitioned compilation.
- // For test cases without unknown intermediate operand sizes we require the
- // partitioning to succeed without CPU fallback. With unknown sizes we
- // retry with a fallback if the non-fallback partitioning fails and require
- // the fallback to succeed.
- TestCompilation cNoFallback(&model, devices);
- TestCompilation cWithFallback(&model, devices);
- TestCompilation* c2 = nullptr;
- ASSERT_EQ(cNoFallback.setPartitioning(DeviceManager::kPartitioningWithoutFallback),
- Result::NO_ERROR);
- auto compilationResult = cNoFallback.finish();
- if (hasUnknownDimensions && compilationResult == Result::OP_FAILED &&
- cNoFallback.getExecutionPlan().forTest_hasStepModelOutputsOfUnknownSize()) {
- ASSERT_EQ(cWithFallback.setPartitioning(DeviceManager::kPartitioningWithFallback),
- Result::NO_ERROR);
- ASSERT_EQ(cWithFallback.finish(), Result::NO_ERROR);
- ASSERT_EQ(cWithFallback.getExecutionPlan().forTest_getKind(), ExecutionPlan::Kind::SIMPLE);
- ASSERT_EQ(cWithFallback.getExecutionPlan().forTest_simpleGetDevice(),
- DeviceManager::getCpuDevice());
- c2 = &cWithFallback;
- } else {
- ASSERT_EQ(compilationResult, Result::NO_ERROR);
- c2 = &cNoFallback;
+ // Partitioned compilation. We require the partitioning to succeed without
+ // CPU fallback.
+ TestCompilation c2(&model, devices);
+ ASSERT_EQ(c2.setPartitioning(DeviceManager::kPartitioningWithoutFallback), Result::NO_ERROR);
+ ASSERT_EQ(c2.finish(), Result::NO_ERROR);
+#ifdef TRACE_DYNTEMP
+ {
+ const ExecutionPlan& plan = c2.getExecutionPlan();
+ const size_t dynamicTemporaryCount = plan.forTest_flatGetDynamicTemporaries().size();
+ std::cout << "TRACE_DYNTEMP: dynamic temporary count = " << dynamicTemporaryCount
+ << std::endl;
+ if (plan.forTest_getKind() == ExecutionPlan::Kind::COMPOUND) {
+ size_t stepsWithModelOutputsThatAreDownstreamInputs = 0;
+ size_t countOfModelOutputsThatAreDownstreamInputs = 0;
+ for (const auto& step : plan.forTest_compoundGetSteps()) {
+ if (const size_t count = step->executionStep()
+ ->getModelOutputsThatAreDownstreamInputs()
+ .size()) {
+ ++stepsWithModelOutputsThatAreDownstreamInputs;
+ countOfModelOutputsThatAreDownstreamInputs += count;
+ }
+ }
+ if (countOfModelOutputsThatAreDownstreamInputs != 0) {
+ std::cout << "TRACE_DYNTEMP: model outputs that are downstream inputs: "
+ << countOfModelOutputsThatAreDownstreamInputs << " / "
+ << modelOutputs.size() << ", over "
+ << stepsWithModelOutputsThatAreDownstreamInputs << " / "
+ << plan.forTest_compoundGetSteps().size() << " steps" << std::endl;
+ EXPECT_LE(countOfModelOutputsThatAreDownstreamInputs, modelOutputs.size());
+ }
+ } else {
+ EXPECT_EQ(dynamicTemporaryCount, size_t(0))
+ << "Only COMPOUND plan should have dynamic temporaries";
+ }
}
+#endif
#ifdef VERBOSE
{
std::cout << "signatures = " << signatures.size() << ", devices = " << devices.size()
<< std::endl;
// TODO: When dumping steps, include non-ExecutionSteps.
- const ExecutionPlan& plan = c2->getExecutionPlan();
+ const ExecutionPlan& plan = c2.getExecutionPlan();
switch (plan.forTest_getKind()) {
case ExecutionPlan::Kind::SIMPLE:
std::cout << "plan: simple" << std::endl;
@@ -1345,7 +1393,7 @@ TEST_P(RandomPartitioningTest, Test) {
// Non-partitioned execution.
WrapperExecution e(&c);
ASSERT_NO_FATAL_FAILURE(prepareForExecution(&e));
- ASSERT_EQ(e.compute(), Result::NO_ERROR);
+ ASSERT_EQ(e.compute(computeMode), Result::NO_ERROR);
// Copy the outputs of the non-partitioned execution to a save area.
std::vector<float> nonPartitionedOutputs(problemSize * problemSize * model.outputCount());
@@ -1376,9 +1424,9 @@ TEST_P(RandomPartitioningTest, Test) {
}
// Partitioned execution.
- WrapperExecution e2(c2);
+ WrapperExecution e2(&c2);
ASSERT_NO_FATAL_FAILURE(prepareForExecution(&e2));
- ASSERT_EQ(e2.compute(), Result::NO_ERROR);
+ ASSERT_EQ(e2.compute(computeMode), Result::NO_ERROR);
// Compare the outputs of the partitioned execution to the save
    // area containing the outputs of the non-partitioned execution.