author    | Zhi An Ng <zhin@google.com>                    | 2022-08-24 10:38:43 -0700
committer | XNNPACK Team <xnnpack-github-robot@google.com> | 2022-08-24 10:39:49 -0700
commit    | edfac6301d77feb42f4f0562d02251fd02fe4825 (patch)
tree      | 1a4457f746c24a22226e4a47fbef3b4a98679dc4
parent    | a066c3191d564b824a64b78fee498ae5ac48e998 (diff)
download  | XNNPACK-edfac6301d77feb42f4f0562d02251fd02fe4825.tar.gz
Add test for FP16 rewrite when an external output happens to be an input to another node
This can happen when a subgraph is split because an operation is not supported by XNNPACK. The FP16 rewriting code does not account for this: when counting the number of external inputs, it asserts that any node input value with an fp32_id is an external input. This is not always true, since such a value can instead be an external output.
PiperOrigin-RevId: 469766360
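For context, here is a minimal sketch (an illustration, not part of this change) of how such a graph arises through the public subgraph API: value 2 is declared as an external output, produced by a Constant Pad node, and simultaneously consumed by an Add2 node. Shapes and value IDs mirror the new test/subgraph-fp16.cc below; status-code checking is omitted for brevity.

```c++
#include <xnnpack.h>

#include <array>
#include <cstddef>
#include <cstdint>
#include <limits>

int main() {
  // Every xnn_* call returns an xnn_status; error handling is elided here.
  xnn_initialize(/*allocator=*/nullptr);

  xnn_subgraph_t subgraph = nullptr;
  xnn_create_subgraph(/*external_value_ids=*/4, /*flags=*/0, &subgraph);

  const std::array<size_t, 4> input_dims = {1, 2, 2, 3};
  const std::array<size_t, 4> dynamic_dims = {1, 1, 1, 3};
  const std::array<size_t, 4> output_dims = {1, 4, 2, 3};

  uint32_t input_id = XNN_INVALID_VALUE_ID;
  uint32_t dynamic_id = XNN_INVALID_VALUE_ID;
  uint32_t padded_id = XNN_INVALID_VALUE_ID;
  uint32_t sum_id = XNN_INVALID_VALUE_ID;

  xnn_define_tensor_value(subgraph, xnn_datatype_fp32, input_dims.size(), input_dims.data(),
                          /*data=*/nullptr, /*external_id=*/0,
                          XNN_VALUE_FLAG_EXTERNAL_INPUT, &input_id);
  xnn_define_tensor_value(subgraph, xnn_datatype_fp32, dynamic_dims.size(), dynamic_dims.data(),
                          /*data=*/nullptr, /*external_id=*/1, /*flags=*/0, &dynamic_id);
  // Value 2 is declared an external output of this (partitioned) subgraph...
  xnn_define_tensor_value(subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(),
                          /*data=*/nullptr, /*external_id=*/2,
                          XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &padded_id);
  xnn_define_tensor_value(subgraph, xnn_datatype_fp32, output_dims.size(), output_dims.data(),
                          /*data=*/nullptr, /*external_id=*/3,
                          XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &sum_id);

  const std::array<size_t, 4> pre_paddings = {0, 1, 0, 0};
  const std::array<size_t, 4> post_paddings = {0, 1, 0, 0};
  // ...produced by the Constant Pad node...
  xnn_define_static_constant_pad(subgraph, pre_paddings.data(), post_paddings.data(),
                                 /*padding_value=*/0.0f, input_id, padded_id, /*flags=*/0);
  // ...and simultaneously consumed by the Add2 node. Before this fix, the FP16
  // rewrite asserted that any node input with an fp32_id was an external input,
  // which does not hold for value 2 here.
  const float output_min = -std::numeric_limits<float>::infinity();
  const float output_max = +std::numeric_limits<float>::infinity();
  xnn_define_add2(subgraph, output_min, output_max, padded_id, dynamic_id, sum_id, /*flags=*/0);

  xnn_delete_subgraph(subgraph);
  return 0;
}
```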
-rw-r--r-- | BUILD.bazel            | 13
-rwxr-xr-x | CMakeLists.txt         |  5
-rw-r--r-- | src/subgraph.c         | 27
-rw-r--r-- | src/xnnpack/subgraph.h |  2
-rw-r--r-- | test/subgraph-fp16.cc  | 90
-rw-r--r-- | test/subgraph-nchw.cc  |  8
-rw-r--r-- | test/subgraph-tester.h | 20
7 files changed, 148 insertions, 17 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 35c7a89dc..ec2005453 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -15657,6 +15657,19 @@ xnnpack_unit_test(
 )
 
 xnnpack_unit_test(
+    name = "subgraph_fp16_test",
+    srcs = [
+        "test/subgraph-fp16.cc",
+        "test/subgraph-tester.h",
+    ],
+    deps = [
+        ":XNNPACK",
+        ":node_type",
+        ":subgraph_test_mode",
+    ],
+)
+
+xnnpack_unit_test(
     name = "jit_test",
     srcs = [
         "test/jit.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c7cfb7c4e..7ecc493cb 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7655,6 +7655,11 @@ IF(XNNPACK_BUILD_TESTS)
   TARGET_LINK_LIBRARIES(subgraph-nchw-test PRIVATE XNNPACK cache gtest gtest_main microparams_init logging subgraph operators)
   ADD_TEST(NAME subgraph-nchw-test COMMAND subgraph-nchw-test)
 
+  ADD_EXECUTABLE(subgraph-fp16-test test/subgraph-fp16.cc)
+  TARGET_INCLUDE_DIRECTORIES(subgraph-fp16-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(subgraph-fp16-test PRIVATE XNNPACK cache gtest gtest_main microparams_init logging subgraph operators)
+  ADD_TEST(NAME subgraph-fp16-test COMMAND subgraph-fp16-test)
+
   # ---[ Build subgraph-level unit tests
   ADD_EXECUTABLE(workspace-test test/workspace.cc)
   TARGET_INCLUDE_DIRECTORIES(workspace-test PRIVATE src test)
diff --git a/src/subgraph.c b/src/subgraph.c
index 029866c79..908132e73 100644
--- a/src/subgraph.c
+++ b/src/subgraph.c
@@ -887,8 +887,11 @@ bool xnn_subgraph_rewrite_for_fp16(xnn_subgraph_t subgraph)
         assert(value->data == NULL);
         assert(value->datatype == xnn_datatype_fp16);
         assert(subgraph->values[value->fp32_id].datatype == xnn_datatype_fp32);
-        assert(subgraph->values[value->fp32_id].flags & XNN_VALUE_FLAG_EXTERNAL_INPUT);
-        num_external_inputs += 1;
+        // This value isn't always an external input; it can be an external output of the current subgraph (due to
+        // partitioning) that is simultaneously consumed by the current node.
+        if (subgraph->values[value->fp32_id].flags & XNN_VALUE_FLAG_EXTERNAL_INPUT) {
+          num_external_inputs += 1;
+        }
       }
     }
     for (uint32_t o = 0; o < node->num_outputs; o++) {
@@ -935,14 +938,18 @@ bool xnn_subgraph_rewrite_for_fp16(xnn_subgraph_t subgraph)
     for (uint32_t i = 0; i < node->num_inputs; i++) {
       const struct xnn_value* value = &subgraph->values[node->inputs[i]];
       if (value->fp32_id != XNN_INVALID_VALUE_ID && value->first_consumer == n - 1) {
-        xnn_log_debug("Inserted FP32->FP16 Convert Node from tensor #%"PRIu32" to tensor #%"PRIu32,
-          value->fp32_id, value->id);
-        const uint32_t output_node_id = output_node->id;
-        assert(output_node >= subgraph->nodes);
-        xnn_node_clear(output_node);
-        output_node->id = output_node_id;
-        xnn_init_convert_node(output_node, xnn_compute_type_fp32_to_fp16, value->fp32_id, value->id, 0 /* flags */);
-        output_node -= 1;
+        // Only insert a Convert node if the value actually is an external input. The value could instead be an
+        // external output; in that case we already inserted a Convert node for it in the loop over outputs above.
+        if (subgraph->values[value->fp32_id].flags & XNN_VALUE_FLAG_EXTERNAL_INPUT) {
+          xnn_log_debug("Inserted FP32->FP16 Convert Node from tensor #%"PRIu32" to tensor #%"PRIu32,
+            value->fp32_id, value->id);
+          const uint32_t output_node_id = output_node->id;
+          assert(output_node >= subgraph->nodes);
+          xnn_node_clear(output_node);
+          output_node->id = output_node_id;
+          xnn_init_convert_node(output_node, xnn_compute_type_fp32_to_fp16, value->fp32_id, value->id, 0 /* flags */);
+          output_node -= 1;
+        }
       }
     }
   }
diff --git a/src/xnnpack/subgraph.h b/src/xnnpack/subgraph.h
index 7da92c812..a0ba588a6 100644
--- a/src/xnnpack/subgraph.h
+++ b/src/xnnpack/subgraph.h
@@ -373,6 +373,8 @@ size_t xnn_shape_multiply_non_channel_dims(
 enum xnn_status xnn_subgraph_optimize(xnn_subgraph_t subgraph, uint32_t flags);
 
 void xnn_subgraph_rewrite_for_nchw(xnn_subgraph_t subgraph);
+// Rewrites the subgraph for FP16; returns true on success, false if the rewrite failed.
+bool xnn_subgraph_rewrite_for_fp16(xnn_subgraph_t subgraph);
 
 void xnn_node_clear(struct xnn_node* node);
 void xnn_value_clear(struct xnn_value* value);
diff --git a/test/subgraph-fp16.cc b/test/subgraph-fp16.cc
new file mode 100644
index 000000000..88e587be8
--- /dev/null
+++ b/test/subgraph-fp16.cc
@@ -0,0 +1,90 @@
+// Copyright 2022 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <array>
+
+#include <xnnpack.h>
+#include <xnnpack/node-type.h>
+
+#include "subgraph-tester.h"
+#include <gtest/gtest.h>
+
+namespace xnnpack {
+
+TEST(SUBGRAPH_FP16, value_both_external_output_and_input) {
+  auto tester = SubgraphTester(4);
+  std::array<size_t, 4> pre_paddings = {0, 1, 0, 0};
+  std::array<size_t, 4> post_paddings = {0, 1, 0, 0};
+  // external input[0]
+  //        |
+  //  [constant pad]
+  //        |
+  //    external        dynamic[1]
+  //    output[2]      /
+  //         \        /
+  //          [add]
+  //            |
+  //        external
+  //        output[3]
+  tester
+      .AddInputTensorF32({1, 2, 2, 3}, 0)
+      .AddDynamicTensorF32({1, 1, 1, 3}, 1)
+      .AddOutputTensorF32({1, 4, 2, 3}, 2)
+      .AddOutputTensorF32({1, 4, 2, 3}, 3)
+      .AddConstantPad(pre_paddings.data(), post_paddings.data(), 0.0f, 0, 2)
+      .AddAddition(2, 1, 3)
+      .Optimize()
+      .RewriteForFp16();
+
+  // After rewriting for FP16, the graph should look like this, with * marking
+  // newly created operators and values:
+  //
+  // external input[0]
+  //        |
+  //    [convert]*
+  //        |
+  //     input[4]*
+  //        |
+  //  [constant pad]
+  //        |
+  //  fp16 value[5]*
+  //    |       \
+  // [convert]*  \
+  //    |         \
+  // external      \    dynamic[1] (converted in-place)
+  // output[2]      \    /
+  //                [add]
+  //                  |
+  //           fp16 value[6]*
+  //                  |
+  //             [convert]*
+  //                  |
+  //              external
+  //              output[3]
+
+  // We should have 3 Convert nodes: one for the external input and two for the
+  // external outputs. Together with the Pad and Add from the original graph,
+  // that makes 5 nodes in total.
+  ASSERT_EQ(tester.NumNodes(), 5);
+
+  const xnn_node* output_convert_node = tester.Node(4);
+  ASSERT_EQ(output_convert_node->type, xnn_node_type_convert);
+  ASSERT_EQ(output_convert_node->compute_type, xnn_compute_type_fp16_to_fp32);
+
+  // Check that the Addition node refers to the FP16 values before conversion.
+  const xnn_node* addition_node = tester.Node(3);
+  ASSERT_EQ(addition_node->type, xnn_node_type_add2);
+  ASSERT_EQ(addition_node->inputs[0], 5);
+  ASSERT_EQ(addition_node->inputs[1], 1);
+  ASSERT_EQ(tester.Value(5)->datatype, xnn_datatype_fp16);
+  ASSERT_EQ(tester.Value(1)->datatype, xnn_datatype_fp16);
+
+  ASSERT_EQ(tester.Node(2)->type, xnn_node_type_convert);
+  ASSERT_EQ(tester.Node(2)->compute_type, xnn_compute_type_fp16_to_fp32);
+  ASSERT_EQ(tester.Node(1)->type, xnn_node_type_static_constant_pad);
+  ASSERT_EQ(tester.Node(0)->type, xnn_node_type_convert);
+  ASSERT_EQ(tester.Node(0)->compute_type, xnn_compute_type_fp32_to_fp16);
+}
+
+}  // namespace xnnpack
diff --git a/test/subgraph-nchw.cc b/test/subgraph-nchw.cc
index c0e9d600c..04fd3c492 100644
--- a/test/subgraph-nchw.cc
+++ b/test/subgraph-nchw.cc
@@ -28,7 +28,7 @@ TEST(SUBGRAPH_NCHW, single_conv) {
           /*group_output_channels=*/ 32,
         }, 0, 1, 2, 3)
     .Optimize()
-    .Rewrite();
+    .RewriteForNchw();
 
   ASSERT_EQ(tester.GetLayout(0), xnn_layout_type_nhwc);
   ASSERT_EQ(tester.GetLayout(3), xnn_layout_type_nhwc);
@@ -54,7 +54,7 @@ TEST(SUBGRAPH_NCHW, single_conv_and_global_average_pooling) {
         }, 0, 1, 2, 3)
     .AddGlobalAveragePooling(3, 4)
     .Optimize()
-    .Rewrite();
+    .RewriteForNchw();
 
   ASSERT_EQ(tester.GetLayout(0), xnn_layout_type_nhwc);
   ASSERT_EQ(tester.GetLayout(3), xnn_layout_type_nhwc);
@@ -94,7 +94,7 @@ TEST(SUBGRAPH_NCHW, pixelwise_conv_sandwich) {
         }, 3, 4, 5, 6)
     .AddGlobalAveragePooling(6, 7)
     .Optimize()
-    .Rewrite();
+    .RewriteForNchw();
 
   ASSERT_EQ(tester.GetLayout(0), xnn_layout_type_nhwc);
   ASSERT_EQ(tester.GetLayout(3), xnn_layout_type_nchw);
@@ -163,7 +163,7 @@ TEST(SUBGRAPH_NCHW, bottleneck) {
     .AddAddition(3, 12, 13)
     .AddGlobalAveragePooling(13, 14)
     .Optimize()
-    .Rewrite();
+    .RewriteForNchw();
 
   ASSERT_EQ(tester.GetLayout(0), xnn_layout_type_nhwc);
   ASSERT_EQ(tester.GetLayout(3), xnn_layout_type_nchw);
diff --git a/test/subgraph-tester.h b/test/subgraph-tester.h
index 30b58300c..fd525c114 100644
--- a/test/subgraph-tester.h
+++ b/test/subgraph-tester.h
@@ -349,18 +349,32 @@ class SubgraphTester {
     return *this;
   }
 
-  inline SubgraphTester& Rewrite() {
+  inline SubgraphTester& RewriteForNchw() {
     xnn_subgraph_rewrite_for_nchw(subgraph_.get());
 
     return *this;
   }
 
+  inline SubgraphTester& RewriteForFp16() {
+    EXPECT_TRUE(xnn_subgraph_rewrite_for_fp16(subgraph_.get()));
+
+    return *this;
+  }
+
   inline xnn_layout_type GetLayout(uint32_t value_id) const {
     return subgraph_->values[value_id].layout;
   }
 
-  inline const xnn_node* const Node(uint32_t node_index) const {
-    return &subgraph_->nodes[node_index];
+  inline const xnn_value* const Value(uint32_t value_id) const {
+    return &subgraph_->values[value_id];
+  }
+
+  inline const xnn_node* const Node(uint32_t node_id) const {
+    return &subgraph_->nodes[node_id];
+  }
+
+  inline size_t NumNodes() const {
+    return subgraph_->num_nodes;
   }
 
  protected:
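With the build changes above, the new test can be run as `bazel test //:subgraph_fp16_test` under Bazel, or (assuming a CMake build configured with XNNPACK_BUILD_TESTS) via `ctest -R subgraph-fp16-test`.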