Port in minimal changes to support BroadcastSub and BroadcastDivandroid-9.0.0_r47 android-9.0.0_r46 android-9.0.0_r45 android-9.0.0_r44 android-9.0.0_r43 android-9.0.0_r42 android-9.0.0_r41 android-9.0.0_r40 android-9.0.0_r39 android-9.0.0_r38 android-9.0.0_r37 android-9.0.0_r36 android-9.0.0_r35 android-9.0.0_r34 android-9.0.0_r33 android-9.0.0_r32 android-9.0.0_r31 android-9.0.0_r30 android-9.0.0_r22 android-9.0.0_r21 android-9.0.0_r20 android-9.0.0_r19 android-9.0.0_r16 android-9.0.0_r12 android-9.0.0_r11 pie-qpr3-s1-release pie-qpr3-release pie-qpr3-b-release pie-qpr2-release pie-qpr1-s3-release pie-qpr1-s2-release pie-qpr1-s1-release pie-qpr1-release pie-dr1-release pie-dr1-dev pie-dev pie-b4s4-release pie-b4s4-dev

Bug: 73661777 Test: mm Test: NeuralNetworksTests Change-Id: I4edfa9e39ef0a28c03de9f725d8eeef716cf88cd
author: Miao Wang <miaowang@google.com> 2018-03-23 14:24:44 -0700
committer: Miao Wang <miaowang@google.com> 2018-03-23 14:24:44 -0700
commit: 6c79676c3e1ce1fae7ca151d116a846f3f68f59c (patch)
tree: 5374170b45a510be77e0b7f90c84b8be7ebb7c0f
parent: 18b73567d561fd5a5f0067ebbc54ca1dd2c2a1ab (diff)
download: tensorflow-pie-qpr1-s2-release.tar.gz
1 files changed, 91 insertions, 0 deletions
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index d4f031e130f..132c9e1e5f0 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1928,6 +1928,51 @@ inline void Div(const float* input1_data, const Dims<4>& input1_dims,
   }
 }
 
+// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastDiv is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastDiv");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] /
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
 // TODO(aselle): This is not actually optimized yet.
 inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
@@ -1955,6 +2000,52 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
     }
   }
 }
+
+// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastSub");
+
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+          output_data[Offset(output_dims, c, x, y, b)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+                  output_activation_min, output_activation_max);
+        }
+      }
+    }
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void Concatenation(int concat_dim, const Scalar* const* input_data,
                    const Dims<4>* const* input_dims, int inputs_count,
author	Miao Wang <miaowang@google.com>	2018-03-23 14:24:44 -0700
committer	Miao Wang <miaowang@google.com>	2018-03-23 14:24:44 -0700
commit	6c79676c3e1ce1fae7ca151d116a846f3f68f59c (patch)
tree	5374170b45a510be77e0b7f90c84b8be7ebb7c0f
parent	18b73567d561fd5a5f0067ebbc54ca1dd2c2a1ab (diff)
download	tensorflow-pie-qpr1-s2-release.tar.gz