author     Miao Wang <miaowang@google.com>  2018-03-23 14:24:44 -0700
committer  Miao Wang <miaowang@google.com>  2018-03-23 14:24:44 -0700
commit     6c79676c3e1ce1fae7ca151d116a846f3f68f59c (patch)
tree       5374170b45a510be77e0b7f90c84b8be7ebb7c0f
parent     18b73567d561fd5a5f0067ebbc54ca1dd2c2a1ab (diff)
download   tensorflow-pie-qpr1-s2-release.tar.gz
Bug: 73661777
Test: mm
Test: NeuralNetworksTests
Change-Id: I4edfa9e39ef0a28c03de9f725d8eeef716cf88cd
-rw-r--r--  tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h  91
1 file changed, 91 insertions(+), 0 deletions(-)
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index d4f031e130f..132c9e1e5f0 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1928,6 +1928,51 @@ inline void Div(const float* input1_data, const Dims<4>& input1_dims,
}
}
+// TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastDiv is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, const Dims<4>& input2_dims,
+ T output_activation_min, T output_activation_max,
+ T* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastDiv");
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+ // In Tensorflow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their Tensorflow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ ActivationFunctionWithMinMax(
+ input1_data[SubscriptToIndex(desc1, c, x, y, b)] /
+ input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+ output_activation_min, output_activation_max);
+ }
+ }
+ }
+ }
+}
+
// TODO(aselle): This is not actually optimized yet.
inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
const float* input2_data, const Dims<4>& input2_dims,
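
The loop nest above leans on NdArrayDescsForElementwiseBroadcast and SubscriptToIndex to make broadcasting free at the indexing level: a dimension of extent 1 in one input is given a stride of 0, so stepping along that dimension re-reads the same element. As a rough illustration, here is a minimal, self-contained C++ sketch of that scheme; Desc4, DescsForBroadcast, and SubToIndex are simplified stand-ins invented for this example, not the real declarations in types.h.

#include <array>
#include <cstdio>

// Simplified stand-in for TFLite's NdArrayDesc<4>. Dimension 0 is the
// fastest-varying (channel) dimension, matching the comments in the diff.
struct Desc4 {
  std::array<int, 4> extents;  // sizes per dimension
  std::array<int, 4> strides;  // element strides; 0 on broadcast dimensions
};

// Build descriptors so that a dimension of extent 1 in one input is
// broadcast against the other input by giving it stride 0.
void DescsForBroadcast(const std::array<int, 4>& dims1,
                       const std::array<int, 4>& dims2,
                       Desc4* desc1, Desc4* desc2) {
  int stride1 = 1, stride2 = 1;
  for (int i = 0; i < 4; ++i) {
    desc1->extents[i] = dims1[i];
    desc2->extents[i] = dims2[i];
    desc1->strides[i] = stride1;
    desc2->strides[i] = stride2;
    stride1 *= dims1[i];
    stride2 *= dims2[i];
  }
  for (int i = 0; i < 4; ++i) {
    if (dims1[i] == 1 && dims2[i] > 1) desc1->strides[i] = 0;
    if (dims2[i] == 1 && dims1[i] > 1) desc2->strides[i] = 0;
  }
}

inline int SubToIndex(const Desc4& d, int c, int x, int y, int b) {
  return c * d.strides[0] + x * d.strides[1] + y * d.strides[2] +
         b * d.strides[3];
}

int main() {
  // A tensor with extents {3, 2, 1, 1} divided by one with {3, 1, 1, 1}:
  // the width-1 second input is reused for both columns of the first.
  const float in1[] = {6, 8, 10, 3, 4, 5};
  const float in2[] = {3, 2, 5};
  Desc4 d1, d2;
  DescsForBroadcast({3, 2, 1, 1}, {3, 1, 1, 1}, &d1, &d2);
  for (int x = 0; x < 2; ++x)
    for (int c = 0; c < 3; ++c)
      std::printf("%g ", in1[SubToIndex(d1, c, x, 0, 0)] /
                             in2[SubToIndex(d2, c, x, 0, 0)]);
  std::printf("\n");
  return 0;
}

Compiled and run, this prints "2 4 2 1 2 1": the stride of 0 on the second input's width dimension makes its three channel values apply to both columns, which is exactly how the BroadcastDiv loop nest above reuses elements.
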
@@ -1955,6 +2000,52 @@ inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
}
}
}
+
+// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
+// dimensionality if the runtime code does a single loop over one dimension
+// that handles broadcasting as the base case. The code generator would then
+// generate max(D1, D2) nested for loops.
+// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+template <typename T>
+void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
+ const T* input2_data, const Dims<4>& input2_dims,
+ T output_activation_min, T output_activation_max,
+ T* output_data, const Dims<4>& output_dims) {
+ gemmlowp::ScopedProfilingLabel label("BroadcastSub");
+
+ NdArrayDesc<4> desc1;
+ NdArrayDesc<4> desc2;
+ NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+
+ // In Tensorflow, the dimensions are canonically named (batch_number, row,
+ // col, channel), with extents (batches, height, width, depth), with the
+ // trailing dimension changing most rapidly (channels has the smallest stride,
+ // typically 1 element).
+ //
+ // In generated C code, we store arrays with the dimensions reversed. The
+ // first dimension has smallest stride.
+ //
+ // We name our variables by their Tensorflow convention, but generate C code
+ // nesting loops such that the innermost loop has the smallest stride for the
+ // best cache behavior.
+ for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
+ for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
+ for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
+ for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+ output_data[Offset(output_dims, c, x, y, b)] =
+ ActivationFunctionWithMinMax(
+ input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
+ input2_data[SubscriptToIndex(desc2, c, x, y, b)],
+ output_activation_min, output_activation_max);
+ }
+ }
+ }
+ }
+}
+
template <FusedActivationFunctionType Ac, typename Scalar>
void Concatenation(int concat_dim, const Scalar* const* input_data,
const Dims<4>* const* input_dims, int inputs_count,
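
Both TODO(jiawen) notes above describe the same generalization: instead of hard-coding four loops over Dims<4>, a code generator would emit max(D1, D2) nested loops, with a single loop over one dimension, broadcasting included, as the runtime base case. What follows is a hedged C++ sketch of that idea, not TFLite code; NdDesc and BroadcastBinaryOp are hypothetical names invented for this illustration.

#include <vector>

// Hypothetical generalized descriptor: one entry per dimension, outermost
// first, with a stride of 0 on broadcast dimensions (the same trick as in
// the Desc4 sketch earlier). Assumes rank >= 1.
struct NdDesc {
  std::vector<int> extents;  // per-dimension sizes
  std::vector<int> strides;  // element strides; 0 on broadcast dimensions
};

// Each recursion level plays the role of one of the max(D1, D2) nested
// loops the code generator would emit; the base case is a single flat loop
// over the innermost (smallest-stride) dimension.
template <typename T, typename Op>
void BroadcastBinaryOp(const T* in1, const NdDesc& d1, const T* in2,
                       const NdDesc& d2, T* out, const NdDesc& dout, Op op,
                       int dim = 0) {
  const int last = static_cast<int>(dout.extents.size()) - 1;
  if (dim == last) {
    // Base case: one loop over the fastest-varying dimension, with
    // broadcasting handled by zero strides in the input descriptors.
    for (int i = 0; i < dout.extents[dim]; ++i) {
      out[i * dout.strides[dim]] =
          op(in1[i * d1.strides[dim]], in2[i * d2.strides[dim]]);
    }
    return;
  }
  // Outer dimensions: advance each pointer by its own stride, so a
  // broadcast input (stride 0) re-reads the same slice every iteration.
  for (int i = 0; i < dout.extents[dim]; ++i) {
    BroadcastBinaryOp(in1 + i * d1.strides[dim], d1,
                      in2 + i * d2.strides[dim], d2,
                      out + i * dout.strides[dim], dout, op, dim + 1);
  }
}

With op bound to a division or subtraction lambda (plus the activation clamp) and 4-entry descriptors, this reproduces the BroadcastDiv and BroadcastSub loop nests above, but the rank is no longer fixed at compile time.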