1 files changed, 81 insertions, 3 deletions
diff --git a/test/test.cc b/test/test.cc
index eee16b4..735ad1e 100644
--- a/test/test.cc
+++ b/test/test.cc
@@ -1277,6 +1277,47 @@ void TestOutputStages(int rows, int depth, int cols, int result_offset,
     }
   }
 
+  // Test a variant of the familiar default pipeline consisting of quantize-down
+  // and clamp-and-cast-to-int16.
+  OutputStageSaturatingCastToInt16 saturating_cast_int16_stage;
+  auto quantize_down_and_saturating_cast_int16_pipeline =
+      std::make_tuple(quantize_down_stage, saturating_cast_int16_stage);
+  Matrix<std::int16_t, ResultOrder> result_quantized_down_saturated_int16(rows,
+                                                                          cols);
+  GemmWithOutputPipeline<std::uint8_t, std::int16_t, DefaultL8R8BitDepthParams>(
+      &context, lhs.const_map(), rhs.const_map(),
+      &result_quantized_down_saturated_int16, lhs_offset, rhs_offset,
+      quantize_down_and_saturating_cast_int16_pipeline);
+
+  for (int r = 0; r < rows; r++) {  // Reference: saturate to int16 range.
+    for (int c = 0; c < cols; c++) {  // No std::min/max: int32_t may be long.
+      const std::int32_t q = result_quantized_down_int32(r, c);
+      std::int32_t expected = q < -32768 ? -32768 : (q > 32767 ? 32767 : q);
+      Check(expected == result_quantized_down_saturated_int16(r, c));
+    }
+  }
+
+#ifdef GEMMLOWP_MSA  // Compiled only when GEMMLOWP_MSA is defined.
+  // Test a pipeline consisting of quantize-down and truncating-cast-to-uint8.
+  OutputStageTruncatingCastToUint8 truncating_cast_stage;
+  auto quantize_down_and_truncating_cast_pipeline =
+      std::make_tuple(quantize_down_stage, truncating_cast_stage);
+  Matrix<std::uint8_t, ResultOrder> result_quantized_down_truncated_uint8(
+      rows, cols);
+  GemmWithOutputPipeline<std::uint8_t, std::uint8_t, DefaultL8R8BitDepthParams>(
+      &context, lhs.const_map(), rhs.const_map(),
+      &result_quantized_down_truncated_uint8, lhs_offset, rhs_offset,
+      quantize_down_and_truncating_cast_pipeline);
+
+  for (int r = 0; r < rows; r++) {
+    for (int c = 0; c < cols; c++) {
+      std::int32_t quantized = result_quantized_down_int32(r, c);
+      std::uint8_t expected = quantized & 255;  // Keep only the low 8 bits.
+      Check(expected == result_quantized_down_truncated_uint8(r, c));
+    }
+  }
+#endif  // GEMMLOWP_MSA
+
   // Test a bias-addition with row-vector
   std::vector<std::int32_t> row_vector_data(cols);
   std::uniform_int_distribution<std::int32_t> uniform_minus_500_plus_500(-500,
@@ -1428,8 +1469,8 @@ void TestOutputStages(int rows, int depth, int cols, int result_offset,
     result_fixedpoint_shift++;
   }
   Check(result_fixedpoint_shift >= 0);
-  // Now test OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
-  OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
+  // Now test OutputStageQuantizeDownInt32ByFixedPoint
+  OutputStageQuantizeDownInt32ByFixedPoint
       quantize_down_by_fixedpoint_stage;
   quantize_down_by_fixedpoint_stage.result_offset_after_shift =
       static_cast<std::int32_t>(
@@ -1447,7 +1488,6 @@ void TestOutputStages(int rows, int depth, int cols, int result_offset,
       &result_quantized_down_by_fixedpoint_int32, lhs_offset, rhs_offset,
       quantize_down_by_fixedpoint_pipeline);
 
-  std::vector<std::int32_t> diffs_caused_by_fixedpoint;
   for (int r = 0; r < rows; r++) {
     for (int c = 0; c < cols; c++) {
       const std::int32_t actual =
@@ -1462,6 +1502,44 @@ void TestOutputStages(int rows, int depth, int cols, int result_offset,
     }
   }
 
+  // Test OutputStageScaleInt32ByFixedPointAndExponent
+  for (int exponent = -2; exponent <= 2; exponent++) {
+    // Reference model: left-shift before the multiply, right-shift after it.
+    const int left_shift = std::max(0, exponent);
+    const int right_shift = std::max(0, -exponent);
+    OutputStageScaleInt32ByFixedPointAndExponent
+        scale_by_fixedpoint_and_exponent_stage;
+    // Cast one operand so the whole offset product is evaluated in double.
+    scale_by_fixedpoint_and_exponent_stage.result_offset_after_shift =
+        static_cast<std::int32_t>(
+            round(static_cast<double>(result_offset) * result_mult_int *
+                  std::pow(2.0, exponent)));
+    scale_by_fixedpoint_and_exponent_stage.result_fixedpoint_multiplier =
+        result_fixedpoint_multiplier;
+    scale_by_fixedpoint_and_exponent_stage.result_exponent = exponent;
+    auto scale_by_fixedpoint_and_exponent_pipeline =
+        std::make_tuple(scale_by_fixedpoint_and_exponent_stage);
+    Matrix<std::int32_t, ResultOrder> result_scaled_int32(rows, cols);
+    GemmWithOutputPipeline<std::uint8_t, std::int32_t,
+                           DefaultL8R8BitDepthParams>(
+        &context, lhs.const_map(), rhs.const_map(), &result_scaled_int32,
+        lhs_offset, rhs_offset, scale_by_fixedpoint_and_exponent_pipeline);
+
+    for (int r = 0; r < rows; r++) {
+      for (int c = 0; c < cols; c++) {
+        const std::int32_t actual = result_scaled_int32(r, c);
+        const std::int32_t raw = result_raw_int32(r, c);
+        const std::int32_t expected =
+            scale_by_fixedpoint_and_exponent_stage.result_offset_after_shift +
+            RoundingDivideByPOT(
+                SaturatingRoundingDoublingHighMul((1 << left_shift) * raw,
+                                                  result_fixedpoint_multiplier),
+                right_shift);
+        Check(actual == expected);
+      }
+    }
+  }
+
   // Test the variant of the familiar default pipeline consisting of
   // quantize-down and
   // clamp-and-cast-to-uint8, where we used fixedpoint multipliers for the