1 files changed, 205 insertions, 104 deletions
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index ca1062a76..587cec692 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -67,6 +67,45 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count,
   fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan);
 }
 
+void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
+                          int16_t *quant, int16_t *quant_shift,
+                          int16_t *dequant, int16_t *round_fp,
+                          int16_t *quant_fp) {  // Fills 8 entries per array.
+  // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V.
+  constexpr int kMaxQRoundingFactorFp = 64;
+
+  for (int j = 0; j < 2; j++) {  // Entries 0 and 1 get independent draws.
+    // The range is 4 to 1828 in the VP9 tables.
+    const int qlookup = rnd->RandRange(1825) + 4;
+    round_fp[j] = (kMaxQRoundingFactorFp * qlookup) >> 7;
+    quant_fp[j] = (1 << 16) / qlookup;
+
+    // Values determined by deconstructing vp9_init_quantizer().
+    // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
+    // values or U/V values of any bit depth. This is because y_delta is not
+    // factored into the vp9_ac_quant() call.
+    zbin[j] = rnd->RandRange(1200);
+
+    // round may be up to 685 for Y values or 914 for U/V.
+    round[j] = rnd->RandRange(914);
+    // quant ranges from 1 to -32703
+    quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703;
+    // quant_shift goes up to 1 << 16. NOTE(review): 16384 is 1 << 14; confirm.
+    quant_shift[j] = rnd->RandRange(16384);
+    // dequant maxes out at 1828 for all cases.
+    dequant[j] = rnd->RandRange(1828);
+  }
+  for (int j = 2; j < 8; j++) {  // Entries 2..7 replicate entry 1.
+    zbin[j] = zbin[1];
+    round_fp[j] = round_fp[1];
+    quant_fp[j] = quant_fp[1];
+    round[j] = round[1];
+    quant[j] = quant[1];
+    quant_shift[j] = quant_shift[1];
+    dequant[j] = dequant[1];
+  }
+}
+
 class VP9QuantizeBase : public AbstractBench {
  public:
   VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
@@ -148,6 +187,7 @@ class VP9QuantizeTest : public VP9QuantizeBase,
 
  protected:
   virtual void Run();
+  void Speed(bool is_median);
   const QuantizeFunc quantize_op_;
   const QuantizeFunc ref_quantize_op_;
 };
@@ -159,6 +199,101 @@ void VP9QuantizeTest::Run() {
                scan_->iscan);
 }
 
+void VP9QuantizeTest::Speed(bool is_median) {  // Shared speed-test body.
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  ASSERT_TRUE(coeff_.Init());
+  ASSERT_TRUE(qcoeff_.Init());
+  ASSERT_TRUE(dqcoeff_.Init());
+  TX_SIZE starting_sz, ending_sz;
+
+  if (max_size_ == 16) {  // Sweep 4x4 through 16x16.
+    starting_sz = TX_4X4;
+    ending_sz = TX_16X16;
+  } else {  // Otherwise only 32x32 is exercised.
+    starting_sz = TX_32X32;
+    ending_sz = TX_32X32;
+  }
+
+  for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
+    // Two passes: zbin > |coeff| (i == 0), then zbin < coeff range (i == 1).
+    for (int i = 0; i < 2; ++i) {
+      // TX_TYPE defines the scan order. That is not relevant to the speed test.
+      // Pick the first one.
+      const TX_TYPE tx_type = DCT_DCT;
+      count_ = (4 << sz) * (4 << sz);
+      scan_ = &vp9_scan_orders[sz][tx_type];
+
+      GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+                           quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+                           quant_fp_ptr_);
+
+      if (i == 0) {
+        // When |coeff values| are less than zbin the results are 0.
+        int threshold = 100;
+        if (max_size_ == 32) {
+          // For 32x32, the threshold is halved. Double it to keep the values
+          // from clearing it.
+          threshold = 200;
+        }
+        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
+        coeff_.Set(&rnd, -99, 99);
+      } else if (i == 1) {
+        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
+        coeff_.Set(&rnd, -500, 500);
+      }
+
+      const char *type =
+          (i == 0) ? "Bypass calculations " : "Full calculations ";
+      char block_size[16];
+      snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz);
+      char title[100];
+      snprintf(title, sizeof(title), "%25s %8s ", type, block_size);
+
+      if (is_median) {  // Uses the RunNTimes()/PrintMedian() helpers.
+        RunNTimes(10000000 / count_);
+        PrintMedian(title);
+      } else {  // Time ref_quantize_op_ against quantize_op_.
+        Buffer<tran_low_t> ref_qcoeff =
+            Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+        ASSERT_TRUE(ref_qcoeff.Init());
+        Buffer<tran_low_t> ref_dqcoeff =
+            Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+        ASSERT_TRUE(ref_dqcoeff.Init());
+        uint16_t ref_eob = 0;
+
+        const int kNumTests = 5000000;  // Iterations per timed loop.
+        vpx_usec_timer timer, simd_timer;
+
+        vpx_usec_timer_start(&timer);
+        for (int n = 0; n < kNumTests; ++n) {
+          ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_,
+                           q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+                           ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
+                           scan_->scan, scan_->iscan);
+        }
+        vpx_usec_timer_mark(&timer);
+
+        vpx_usec_timer_start(&simd_timer);
+        for (int n = 0; n < kNumTests; ++n) {
+          quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
+                       quant_shift_ptr_, qcoeff_.TopLeftPixel(),
+                       dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_,
+                       scan_->scan, scan_->iscan);
+        }
+        vpx_usec_timer_mark(&simd_timer);
+
+        const int elapsed_time =
+            static_cast<int>(vpx_usec_timer_elapsed(&timer));
+        const int simd_elapsed_time =
+            static_cast<int>(vpx_usec_timer_elapsed(&simd_timer));
+        printf("%s c_time = %d \t simd_time = %d \t Gain = %f \n", title,
+               elapsed_time, simd_elapsed_time,
+               static_cast<float>(elapsed_time) / simd_elapsed_time);
+      }
+    }
+  }
+}
+
 // This quantizer compares the AC coefficients to the quantization step size to
 // determine if further multiplication operations are needed.
 // Based on vp9_quantize_fp_sse2().
@@ -254,45 +389,6 @@ void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
 }
 
-void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
-                          int16_t *quant, int16_t *quant_shift,
-                          int16_t *dequant, int16_t *round_fp,
-                          int16_t *quant_fp) {
-  // Max when q == 0.  Otherwise, it is 48 for Y and 42 for U/V.
-  const int max_qrounding_factor_fp = 64;
-
-  for (int j = 0; j < 2; j++) {
-    // The range is 4 to 1828 in the VP9 tables.
-    const int qlookup = rnd->RandRange(1825) + 4;
-    round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7;
-    quant_fp[j] = (1 << 16) / qlookup;
-
-    // Values determined by deconstructing vp9_init_quantizer().
-    // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
-    // values or U/V values of any bit depth. This is because y_delta is not
-    // factored into the vp9_ac_quant() call.
-    zbin[j] = rnd->RandRange(1200);
-
-    // round may be up to 685 for Y values or 914 for U/V.
-    round[j] = rnd->RandRange(914);
-    // quant ranges from 1 to -32703
-    quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703;
-    // quant_shift goes up to 1 << 16.
-    quant_shift[j] = rnd->RandRange(16384);
-    // dequant maxes out at 1828 for all cases.
-    dequant[j] = rnd->RandRange(1828);
-  }
-  for (int j = 2; j < 8; j++) {
-    zbin[j] = zbin[1];
-    round_fp[j] = round_fp[1];
-    quant_fp[j] = quant_fp[1];
-    round[j] = round[1];
-    quant[j] = quant[1];
-    quant_shift[j] = quant_shift[1];
-    dequant[j] = dequant[1];
-  }
-}
-
 TEST_P(VP9QuantizeTest, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   ASSERT_TRUE(coeff_.Init());
@@ -403,60 +499,9 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
   }
 }
 
-TEST_P(VP9QuantizeTest, DISABLED_Speed) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  ASSERT_TRUE(coeff_.Init());
-  ASSERT_TRUE(qcoeff_.Init());
-  ASSERT_TRUE(dqcoeff_.Init());
-  TX_SIZE starting_sz, ending_sz;
-
-  if (max_size_ == 16) {
-    starting_sz = TX_4X4;
-    ending_sz = TX_16X16;
-  } else {
-    starting_sz = TX_32X32;
-    ending_sz = TX_32X32;
-  }
-
-  for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
-    // zbin > coeff, zbin < coeff.
-    for (int i = 0; i < 2; ++i) {
-      // TX_TYPE defines the scan order. That is not relevant to the speed test.
-      // Pick the first one.
-      const TX_TYPE tx_type = DCT_DCT;
-      count_ = (4 << sz) * (4 << sz);
-      scan_ = &vp9_scan_orders[sz][tx_type];
-
-      GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
-                           quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
-                           quant_fp_ptr_);
-
-      if (i == 0) {
-        // When |coeff values| are less than zbin the results are 0.
-        int threshold = 100;
-        if (max_size_ == 32) {
-          // For 32x32, the threshold is halved. Double it to keep the values
-          // from clearing it.
-          threshold = 200;
-        }
-        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
-        coeff_.Set(&rnd, -99, 99);
-      } else if (i == 1) {
-        for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
-        coeff_.Set(&rnd, -500, 500);
-      }
+TEST_P(VP9QuantizeTest, DISABLED_Speed) { Speed(false); }  // C vs. SIMD timing
 
-      RunNTimes(10000000 / count_);
-      const char *type =
-          (i == 0) ? "Bypass calculations " : "Full calculations ";
-      char block_size[16];
-      snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz);
-      char title[100];
-      snprintf(title, sizeof(title), "%25s %8s ", type, block_size);
-      PrintMedian(title);
-    }
-  }
-}
+TEST_P(VP9QuantizeTest, DISABLED_SpeedMedian) { Speed(true); }  // median mode
 
 using std::make_tuple;
 
@@ -467,6 +512,8 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Values(
         make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16,
                    false),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
+                   &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
         make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
                    VPX_BITS_8, 16, false),
         make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
@@ -492,7 +539,6 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
-#if VPX_ARCH_X86_64
 INSTANTIATE_TEST_SUITE_P(
     SSSE3, VP9QuantizeTest,
     ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
@@ -506,16 +552,6 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
                                  &QuantFPWrapper<quantize_fp_32x32_nz_c>,
                                  VPX_BITS_8, 32, true)));
-#else
-INSTANTIATE_TEST_SUITE_P(
-    SSSE3, VP9QuantizeTest,
-    ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
-                                 VPX_BITS_8, 16, false),
-                      make_tuple(&vpx_quantize_b_32x32_ssse3,
-                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
-                                 false)));
-
-#endif  // VPX_ARCH_X86_64
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX
@@ -529,14 +565,78 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest,
 #endif  // HAVE_AVX
 
 #if VPX_ARCH_X86_64 && HAVE_AVX2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
+                   &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_avx2>,
+                   &QuantFPWrapper<vp9_highbd_quantize_fp_c>, VPX_BITS_12, 16,
+                   true),
+        make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>,
+                   &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12,
+                   32, true),
+        make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16,
+                   false),
+        make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
+                   VPX_BITS_8, 16, false),
+        make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
+                   VPX_BITS_10, 16, false),
+        make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
+                   VPX_BITS_12, 16, false),
+        make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
+        make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
+        make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+#else
 INSTANTIATE_TEST_SUITE_P(
     AVX2, VP9QuantizeTest,
     ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
                                  &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
-                                 16, true)));
+                                 16, true),
+                      make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>,
+                                 &QuantFPWrapper<quantize_fp_32x32_nz_c>,
+                                 VPX_BITS_8, 32, true),
+                      make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&vpx_quantize_b_32x32_avx2,
+                                 &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
+                                 false)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16,
+                   false),
+        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
+                   VPX_BITS_8, 16, false),
+        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
+                   VPX_BITS_10, 16, false),
+        make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
+                   VPX_BITS_12, 16, false),
+        make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
+        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
+        make_tuple(&vpx_highbd_quantize_b_32x32_neon,
+                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
+                   &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
+        make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
+                   &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
+                   true)));
+#else
 INSTANTIATE_TEST_SUITE_P(
     NEON, VP9QuantizeTest,
     ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
@@ -550,6 +650,7 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
                                  &QuantFPWrapper<vp9_quantize_fp_32x32_c>,
                                  VPX_BITS_8, 32, true)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON
 
 #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH