diff options
Diffstat (limited to 'test/vp9_quantize_test.cc')
-rw-r--r-- | test/vp9_quantize_test.cc | 309 |
1 files changed, 205 insertions, 104 deletions
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index ca1062a76..587cec692 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -67,6 +67,45 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); } +void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, + int16_t *dequant, int16_t *round_fp, + int16_t *quant_fp) { + // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. + constexpr int kMaxQRoundingFactorFp = 64; + + for (int j = 0; j < 2; j++) { + // The range is 4 to 1828 in the VP9 tables. + const int qlookup = rnd->RandRange(1825) + 4; + round_fp[j] = (kMaxQRoundingFactorFp * qlookup) >> 7; + quant_fp[j] = (1 << 16) / qlookup; + + // Values determined by deconstructing vp9_init_quantizer(). + // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y + // values or U/V values of any bit depth. This is because y_delta is not + // factored into the vp9_ac_quant() call. + zbin[j] = rnd->RandRange(1200); + + // round may be up to 685 for Y values or 914 for U/V. + round[j] = rnd->RandRange(914); + // quant ranges from 1 to -32703 + quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703; + // quant_shift goes up to 1 << 16. + quant_shift[j] = rnd->RandRange(16384); + // dequant maxes out at 1828 for all cases. + dequant[j] = rnd->RandRange(1828); + } + for (int j = 2; j < 8; j++) { + zbin[j] = zbin[1]; + round_fp[j] = round_fp[1]; + quant_fp[j] = quant_fp[1]; + round[j] = round[1]; + quant[j] = quant[1]; + quant_shift[j] = quant_shift[1]; + dequant[j] = dequant[1]; + } +} + class VP9QuantizeBase : public AbstractBench { public: VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp) @@ -148,6 +187,7 @@ class VP9QuantizeTest : public VP9QuantizeBase, protected: virtual void Run(); + void Speed(bool is_median); const QuantizeFunc quantize_op_; const QuantizeFunc ref_quantize_op_; }; @@ -159,6 +199,101 @@ void VP9QuantizeTest::Run() { scan_->iscan); } +void VP9QuantizeTest::Speed(bool is_median) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + ASSERT_TRUE(coeff_.Init()); + ASSERT_TRUE(qcoeff_.Init()); + ASSERT_TRUE(dqcoeff_.Init()); + TX_SIZE starting_sz, ending_sz; + + if (max_size_ == 16) { + starting_sz = TX_4X4; + ending_sz = TX_16X16; + } else { + starting_sz = TX_32X32; + ending_sz = TX_32X32; + } + + for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { + // zbin > coeff, zbin < coeff. + for (int i = 0; i < 2; ++i) { + // TX_TYPE defines the scan order. That is not relevant to the speed test. + // Pick the first one. + const TX_TYPE tx_type = DCT_DCT; + count_ = (4 << sz) * (4 << sz); + scan_ = &vp9_scan_orders[sz][tx_type]; + + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + + if (i == 0) { + // When |coeff values| are less than zbin the results are 0. + int threshold = 100; + if (max_size_ == 32) { + // For 32x32, the threshold is halved. Double it to keep the values + // from clearing it. + threshold = 200; + } + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; + coeff_.Set(&rnd, -99, 99); + } else if (i == 1) { + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; + coeff_.Set(&rnd, -500, 500); + } + + const char *type = + (i == 0) ? "Bypass calculations " : "Full calculations "; + char block_size[16]; + snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); + char title[100]; + snprintf(title, sizeof(title), "%25s %8s ", type, block_size); + + if (is_median) { + RunNTimes(10000000 / count_); + PrintMedian(title); + } else { + Buffer<tran_low_t> ref_qcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer<tran_low_t> ref_dqcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t ref_eob = 0; + + const int kNumTests = 5000000; + vpx_usec_timer timer, simd_timer; + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, + q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, + scan_->scan, scan_->iscan); + } + vpx_usec_timer_mark(&timer); + + vpx_usec_timer_start(&simd_timer); + for (int n = 0; n < kNumTests; ++n) { + quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, + quant_shift_ptr_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, + scan_->scan, scan_->iscan); + } + vpx_usec_timer_mark(&simd_timer); + + const int elapsed_time = + static_cast<int>(vpx_usec_timer_elapsed(&timer)); + const int simd_elapsed_time = + static_cast<int>(vpx_usec_timer_elapsed(&simd_timer)); + printf("%s c_time = %d \t simd_time = %d \t Gain = %f \n", title, + elapsed_time, simd_elapsed_time, + ((float)elapsed_time / simd_elapsed_time)); + } + } + } +} + // This quantizer compares the AC coefficients to the quantization step size to // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). @@ -254,45 +389,6 @@ void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); } -void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, - int16_t *quant, int16_t *quant_shift, - int16_t *dequant, int16_t *round_fp, - int16_t *quant_fp) { - // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. - const int max_qrounding_factor_fp = 64; - - for (int j = 0; j < 2; j++) { - // The range is 4 to 1828 in the VP9 tables. - const int qlookup = rnd->RandRange(1825) + 4; - round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7; - quant_fp[j] = (1 << 16) / qlookup; - - // Values determined by deconstructing vp9_init_quantizer(). - // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y - // values or U/V values of any bit depth. This is because y_delta is not - // factored into the vp9_ac_quant() call. - zbin[j] = rnd->RandRange(1200); - - // round may be up to 685 for Y values or 914 for U/V. - round[j] = rnd->RandRange(914); - // quant ranges from 1 to -32703 - quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703; - // quant_shift goes up to 1 << 16. - quant_shift[j] = rnd->RandRange(16384); - // dequant maxes out at 1828 for all cases. - dequant[j] = rnd->RandRange(1828); - } - for (int j = 2; j < 8; j++) { - zbin[j] = zbin[1]; - round_fp[j] = round_fp[1]; - quant_fp[j] = quant_fp[1]; - round[j] = round[1]; - quant[j] = quant[1]; - quant_shift[j] = quant_shift[1]; - dequant[j] = dequant[1]; - } -} - TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); ASSERT_TRUE(coeff_.Init()); @@ -403,60 +499,9 @@ TEST_P(VP9QuantizeTest, EOBCheck) { } } -TEST_P(VP9QuantizeTest, DISABLED_Speed) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - ASSERT_TRUE(coeff_.Init()); - ASSERT_TRUE(qcoeff_.Init()); - ASSERT_TRUE(dqcoeff_.Init()); - TX_SIZE starting_sz, ending_sz; - - if (max_size_ == 16) { - starting_sz = TX_4X4; - ending_sz = TX_16X16; - } else { - starting_sz = TX_32X32; - ending_sz = TX_32X32; - } - - for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { - // zbin > coeff, zbin < coeff. - for (int i = 0; i < 2; ++i) { - // TX_TYPE defines the scan order. That is not relevant to the speed test. - // Pick the first one. - const TX_TYPE tx_type = DCT_DCT; - count_ = (4 << sz) * (4 << sz); - scan_ = &vp9_scan_orders[sz][tx_type]; - - GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, - quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, - quant_fp_ptr_); - - if (i == 0) { - // When |coeff values| are less than zbin the results are 0. - int threshold = 100; - if (max_size_ == 32) { - // For 32x32, the threshold is halved. Double it to keep the values - // from clearing it. - threshold = 200; - } - for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; - coeff_.Set(&rnd, -99, 99); - } else if (i == 1) { - for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; - coeff_.Set(&rnd, -500, 500); - } +TEST_P(VP9QuantizeTest, DISABLED_Speed) { Speed(false); } - RunNTimes(10000000 / count_); - const char *type = - (i == 0) ? "Bypass calculations " : "Full calculations "; - char block_size[16]; - snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz); - char title[100]; - snprintf(title, sizeof(title), "%25s %8s ", type, block_size); - PrintMedian(title); - } - } -} +TEST_P(VP9QuantizeTest, DISABLED_SpeedMedian) { Speed(true); } using std::make_tuple; @@ -467,6 +512,8 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, @@ -492,7 +539,6 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSE2 #if HAVE_SSSE3 -#if VPX_ARCH_X86_64 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, @@ -506,16 +552,6 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>, &QuantFPWrapper<quantize_fp_32x32_nz_c>, VPX_BITS_8, 32, true))); -#else -INSTANTIATE_TEST_SUITE_P( - SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false))); - -#endif // VPX_ARCH_X86_64 #endif // HAVE_SSSE3 #if HAVE_AVX @@ -529,14 +565,78 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + AVX2, VP9QuantizeTest, + ::testing::Values( + make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_avx2>, + &QuantFPWrapper<vp9_highbd_quantize_fp_c>, VPX_BITS_12, 16, + true), + make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>, + &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12, + 32, true), + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_avx2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); +#else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, ::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, - 16, true))); + 16, true), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>, + &QuantFPWrapper<quantize_fp_32x32_nz_c>, + VPX_BITS_8, 32, true), + make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_avx2, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 #if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, + false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_neon, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>, + &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>, + &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32, + true))); +#else INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, @@ -550,6 +650,7 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>, &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32, true))); +#endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_NEON #if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH |