diff options
Diffstat (limited to 'libvpx/vp9')
138 files changed, 13738 insertions, 3926 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c new file mode 100644 index 000000000..057d2e9c0 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c @@ -0,0 +1,445 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/inv_txfm.h" + +// Use macros to make sure argument lane is passed in as an constant integer. 
+ +#define vmull_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = vmull_lane_s32(vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = vmull_lane_s32(vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = vmull_lane_s32(vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = vmull_lane_s32(vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +#define vmlal_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = \ + vmlal_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = \ + vmlal_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = \ + vmlal_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = \ + vmlal_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +#define vmlsl_lane_s32_dual(in, c, lane, out) \ + do { \ + out[0].val[0] = \ + vmlsl_lane_s32(out[0].val[0], vget_low_s32(in.val[0]), c, lane); \ + out[0].val[1] = \ + vmlsl_lane_s32(out[0].val[1], vget_low_s32(in.val[1]), c, lane); \ + out[1].val[0] = \ + vmlsl_lane_s32(out[1].val[0], vget_high_s32(in.val[0]), c, lane); \ + out[1].val[1] = \ + vmlsl_lane_s32(out[1].val[1], vget_high_s32(in.val[1]), c, lane); \ + } while (0) + +static INLINE int32x4x2_t +highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) { + int32x4x2_t out; + out.val[0] = vcombine_s32(vrshrn_n_s64(in[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(in[1].val[0], DCT_CONST_BITS)); + out.val[1] = vcombine_s32(vrshrn_n_s64(in[0].val[1], DCT_CONST_BITS), + vrshrn_n_s64(in[1].val[1], DCT_CONST_BITS)); + return out; +} + +#define highbd_iadst_half_butterfly(in, c, lane, out) \ + do { \ + int64x2x2_t t[2]; \ + vmull_lane_s32_dual(in, c, lane, t); \ + out = highbd_dct_const_round_shift_low_8(t); \ + } while (0) + +#define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \ + do { \ + vmull_lane_s32_dual(in0, c, lane0, s0); \ + vmull_lane_s32_dual(in0, c, lane1, s1); \ + vmlal_lane_s32_dual(in1, c, lane1, s0); \ + 
vmlsl_lane_s32_dual(in1, c, lane0, s1); \ + } while (0) + +static INLINE int32x4x2_t vaddq_s32_dual(const int32x4x2_t in0, + const int32x4x2_t in1) { + int32x4x2_t out; + out.val[0] = vaddq_s32(in0.val[0], in1.val[0]); + out.val[1] = vaddq_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int64x2x2_t vaddq_s64_dual(const int64x2x2_t in0, + const int64x2x2_t in1) { + int64x2x2_t out; + out.val[0] = vaddq_s64(in0.val[0], in1.val[0]); + out.val[1] = vaddq_s64(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t vsubq_s32_dual(const int32x4x2_t in0, + const int32x4x2_t in1) { + int32x4x2_t out; + out.val[0] = vsubq_s32(in0.val[0], in1.val[0]); + out.val[1] = vsubq_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int64x2x2_t vsubq_s64_dual(const int64x2x2_t in0, + const int64x2x2_t in1) { + int64x2x2_t out; + out.val[0] = vsubq_s64(in0.val[0], in1.val[0]); + out.val[1] = vsubq_s64(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t vcombine_s32_dual(const int32x2x2_t in0, + const int32x2x2_t in1) { + int32x4x2_t out; + out.val[0] = vcombine_s32(in0.val[0], in1.val[0]); + out.val[1] = vcombine_s32(in0.val[1], in1.val[1]); + return out; +} + +static INLINE int32x4x2_t highbd_add_dct_const_round_shift_low_8( + const int64x2x2_t *const in0, const int64x2x2_t *const in1) { + const int64x2x2_t sum_lo = vaddq_s64_dual(in0[0], in1[0]); + const int64x2x2_t sum_hi = vaddq_s64_dual(in0[1], in1[1]); + int32x2x2_t out_lo, out_hi; + + out_lo.val[0] = vrshrn_n_s64(sum_lo.val[0], DCT_CONST_BITS); + out_lo.val[1] = vrshrn_n_s64(sum_lo.val[1], DCT_CONST_BITS); + out_hi.val[0] = vrshrn_n_s64(sum_hi.val[0], DCT_CONST_BITS); + out_hi.val[1] = vrshrn_n_s64(sum_hi.val[1], DCT_CONST_BITS); + return vcombine_s32_dual(out_lo, out_hi); +} + +static INLINE int32x4x2_t highbd_sub_dct_const_round_shift_low_8( + const int64x2x2_t *const in0, const int64x2x2_t *const in1) { + const int64x2x2_t sub_lo = vsubq_s64_dual(in0[0], in1[0]); + 
const int64x2x2_t sub_hi = vsubq_s64_dual(in0[1], in1[1]); + int32x2x2_t out_lo, out_hi; + + out_lo.val[0] = vrshrn_n_s64(sub_lo.val[0], DCT_CONST_BITS); + out_lo.val[1] = vrshrn_n_s64(sub_lo.val[1], DCT_CONST_BITS); + out_hi.val[0] = vrshrn_n_s64(sub_hi.val[0], DCT_CONST_BITS); + out_hi.val[1] = vrshrn_n_s64(sub_hi.val[1], DCT_CONST_BITS); + return vcombine_s32_dual(out_lo, out_hi); +} + +static INLINE int32x4x2_t vnegq_s32_dual(const int32x4x2_t in) { + int32x4x2_t out; + out.val[0] = vnegq_s32(in.val[0]); + out.val[1] = vnegq_s32(in.val[1]); + return out; +} + +void vpx_highbd_iadst16_neon(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, const int bd) { + const int32x4_t c_1_31_5_27 = + create_s32x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64); + const int32x4_t c_9_23_13_19 = + create_s32x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64); + const int32x4_t c_17_15_21_11 = + create_s32x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64); + const int32x4_t c_25_7_29_3 = + create_s32x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64); + const int32x4_t c_4_28_20_12 = + create_s32x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64); + const int32x4_t c_16_n16_8_24 = + create_s32x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64); + int32x4x2_t in[16], out[16]; + int32x4x2_t x[16], t[12]; + int64x2x2_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + int64x2x2_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + + // Load input (16x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 8; + in[8].val[0] = vld1q_s32(input); + in[8].val[1] = vld1q_s32(input + 4); + input += 8; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 8; + in[9].val[0] = vld1q_s32(input); + in[9].val[1] = vld1q_s32(input + 4); + input += 8; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 8; + 
in[10].val[0] = vld1q_s32(input); + in[10].val[1] = vld1q_s32(input + 4); + input += 8; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 8; + in[11].val[0] = vld1q_s32(input); + in[11].val[1] = vld1q_s32(input + 4); + input += 8; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 8; + in[12].val[0] = vld1q_s32(input); + in[12].val[1] = vld1q_s32(input + 4); + input += 8; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 8; + in[13].val[0] = vld1q_s32(input); + in[13].val[1] = vld1q_s32(input + 4); + input += 8; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 8; + in[14].val[0] = vld1q_s32(input); + in[14].val[1] = vld1q_s32(input + 4); + input += 8; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + input += 8; + in[15].val[0] = vld1q_s32(input); + in[15].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // stage 1 + highbd_iadst_butterfly(x[0], x[1], vget_low_s32(c_1_31_5_27), 0, 1, s0, s1); + highbd_iadst_butterfly(x[2], x[3], vget_high_s32(c_1_31_5_27), 0, 1, s2, s3); + highbd_iadst_butterfly(x[4], x[5], vget_low_s32(c_9_23_13_19), 0, 1, s4, s5); + highbd_iadst_butterfly(x[6], x[7], vget_high_s32(c_9_23_13_19), 0, 1, s6, s7); + highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_17_15_21_11), 0, 1, s8, s9); + highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_17_15_21_11), 0, 1, s10, + s11); + highbd_iadst_butterfly(x[12], x[13], 
vget_low_s32(c_25_7_29_3), 0, 1, s12, + s13); + highbd_iadst_butterfly(x[14], x[15], vget_high_s32(c_25_7_29_3), 0, 1, s14, + s15); + + x[0] = highbd_add_dct_const_round_shift_low_8(s0, s8); + x[1] = highbd_add_dct_const_round_shift_low_8(s1, s9); + x[2] = highbd_add_dct_const_round_shift_low_8(s2, s10); + x[3] = highbd_add_dct_const_round_shift_low_8(s3, s11); + x[4] = highbd_add_dct_const_round_shift_low_8(s4, s12); + x[5] = highbd_add_dct_const_round_shift_low_8(s5, s13); + x[6] = highbd_add_dct_const_round_shift_low_8(s6, s14); + x[7] = highbd_add_dct_const_round_shift_low_8(s7, s15); + x[8] = highbd_sub_dct_const_round_shift_low_8(s0, s8); + x[9] = highbd_sub_dct_const_round_shift_low_8(s1, s9); + x[10] = highbd_sub_dct_const_round_shift_low_8(s2, s10); + x[11] = highbd_sub_dct_const_round_shift_low_8(s3, s11); + x[12] = highbd_sub_dct_const_round_shift_low_8(s4, s12); + x[13] = highbd_sub_dct_const_round_shift_low_8(s5, s13); + x[14] = highbd_sub_dct_const_round_shift_low_8(s6, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s7, s15); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + highbd_iadst_butterfly(x[8], x[9], vget_low_s32(c_4_28_20_12), 0, 1, s8, s9); + highbd_iadst_butterfly(x[10], x[11], vget_high_s32(c_4_28_20_12), 0, 1, s10, + s11); + highbd_iadst_butterfly(x[13], x[12], vget_low_s32(c_4_28_20_12), 1, 0, s13, + s12); + highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_4_28_20_12), 1, 0, s15, + s14); + + x[0] = vaddq_s32_dual(t[0], t[4]); + x[1] = vaddq_s32_dual(t[1], t[5]); + x[2] = vaddq_s32_dual(t[2], t[6]); + x[3] = vaddq_s32_dual(t[3], t[7]); + x[4] = vsubq_s32_dual(t[0], t[4]); + x[5] = vsubq_s32_dual(t[1], t[5]); + x[6] = vsubq_s32_dual(t[2], t[6]); + x[7] = vsubq_s32_dual(t[3], t[7]); + x[8] = highbd_add_dct_const_round_shift_low_8(s8, s12); + x[9] = highbd_add_dct_const_round_shift_low_8(s9, s13); + x[10] = 
highbd_add_dct_const_round_shift_low_8(s10, s14); + x[11] = highbd_add_dct_const_round_shift_low_8(s11, s15); + x[12] = highbd_sub_dct_const_round_shift_low_8(s8, s12); + x[13] = highbd_sub_dct_const_round_shift_low_8(s9, s13); + x[14] = highbd_sub_dct_const_round_shift_low_8(s10, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s11, s15); + + // stage 3 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + highbd_iadst_butterfly(x[4], x[5], vget_high_s32(c_16_n16_8_24), 0, 1, s4, + s5); + highbd_iadst_butterfly(x[7], x[6], vget_high_s32(c_16_n16_8_24), 1, 0, s7, + s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + highbd_iadst_butterfly(x[12], x[13], vget_high_s32(c_16_n16_8_24), 0, 1, s12, + s13); + highbd_iadst_butterfly(x[15], x[14], vget_high_s32(c_16_n16_8_24), 1, 0, s15, + s14); + + x[0] = vaddq_s32_dual(t[0], t[2]); + x[1] = vaddq_s32_dual(t[1], t[3]); + x[2] = vsubq_s32_dual(t[0], t[2]); + x[3] = vsubq_s32_dual(t[1], t[3]); + x[4] = highbd_add_dct_const_round_shift_low_8(s4, s6); + x[5] = highbd_add_dct_const_round_shift_low_8(s5, s7); + x[6] = highbd_sub_dct_const_round_shift_low_8(s4, s6); + x[7] = highbd_sub_dct_const_round_shift_low_8(s5, s7); + x[8] = vaddq_s32_dual(t[8], t[10]); + x[9] = vaddq_s32_dual(t[9], t[11]); + x[10] = vsubq_s32_dual(t[8], t[10]); + x[11] = vsubq_s32_dual(t[9], t[11]); + x[12] = highbd_add_dct_const_round_shift_low_8(s12, s14); + x[13] = highbd_add_dct_const_round_shift_low_8(s13, s15); + x[14] = highbd_sub_dct_const_round_shift_low_8(s12, s14); + x[15] = highbd_sub_dct_const_round_shift_low_8(s13, s15); + + // stage 4 + { + const int32x4x2_t sum = vaddq_s32_dual(x[2], x[3]); + const int32x4x2_t sub = vsubq_s32_dual(x[2], x[3]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[2]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[3]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[7], x[6]); + const int32x4x2_t sub = vsubq_s32_dual(x[7], x[6]); + 
highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[6]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[7]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[11], x[10]); + const int32x4x2_t sub = vsubq_s32_dual(x[11], x[10]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 0, x[10]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[11]); + } + { + const int32x4x2_t sum = vaddq_s32_dual(x[14], x[15]); + const int32x4x2_t sub = vsubq_s32_dual(x[14], x[15]); + highbd_iadst_half_butterfly(sum, vget_low_s32(c_16_n16_8_24), 1, x[14]); + highbd_iadst_half_butterfly(sub, vget_low_s32(c_16_n16_8_24), 0, x[15]); + } + + out[0] = x[0]; + out[1] = vnegq_s32_dual(x[8]); + out[2] = x[12]; + out[3] = vnegq_s32_dual(x[4]); + out[4] = x[6]; + out[5] = x[14]; + out[6] = x[10]; + out[7] = x[2]; + out[8] = x[3]; + out[9] = x[11]; + out[10] = x[15]; + out[11] = x[7]; + out[12] = x[5]; + out[13] = vnegq_s32_dual(x[13]); + out[14] = x[9]; + out[15] = vnegq_s32_dual(x[1]); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +typedef void (*highbd_iht_1d)(const int32_t *input, int32_t *output, + uint16_t *dest, const int stride, const int bd); + +typedef struct { + highbd_iht_1d cols, rows; // vertical and horizontal +} highbd_iht_2d; + +void vp9_highbd_iht16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + if (bd == 8) { + static const iht_2d IHT_16[] = { + { vpx_idct16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { vpx_iadst16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_idct16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d }, // DCT_ADST = 2 + { vpx_iadst16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d } // ADST_ADST = 3 + }; + const iht_2d ht = IHT_16[tx_type]; + int16_t row_output[16 * 16]; + + // pass 1 + 
ht.rows(input, row_output, dest, stride, 1); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, 1); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, 1); // left 8 columns + ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 1); // right 8 columns + } else { + static const highbd_iht_2d IHT_16[] = { + { vpx_highbd_idct16x16_256_add_half1d, + vpx_highbd_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { vpx_highbd_iadst16_neon, + vpx_highbd_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_highbd_idct16x16_256_add_half1d, + vpx_highbd_iadst16_neon }, // DCT_ADST = 2 + { vpx_highbd_iadst16_neon, vpx_highbd_iadst16_neon } // ADST_ADST = 3 + }; + const highbd_iht_2d ht = IHT_16[tx_type]; + int32_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, bd); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, bd); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, bd); // left 8 columns + ht.cols(row_output + 8 * 16, NULL, dest + 8, stride, + bd); // right 8 columns + } +} diff --git a/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c new file mode 100644 index 000000000..52c4f1937 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void highbd_iadst4(int32x4_t *const io) { + const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 }; + const int32x4_t sinpi = vld1q_s32(sinpis); + int64x2x2_t s[7], t[4]; + int32x4_t s7; + + s[0].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 0); + s[0].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 0); + s[1].val[0] = vmull_lane_s32(vget_low_s32(io[0]), vget_low_s32(sinpi), 1); + s[1].val[1] = vmull_lane_s32(vget_high_s32(io[0]), vget_low_s32(sinpi), 1); + s[2].val[0] = vmull_lane_s32(vget_low_s32(io[1]), vget_high_s32(sinpi), 0); + s[2].val[1] = vmull_lane_s32(vget_high_s32(io[1]), vget_high_s32(sinpi), 0); + s[3].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_high_s32(sinpi), 1); + s[3].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_high_s32(sinpi), 1); + s[4].val[0] = vmull_lane_s32(vget_low_s32(io[2]), vget_low_s32(sinpi), 0); + s[4].val[1] = vmull_lane_s32(vget_high_s32(io[2]), vget_low_s32(sinpi), 0); + s[5].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_low_s32(sinpi), 1); + s[5].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_low_s32(sinpi), 1); + s[6].val[0] = vmull_lane_s32(vget_low_s32(io[3]), vget_high_s32(sinpi), 1); + s[6].val[1] = vmull_lane_s32(vget_high_s32(io[3]), vget_high_s32(sinpi), 1); + s7 = vsubq_s32(io[0], io[2]); + s7 = vaddq_s32(s7, io[3]); + + s[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]); + s[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]); + s[0].val[0] = vaddq_s64(s[0].val[0], s[5].val[0]); + s[0].val[1] = vaddq_s64(s[0].val[1], s[5].val[1]); + s[1].val[0] = vsubq_s64(s[1].val[0], s[4].val[0]); + s[1].val[1] = 
vsubq_s64(s[1].val[1], s[4].val[1]); + s[1].val[0] = vsubq_s64(s[1].val[0], s[6].val[0]); + s[1].val[1] = vsubq_s64(s[1].val[1], s[6].val[1]); + s[3] = s[2]; + s[2].val[0] = vmull_lane_s32(vget_low_s32(s7), vget_high_s32(sinpi), 0); + s[2].val[1] = vmull_lane_s32(vget_high_s32(s7), vget_high_s32(sinpi), 0); + + t[0].val[0] = vaddq_s64(s[0].val[0], s[3].val[0]); + t[0].val[1] = vaddq_s64(s[0].val[1], s[3].val[1]); + t[1].val[0] = vaddq_s64(s[1].val[0], s[3].val[0]); + t[1].val[1] = vaddq_s64(s[1].val[1], s[3].val[1]); + t[2] = s[2]; + t[3].val[0] = vaddq_s64(s[0].val[0], s[1].val[0]); + t[3].val[1] = vaddq_s64(s[0].val[1], s[1].val[1]); + t[3].val[0] = vsubq_s64(t[3].val[0], s[3].val[0]); + t[3].val[1] = vsubq_s64(t[3].val[1], s[3].val[1]); + io[0] = vcombine_s32(vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS)); + io[1] = vcombine_s32(vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS)); + io[2] = vcombine_s32(vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS)); + io[3] = vcombine_s32(vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS), + vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS)); +} + +void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t a[2]; + int32x4_t c[4]; + + c[0] = vld1q_s32(input); + c[1] = vld1q_s32(input + 4); + c[2] = vld1q_s32(input + 8); + c[3] = vld1q_s32(input + 12); + + if (bd == 8) { + a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1])); + a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3])); + transpose_s16_4x4q(&a[0], &a[1]); + + switch (tx_type) { + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + case ADST_DCT: + 
idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + break; + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); + break; + } + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + case ADST_DCT: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + + case DCT_ADST: { + const int32x4_t cospis = vld1q_s32(kCospi32); + + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, c); + } else { + idct4x4_16_kernel_bd12(cospis, c); + } + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]); + highbd_iadst4(c); + break; + } + } + a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4)); + a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), vqrshrn_n_s32(c[3], 4)); + } + + highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max); + highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max); +} diff --git a/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c new file mode 100644 index 000000000..2232c6841 --- /dev/null +++ 
b/libvpx/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c @@ -0,0 +1,345 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_enums.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/highbd_idct_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_iadst_half_butterfly_neon(int32x4_t *const x, + const int32x2_t c) { + const int32x4_t sum = vaddq_s32(x[0], x[1]); + const int32x4_t sub = vsubq_s32(x[0], x[1]); + const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(sum), c, 0); + const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(sub), c, 0); + const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(sum), c, 0); + const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(sub), c, 0); + const int32x2_t out0_lo = vrshrn_n_s64(t0_lo, DCT_CONST_BITS); + const int32x2_t out1_lo = vrshrn_n_s64(t1_lo, DCT_CONST_BITS); + const int32x2_t out0_hi = vrshrn_n_s64(t0_hi, DCT_CONST_BITS); + const int32x2_t out1_hi = vrshrn_n_s64(t1_hi, DCT_CONST_BITS); + + x[0] = vcombine_s32(out0_lo, out0_hi); + x[1] = vcombine_s32(out1_lo, out1_hi); +} + +static INLINE void highbd_iadst_butterfly_lane_0_1_neon(const int32x4_t in0, + const int32x4_t in1, + const int32x2_t c, + int64x2_t *const s0, + int64x2_t *const s1) { + const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 0); + const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 1); + const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 0); + const int64x2_t t1_hi = 
vmull_lane_s32(vget_high_s32(in0), c, 1); + + s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 1); + s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 0); + s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 1); + s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 0); +} + +static INLINE void highbd_iadst_butterfly_lane_1_0_neon(const int32x4_t in0, + const int32x4_t in1, + const int32x2_t c, + int64x2_t *const s0, + int64x2_t *const s1) { + const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(in0), c, 1); + const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(in0), c, 0); + const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(in0), c, 1); + const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(in0), c, 0); + + s0[0] = vmlal_lane_s32(t0_lo, vget_low_s32(in1), c, 0); + s1[0] = vmlsl_lane_s32(t1_lo, vget_low_s32(in1), c, 1); + s0[1] = vmlal_lane_s32(t0_hi, vget_high_s32(in1), c, 0); + s1[1] = vmlsl_lane_s32(t1_hi, vget_high_s32(in1), c, 1); +} + +static INLINE int32x4_t highbd_add_dct_const_round_shift_low_8( + const int64x2_t *const in0, const int64x2_t *const in1) { + const int64x2_t sum_lo = vaddq_s64(in0[0], in1[0]); + const int64x2_t sum_hi = vaddq_s64(in0[1], in1[1]); + const int32x2_t out_lo = vrshrn_n_s64(sum_lo, DCT_CONST_BITS); + const int32x2_t out_hi = vrshrn_n_s64(sum_hi, DCT_CONST_BITS); + return vcombine_s32(out_lo, out_hi); +} + +static INLINE int32x4_t highbd_sub_dct_const_round_shift_low_8( + const int64x2_t *const in0, const int64x2_t *const in1) { + const int64x2_t sub_lo = vsubq_s64(in0[0], in1[0]); + const int64x2_t sub_hi = vsubq_s64(in0[1], in1[1]); + const int32x2_t out_lo = vrshrn_n_s64(sub_lo, DCT_CONST_BITS); + const int32x2_t out_hi = vrshrn_n_s64(sub_hi, DCT_CONST_BITS); + return vcombine_s32(out_lo, out_hi); +} + +static INLINE void highbd_iadst8(int32x4_t *const io0, int32x4_t *const io1, + int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, + int32x4_t *const io6, int32x4_t *const io7) { + 
const int32x4_t c0 = + create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); + const int32x4_t c1 = + create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int32x4_t c2 = + create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int32x4_t x[8], t[4]; + int64x2_t s[8][2]; + + x[0] = *io7; + x[1] = *io0; + x[2] = *io5; + x[3] = *io2; + x[4] = *io3; + x[5] = *io4; + x[6] = *io1; + x[7] = *io6; + + // stage 1 + highbd_iadst_butterfly_lane_0_1_neon(x[0], x[1], vget_low_s32(c0), s[0], + s[1]); + highbd_iadst_butterfly_lane_0_1_neon(x[2], x[3], vget_high_s32(c0), s[2], + s[3]); + highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_low_s32(c1), s[4], + s[5]); + highbd_iadst_butterfly_lane_0_1_neon(x[6], x[7], vget_high_s32(c1), s[6], + s[7]); + + x[0] = highbd_add_dct_const_round_shift_low_8(s[0], s[4]); + x[1] = highbd_add_dct_const_round_shift_low_8(s[1], s[5]); + x[2] = highbd_add_dct_const_round_shift_low_8(s[2], s[6]); + x[3] = highbd_add_dct_const_round_shift_low_8(s[3], s[7]); + x[4] = highbd_sub_dct_const_round_shift_low_8(s[0], s[4]); + x[5] = highbd_sub_dct_const_round_shift_low_8(s[1], s[5]); + x[6] = highbd_sub_dct_const_round_shift_low_8(s[2], s[6]); + x[7] = highbd_sub_dct_const_round_shift_low_8(s[3], s[7]); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + highbd_iadst_butterfly_lane_0_1_neon(x[4], x[5], vget_high_s32(c2), s[4], + s[5]); + highbd_iadst_butterfly_lane_1_0_neon(x[7], x[6], vget_high_s32(c2), s[7], + s[6]); + + x[0] = vaddq_s32(t[0], t[2]); + x[1] = vaddq_s32(t[1], t[3]); + x[2] = vsubq_s32(t[0], t[2]); + x[3] = vsubq_s32(t[1], t[3]); + x[4] = highbd_add_dct_const_round_shift_low_8(s[4], s[6]); + x[5] = highbd_add_dct_const_round_shift_low_8(s[5], s[7]); + x[6] = highbd_sub_dct_const_round_shift_low_8(s[4], s[6]); + x[7] = highbd_sub_dct_const_round_shift_low_8(s[5], s[7]); + + // stage 3 + highbd_iadst_half_butterfly_neon(x + 2, vget_low_s32(c2)); + 
highbd_iadst_half_butterfly_neon(x + 6, vget_low_s32(c2)); + + *io0 = x[0]; + *io1 = vnegq_s32(x[4]); + *io2 = x[6]; + *io3 = vnegq_s32(x[2]); + *io4 = x[3]; + *io5 = vnegq_s32(x[7]); + *io6 = x[5]; + *io7 = vnegq_s32(x[1]); +} + +void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int32x4_t a[16]; + int16x8_t c[8]; + + a[0] = vld1q_s32(input); + a[1] = vld1q_s32(input + 4); + a[2] = vld1q_s32(input + 8); + a[3] = vld1q_s32(input + 12); + a[4] = vld1q_s32(input + 16); + a[5] = vld1q_s32(input + 20); + a[6] = vld1q_s32(input + 24); + a[7] = vld1q_s32(input + 28); + a[8] = vld1q_s32(input + 32); + a[9] = vld1q_s32(input + 36); + a[10] = vld1q_s32(input + 40); + a[11] = vld1q_s32(input + 44); + a[12] = vld1q_s32(input + 48); + a[13] = vld1q_s32(input + 52); + a[14] = vld1q_s32(input + 56); + a[15] = vld1q_s32(input + 60); + + if (bd == 8) { + c[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); + c[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); + c[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); + c[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); + c[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); + c[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); + c[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); + c[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); + + switch (tx_type) { + case DCT_DCT: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + idct8x8_64_1d_bd8(cospis0, cospis1, c); + idct8x8_64_1d_bd8(cospis0, cospis1, c); + break; + } + + case ADST_DCT: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + idct8x8_64_1d_bd8(cospis0, cospis1, c); + transpose_s16_8x8(&c[0], &c[1], 
&c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + break; + } + + case DCT_ADST: { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + idct8x8_64_1d_bd8(cospis0, cospis1, c); + break; + } + + default: { + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], + &c[7]); + iadst8(c); + break; + } + } + + c[0] = vrshrq_n_s16(c[0], 5); + c[1] = vrshrq_n_s16(c[1], 5); + c[2] = vrshrq_n_s16(c[2], 5); + c[3] = vrshrq_n_s16(c[3], 5); + c[4] = vrshrq_n_s16(c[4], 5); + c[5] = vrshrq_n_s16(c[5], 5); + c[6] = vrshrq_n_s16(c[6], 5); + c[7] = vrshrq_n_s16(c[7], 5); + } else { + switch (tx_type) { + case DCT_DCT: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + if (bd == 10) { + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } else { + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + } + break; + } + + 
case ADST_DCT: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + break; + } + + case DCT_ADST: { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = + vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); + break; + } + + default: { + assert(tx_type == ADST_ADST); + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], 
&a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + break; + } + } + + c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); + c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); + c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); + c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); + c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); + c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); + c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); + c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); + } + highbd_add8x8(c, dest, stride, bd); +} diff --git a/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c new file mode 100644 index 000000000..db72ff116 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_iht16x16_add_neon.c @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag) { + int16x8_t in[16], out[16]; + const int16x4_t c_1_31_5_27 = + create_s16x4_neon(cospi_1_64, cospi_31_64, cospi_5_64, cospi_27_64); + const int16x4_t c_9_23_13_19 = + create_s16x4_neon(cospi_9_64, cospi_23_64, cospi_13_64, cospi_19_64); + const int16x4_t c_17_15_21_11 = + create_s16x4_neon(cospi_17_64, cospi_15_64, cospi_21_64, cospi_11_64); + const int16x4_t c_25_7_29_3 = + create_s16x4_neon(cospi_25_64, cospi_7_64, cospi_29_64, cospi_3_64); + const int16x4_t c_4_28_20_12 = + create_s16x4_neon(cospi_4_64, cospi_28_64, cospi_20_64, cospi_12_64); + const int16x4_t c_16_n16_8_24 = + create_s16x4_neon(cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64); + int16x8_t x[16], t[12]; + int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + int32x4_t s8[2], s9[2], s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + + // Load input (16x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[8] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[9] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[10] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[11] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[12] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[13] = 
load_tran_low_to_s16q(inputT); + inputT += 8; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[14] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[7] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[15] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 8; + in[8] = vld1q_s16(inputT); + inputT += 8; + in[1] = vld1q_s16(inputT); + inputT += 8; + in[9] = vld1q_s16(inputT); + inputT += 8; + in[2] = vld1q_s16(inputT); + inputT += 8; + in[10] = vld1q_s16(inputT); + inputT += 8; + in[3] = vld1q_s16(inputT); + inputT += 8; + in[11] = vld1q_s16(inputT); + inputT += 8; + in[4] = vld1q_s16(inputT); + inputT += 8; + in[12] = vld1q_s16(inputT); + inputT += 8; + in[5] = vld1q_s16(inputT); + inputT += 8; + in[13] = vld1q_s16(inputT); + inputT += 8; + in[6] = vld1q_s16(inputT); + inputT += 8; + in[14] = vld1q_s16(inputT); + inputT += 8; + in[7] = vld1q_s16(inputT); + inputT += 8; + in[15] = vld1q_s16(inputT); + } + + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + x[0] = in[15]; + x[1] = in[0]; + x[2] = in[13]; + x[3] = in[2]; + x[4] = in[11]; + x[5] = in[4]; + x[6] = in[9]; + x[7] = in[6]; + x[8] = in[7]; + x[9] = in[8]; + x[10] = in[5]; + x[11] = in[10]; + x[12] = in[3]; + x[13] = in[12]; + x[14] = in[1]; + x[15] = in[14]; + + // stage 1 + iadst_butterfly_lane_0_1_neon(x[0], x[1], c_1_31_5_27, s0, s1); + iadst_butterfly_lane_2_3_neon(x[2], x[3], c_1_31_5_27, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c_9_23_13_19, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c_9_23_13_19, s6, s7); + iadst_butterfly_lane_0_1_neon(x[8], x[9], c_17_15_21_11, s8, s9); + iadst_butterfly_lane_2_3_neon(x[10], x[11], c_17_15_21_11, s10, s11); + iadst_butterfly_lane_0_1_neon(x[12], x[13], c_25_7_29_3, s12, s13); + 
iadst_butterfly_lane_2_3_neon(x[14], x[15], c_25_7_29_3, s14, s15); + + x[0] = add_dct_const_round_shift_low_8(s0, s8); + x[1] = add_dct_const_round_shift_low_8(s1, s9); + x[2] = add_dct_const_round_shift_low_8(s2, s10); + x[3] = add_dct_const_round_shift_low_8(s3, s11); + x[4] = add_dct_const_round_shift_low_8(s4, s12); + x[5] = add_dct_const_round_shift_low_8(s5, s13); + x[6] = add_dct_const_round_shift_low_8(s6, s14); + x[7] = add_dct_const_round_shift_low_8(s7, s15); + x[8] = sub_dct_const_round_shift_low_8(s0, s8); + x[9] = sub_dct_const_round_shift_low_8(s1, s9); + x[10] = sub_dct_const_round_shift_low_8(s2, s10); + x[11] = sub_dct_const_round_shift_low_8(s3, s11); + x[12] = sub_dct_const_round_shift_low_8(s4, s12); + x[13] = sub_dct_const_round_shift_low_8(s5, s13); + x[14] = sub_dct_const_round_shift_low_8(s6, s14); + x[15] = sub_dct_const_round_shift_low_8(s7, s15); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + t[4] = x[4]; + t[5] = x[5]; + t[6] = x[6]; + t[7] = x[7]; + iadst_butterfly_lane_0_1_neon(x[8], x[9], c_4_28_20_12, s8, s9); + iadst_butterfly_lane_2_3_neon(x[10], x[11], c_4_28_20_12, s10, s11); + iadst_butterfly_lane_1_0_neon(x[13], x[12], c_4_28_20_12, s13, s12); + iadst_butterfly_lane_3_2_neon(x[15], x[14], c_4_28_20_12, s15, s14); + + x[0] = vaddq_s16(t[0], t[4]); + x[1] = vaddq_s16(t[1], t[5]); + x[2] = vaddq_s16(t[2], t[6]); + x[3] = vaddq_s16(t[3], t[7]); + x[4] = vsubq_s16(t[0], t[4]); + x[5] = vsubq_s16(t[1], t[5]); + x[6] = vsubq_s16(t[2], t[6]); + x[7] = vsubq_s16(t[3], t[7]); + x[8] = add_dct_const_round_shift_low_8(s8, s12); + x[9] = add_dct_const_round_shift_low_8(s9, s13); + x[10] = add_dct_const_round_shift_low_8(s10, s14); + x[11] = add_dct_const_round_shift_low_8(s11, s15); + x[12] = sub_dct_const_round_shift_low_8(s8, s12); + x[13] = sub_dct_const_round_shift_low_8(s9, s13); + x[14] = sub_dct_const_round_shift_low_8(s10, s14); + x[15] = sub_dct_const_round_shift_low_8(s11, s15); + + // stage 3 + 
t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c_16_n16_8_24, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c_16_n16_8_24, s7, s6); + t[8] = x[8]; + t[9] = x[9]; + t[10] = x[10]; + t[11] = x[11]; + iadst_butterfly_lane_2_3_neon(x[12], x[13], c_16_n16_8_24, s12, s13); + iadst_butterfly_lane_3_2_neon(x[15], x[14], c_16_n16_8_24, s15, s14); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + x[8] = vaddq_s16(t[8], t[10]); + x[9] = vaddq_s16(t[9], t[11]); + x[10] = vsubq_s16(t[8], t[10]); + x[11] = vsubq_s16(t[9], t[11]); + x[12] = add_dct_const_round_shift_low_8(s12, s14); + x[13] = add_dct_const_round_shift_low_8(s13, s15); + x[14] = sub_dct_const_round_shift_low_8(s12, s14); + x[15] = sub_dct_const_round_shift_low_8(s13, s15); + + // stage 4 + iadst_half_butterfly_neg_neon(&x[3], &x[2], c_16_n16_8_24); + iadst_half_butterfly_pos_neon(&x[7], &x[6], c_16_n16_8_24); + iadst_half_butterfly_pos_neon(&x[11], &x[10], c_16_n16_8_24); + iadst_half_butterfly_neg_neon(&x[15], &x[14], c_16_n16_8_24); + + out[0] = x[0]; + out[1] = vnegq_s16(x[8]); + out[2] = x[12]; + out[3] = vnegq_s16(x[4]); + out[4] = x[6]; + out[5] = x[14]; + out[6] = x[10]; + out[7] = x[2]; + out[8] = x[3]; + out[9] = x[11]; + out[10] = x[15]; + out[11] = x[7]; + out[12] = x[5]; + out[13] = vnegq_s16(x[13]); + out[14] = x[9]; + out[15] = vnegq_s16(x[1]); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void vp9_iht16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + static const iht_2d 
IHT_16[] = { + { vpx_idct16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // DCT_DCT = 0 + { vpx_iadst16x16_256_add_half1d, + vpx_idct16x16_256_add_half1d }, // ADST_DCT = 1 + { vpx_idct16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d }, // DCT_ADST = 2 + { vpx_iadst16x16_256_add_half1d, + vpx_iadst16x16_256_add_half1d } // ADST_ADST = 3 + }; + const iht_2d ht = IHT_16[tx_type]; + int16_t row_output[16 * 16]; + + // pass 1 + ht.rows(input, row_output, dest, stride, 0); // upper 8 rows + ht.rows(input + 8 * 16, row_output + 8, dest, stride, 0); // lower 8 rows + + // pass 2 + ht.cols(row_output, NULL, dest, stride, 0); // left 8 columns + ht.cols(row_output + 16 * 8, NULL, dest + 8, stride, 0); // right 8 columns +} diff --git a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index 025254c3f..4f0a90f21 100644 --- a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -14,206 +14,63 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { - int32x4_t q8s32, q9s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - - d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16)); - d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16)); - - q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1])); - q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1])); - q0x2s32 = vtrnq_s32(q8s32, q9s32); - - *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]); - *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]); -} - -static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16) { - *d0s16 = vdup_n_s16(cospi_8_64); - *d1s16 = 
vdup_n_s16(cospi_16_64); - *d2s16 = vdup_n_s16(cospi_24_64); -} - -static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16) { - *d3s16 = vdup_n_s16(sinpi_1_9); - *d4s16 = vdup_n_s16(sinpi_2_9); - *q3s16 = vdupq_n_s16(sinpi_3_9); - *d5s16 = vdup_n_s16(sinpi_4_9); -} - -static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, - int16x4_t *d2s16, int16x8_t *q8s16, - int16x8_t *q9s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - int32x4_t q10s32, q13s32, q14s32, q15s32; - int16x8_t q13s16, q14s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, *d2s16); - q10s32 = vmull_s16(d17s16, *d0s16); - q13s32 = vmull_s16(d23s16, *d1s16); - q14s32 = vmull_s16(d24s16, *d1s16); - q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); - q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); - - d26s16 = vrshrn_n_s32(q13s32, 14); - d27s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d28s16 = vrshrn_n_s32(q10s32, 14); - - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - *q8s16 = vaddq_s16(q13s16, q14s16); - *q9s16 = vsubq_s16(q13s16, q14s16); - *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp -} - -static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, - int16x4_t *d5s16, int16x8_t *q3s16, - int16x8_t *q8s16, int16x8_t *q9s16) { - int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16; - int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d6s16 = vget_low_s16(*q3s16); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - - q10s32 = vmull_s16(*d3s16, d16s16); - q11s32 = vmull_s16(*d4s16, 
d16s16); - q12s32 = vmull_s16(d6s16, d17s16); - q13s32 = vmull_s16(*d5s16, d18s16); - q14s32 = vmull_s16(*d3s16, d18s16); - q15s32 = vmovl_s16(d16s16); - q15s32 = vaddw_s16(q15s32, d19s16); - q8s32 = vmull_s16(*d4s16, d19s16); - q15s32 = vsubw_s16(q15s32, d18s16); - q9s32 = vmull_s16(*d5s16, d19s16); - - q10s32 = vaddq_s32(q10s32, q13s32); - q10s32 = vaddq_s32(q10s32, q8s32); - q11s32 = vsubq_s32(q11s32, q14s32); - q8s32 = vdupq_n_s32(sinpi_3_9); - q11s32 = vsubq_s32(q11s32, q9s32); - q15s32 = vmulq_s32(q15s32, q8s32); - - q13s32 = vaddq_s32(q10s32, q12s32); - q10s32 = vaddq_s32(q10s32, q11s32); - q14s32 = vaddq_s32(q11s32, q12s32); - q10s32 = vsubq_s32(q10s32, q12s32); - - d16s16 = vrshrn_n_s32(q13s32, 14); - d17s16 = vrshrn_n_s32(q14s32, 14); - d18s16 = vrshrn_n_s32(q15s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - - *q8s16 = vcombine_s16(d16s16, d17s16); - *q9s16 = vcombine_s16(d18s16, d19s16); -} - void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - uint8x8_t d26u8, d27u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; - uint32x2_t d26u32, d27u32; - int16x8_t q3s16, q8s16, q9s16; - uint16x8_t q8u16, q9u16; - - d26u32 = d27u32 = vdup_n_u32(0); + int16x8_t a[2]; + uint8x8_t s[2], d[2]; + uint16x8_t sum[2]; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); - TRANSPOSE4X4(&q8s16, &q9s16); + a[0] = load_tran_low_to_s16q(input); + a[1] = load_tran_low_to_s16q(input + 8); + transpose_s16_4x4q(&a[0], &a[1]); switch (tx_type) { - case 0: // idct_idct is not supported. 
Fall back to C - vp9_iht4x4_16_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate constants - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - case 2: // idct_iadst - // generate constantsyy - GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); + case ADST_DCT: + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; - case 3: // iadst_iadst - // generate constants - GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); - - // first transform rows - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); - // transpose the matrix - TRANSPOSE4X4(&q8s16, &q9s16); - - // then transform columns - IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); + case DCT_ADST: + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + idct4x4_16_kernel_bd8(a); + a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1])); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); + iadst4(a); + transpose_s16_4x4q(&a[0], &a[1]); + iadst4(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = 
vrshrq_n_s16(q9s16, 4); - - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); - dest += stride; - d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); - dest += stride; - d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); - dest -= stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); + a[0] = vrshrq_n_s16(a[0], 4); + a[1] = vrshrq_n_s16(a[1], 4); + s[0] = load_u8(dest, stride); + s[1] = load_u8(dest + 2 * stride, stride); + sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]); + sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]); + d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0])); + d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1])); + store_u8(dest, stride, d[0]); + store_u8(dest + 2 * stride, stride, d[1]); } diff --git a/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index 1c739861c..46ee632e0 100644 --- a/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ b/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -14,527 +14,55 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/arm/neon/vp9_iht_neon.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" -static int16_t cospi_2_64 = 16305; -static int16_t cospi_4_64 = 16069; -static int16_t cospi_6_64 = 15679; -static int16_t 
cospi_8_64 = 15137; -static int16_t cospi_10_64 = 14449; -static int16_t cospi_12_64 = 13623; -static int16_t cospi_14_64 = 12665; -static int16_t cospi_16_64 = 11585; -static int16_t cospi_18_64 = 10394; -static int16_t cospi_20_64 = 9102; -static int16_t cospi_22_64 = 7723; -static int16_t cospi_24_64 = 6270; -static int16_t cospi_26_64 = 4756; -static int16_t cospi_28_64 = 3196; -static int16_t cospi_30_64 = 1606; - -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = 
vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vrshrn_n_s32(q2s32, 14); - d9s16 = vrshrn_n_s32(q3s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vrshrn_n_s32(q2s32, 14); - d15s16 = vrshrn_n_s32(q3s32, 14); - d12s16 = vrshrn_n_s32(q9s32, 14); - d13s16 = vrshrn_n_s32(q13s32, 14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16(cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); - - d18s16 = vrshrn_n_s32(q2s32, 14); - d19s16 = vrshrn_n_s32(q3s32, 14); - d22s16 = vrshrn_n_s32(q13s32, 14); - d23s16 = vrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vrshrn_n_s32(q2s32, 14); - d27s16 = vrshrn_n_s32(q3s32, 14); - d30s16 = vrshrn_n_s32(q8s32, 14); - d31s16 = vrshrn_n_s32(q12s32, 14); - *q13s16 = 
vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16(cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vrshrn_n_s32(q9s32, 14); - d11s16 = vrshrn_n_s32(q10s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); -} - -static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q2s16, q4s16, q5s16, q6s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, 
q6s32, q7s32, q8s32; - int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - d14s16 = vdup_n_s16(cospi_2_64); - d15s16 = vdup_n_s16(cospi_30_64); - - q1s32 = vmull_s16(d30s16, d14s16); - q2s32 = vmull_s16(d31s16, d14s16); - q3s32 = vmull_s16(d30s16, d15s16); - q4s32 = vmull_s16(d31s16, d15s16); - - d30s16 = vdup_n_s16(cospi_18_64); - d31s16 = vdup_n_s16(cospi_14_64); - - q1s32 = vmlal_s16(q1s32, d16s16, d15s16); - q2s32 = vmlal_s16(q2s32, d17s16, d15s16); - q3s32 = vmlsl_s16(q3s32, d16s16, d14s16); - q4s32 = vmlsl_s16(q4s32, d17s16, d14s16); - - q5s32 = vmull_s16(d22s16, d30s16); - q6s32 = vmull_s16(d23s16, d30s16); - q7s32 = vmull_s16(d22s16, d31s16); - q8s32 = vmull_s16(d23s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d24s16, d31s16); - q6s32 = vmlal_s16(q6s32, d25s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d24s16, d30s16); - q8s32 = vmlsl_s16(q8s32, d25s16, d30s16); - - q11s32 = vaddq_s32(q1s32, q5s32); - q12s32 = vaddq_s32(q2s32, q6s32); - q1s32 = vsubq_s32(q1s32, q5s32); - q2s32 = vsubq_s32(q2s32, q6s32); - - d22s16 = vrshrn_n_s32(q11s32, 14); - d23s16 = vrshrn_n_s32(q12s32, 14); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q12s32 = vaddq_s32(q3s32, q7s32); - q15s32 = vaddq_s32(q4s32, q8s32); - q3s32 = vsubq_s32(q3s32, q7s32); - q4s32 = vsubq_s32(q4s32, q8s32); - - d2s16 = vrshrn_n_s32(q1s32, 14); - d3s16 = vrshrn_n_s32(q2s32, 14); - d24s16 = vrshrn_n_s32(q12s32, 14); - d25s16 = vrshrn_n_s32(q15s32, 14); - 
d6s16 = vrshrn_n_s32(q3s32, 14); - d7s16 = vrshrn_n_s32(q4s32, 14); - *q12s16 = vcombine_s16(d24s16, d25s16); - - d0s16 = vdup_n_s16(cospi_10_64); - d1s16 = vdup_n_s16(cospi_22_64); - q4s32 = vmull_s16(d26s16, d0s16); - q5s32 = vmull_s16(d27s16, d0s16); - q2s32 = vmull_s16(d26s16, d1s16); - q6s32 = vmull_s16(d27s16, d1s16); - - d30s16 = vdup_n_s16(cospi_26_64); - d31s16 = vdup_n_s16(cospi_6_64); - - q4s32 = vmlal_s16(q4s32, d20s16, d1s16); - q5s32 = vmlal_s16(q5s32, d21s16, d1s16); - q2s32 = vmlsl_s16(q2s32, d20s16, d0s16); - q6s32 = vmlsl_s16(q6s32, d21s16, d0s16); - - q0s32 = vmull_s16(d18s16, d30s16); - q13s32 = vmull_s16(d19s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d31s16); - q13s32 = vmlal_s16(q13s32, d29s16, d31s16); - - q10s32 = vmull_s16(d18s16, d31s16); - q9s32 = vmull_s16(d19s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d28s16, d30s16); - q9s32 = vmlsl_s16(q9s32, d29s16, d30s16); - - q14s32 = vaddq_s32(q2s32, q10s32); - q15s32 = vaddq_s32(q6s32, q9s32); - q2s32 = vsubq_s32(q2s32, q10s32); - q6s32 = vsubq_s32(q6s32, q9s32); - - d28s16 = vrshrn_n_s32(q14s32, 14); - d29s16 = vrshrn_n_s32(q15s32, 14); - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q6s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - q9s32 = vaddq_s32(q4s32, q0s32); - q10s32 = vaddq_s32(q5s32, q13s32); - q4s32 = vsubq_s32(q4s32, q0s32); - q5s32 = vsubq_s32(q5s32, q13s32); - - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); - - d18s16 = vrshrn_n_s32(q9s32, 14); - d19s16 = vrshrn_n_s32(q10s32, 14); - d8s16 = vrshrn_n_s32(q4s32, 14); - d9s16 = vrshrn_n_s32(q5s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q5s32 = vmull_s16(d2s16, d30s16); - q6s32 = vmull_s16(d3s16, d30s16); - q7s32 = vmull_s16(d2s16, d31s16); - q0s32 = vmull_s16(d3s16, d31s16); - - q5s32 = vmlal_s16(q5s32, d6s16, d31s16); - q6s32 = vmlal_s16(q6s32, d7s16, d31s16); - q7s32 = vmlsl_s16(q7s32, d6s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d7s16, d30s16); - - q1s32 = vmull_s16(d4s16, 
d30s16); - q3s32 = vmull_s16(d5s16, d30s16); - q10s32 = vmull_s16(d4s16, d31s16); - q2s32 = vmull_s16(d5s16, d31s16); - - q1s32 = vmlsl_s16(q1s32, d8s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d9s16, d31s16); - q10s32 = vmlal_s16(q10s32, d8s16, d30s16); - q2s32 = vmlal_s16(q2s32, d9s16, d30s16); - - *q8s16 = vaddq_s16(*q11s16, *q9s16); - *q11s16 = vsubq_s16(*q11s16, *q9s16); - q4s16 = vaddq_s16(*q12s16, *q14s16); - *q12s16 = vsubq_s16(*q12s16, *q14s16); - - q14s32 = vaddq_s32(q5s32, q1s32); - q15s32 = vaddq_s32(q6s32, q3s32); - q5s32 = vsubq_s32(q5s32, q1s32); - q6s32 = vsubq_s32(q6s32, q3s32); - - d18s16 = vrshrn_n_s32(q14s32, 14); - d19s16 = vrshrn_n_s32(q15s32, 14); - d10s16 = vrshrn_n_s32(q5s32, 14); - d11s16 = vrshrn_n_s32(q6s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - - q1s32 = vaddq_s32(q7s32, q10s32); - q3s32 = vaddq_s32(q0s32, q2s32); - q7s32 = vsubq_s32(q7s32, q10s32); - q0s32 = vsubq_s32(q0s32, q2s32); - - d28s16 = vrshrn_n_s32(q1s32, 14); - d29s16 = vrshrn_n_s32(q3s32, 14); - d14s16 = vrshrn_n_s32(q7s32, 14); - d15s16 = vrshrn_n_s32(q0s32, 14); - *q14s16 = vcombine_s16(d28s16, d29s16); - - d30s16 = vdup_n_s16(cospi_16_64); - - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - q2s32 = vmull_s16(d22s16, d30s16); - q3s32 = vmull_s16(d23s16, d30s16); - q13s32 = vmull_s16(d22s16, d30s16); - q1s32 = vmull_s16(d23s16, d30s16); - - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - q2s32 = vmlal_s16(q2s32, d24s16, d30s16); - q3s32 = vmlal_s16(q3s32, d25s16, d30s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); - q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); - - d4s16 = vrshrn_n_s32(q2s32, 14); - d5s16 = vrshrn_n_s32(q3s32, 14); - d24s16 = vrshrn_n_s32(q13s32, 14); - d25s16 = vrshrn_n_s32(q1s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - *q12s16 = vcombine_s16(d24s16, d25s16); - - q13s32 = vmull_s16(d10s16, d30s16); - q1s32 = vmull_s16(d11s16, d30s16); - q11s32 = vmull_s16(d10s16, d30s16); - q0s32 = vmull_s16(d11s16, 
d30s16); - - q13s32 = vmlal_s16(q13s32, d14s16, d30s16); - q1s32 = vmlal_s16(q1s32, d15s16, d30s16); - q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); - q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); - - d20s16 = vrshrn_n_s32(q13s32, 14); - d21s16 = vrshrn_n_s32(q1s32, 14); - d12s16 = vrshrn_n_s32(q11s32, 14); - d13s16 = vrshrn_n_s32(q0s32, 14); - *q10s16 = vcombine_s16(d20s16, d21s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q5s16 = vdupq_n_s16(0); - - *q9s16 = vsubq_s16(q5s16, *q9s16); - *q11s16 = vsubq_s16(q5s16, q2s16); - *q13s16 = vsubq_s16(q5s16, q6s16); - *q15s16 = vsubq_s16(q5s16, q4s16); -} - void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { - int i; - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 8 * 2); - q11s16 = vld1q_s16(input + 8 * 3); - q12s16 = vld1q_s16(input + 8 * 4); - q13s16 = vld1q_s16(input + 8 * 5); - q14s16 = vld1q_s16(input + 8 * 6); - q15s16 = vld1q_s16(input + 8 * 7); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a[8]; + + a[0] = load_tran_low_to_s16q(input + 0 * 8); + a[1] = load_tran_low_to_s16q(input + 1 * 8); + a[2] = load_tran_low_to_s16q(input + 2 * 8); + a[3] = load_tran_low_to_s16q(input + 3 * 8); + a[4] = load_tran_low_to_s16q(input + 4 * 8); + a[5] = load_tran_low_to_s16q(input + 5 * 8); + a[6] = load_tran_low_to_s16q(input + 6 * 8); + a[7] = load_tran_low_to_s16q(input + 7 * 8); + + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); switch (tx_type) { - case 0: // 
idct_idct is not supported. Fall back to C - vp9_iht8x8_64_add_c(input, dest, stride, tx_type); - return; - case 1: // iadst_idct - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // first transform rows - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - case 2: // idct_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - // generate IDCT constants - // GENERATE_IDCT_CONSTANTS - - // then transform columns - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case ADST_DCT: + idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; - case 3: // iadst_iadst - // generate IADST constants - // GENERATE_IADST_CONSTANTS - - // first transform rows - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // transpose the matrix - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - // then transform columns - IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + case DCT_ADST: + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); 
+ idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a); break; - default: // iadst_idct - assert(0); + + default: + assert(tx_type == ADST_ADST); + iadst8(a); + transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + iadst8(a); break; } - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - for (d1 = d2 = dest, i = 0; i < 2; i++) { - if (i != 0) { - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - } - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = - vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = - vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += stride; - } + idct8x8_add8x8_neon(a, dest, stride); } diff --git a/libvpx/vp9/common/arm/neon/vp9_iht_neon.h b/libvpx/vp9/common/arm/neon/vp9_iht_neon.h new file mode 100644 index 000000000..c64822e27 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_iht_neon.h @@ -0,0 +1,272 @@ +/* + * Copyright (c) 
2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ +#define VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ + +#include <arm_neon.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void iadst4(int16x8_t *const io) { + const int32x4_t c3 = vdupq_n_s32(sinpi_3_9); + int16x4_t x[4]; + int32x4_t s[8], output[4]; + const int16x4_t c = + create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9); + + x[0] = vget_low_s16(io[0]); + x[1] = vget_low_s16(io[1]); + x[2] = vget_high_s16(io[0]); + x[3] = vget_high_s16(io[1]); + + s[0] = vmull_lane_s16(x[0], c, 0); + s[1] = vmull_lane_s16(x[0], c, 1); + s[2] = vmull_lane_s16(x[1], c, 2); + s[3] = vmull_lane_s16(x[2], c, 3); + s[4] = vmull_lane_s16(x[2], c, 0); + s[5] = vmull_lane_s16(x[3], c, 1); + s[6] = vmull_lane_s16(x[3], c, 3); + s[7] = vaddl_s16(x[0], x[3]); + s[7] = vsubw_s16(s[7], x[2]); + + s[0] = vaddq_s32(s[0], s[3]); + s[0] = vaddq_s32(s[0], s[5]); + s[1] = vsubq_s32(s[1], s[4]); + s[1] = vsubq_s32(s[1], s[6]); + s[3] = s[2]; + s[2] = vmulq_s32(c3, s[7]); + + output[0] = vaddq_s32(s[0], s[3]); + output[1] = vaddq_s32(s[1], s[3]); + output[2] = s[2]; + output[3] = vaddq_s32(s[0], s[1]); + output[3] = vsubq_s32(output[3], s[3]); + dct_const_round_shift_low_8_dual(output, &io[0], &io[1]); +} + +static INLINE void iadst_half_butterfly_neon(int16x8_t *const x, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. 
+ const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + x[0] = dct_const_round_shift_low_8(t0); + x[1] = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_half_butterfly_neg_neon(int16x8_t *const x0, + int16x8_t *const x1, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. + const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 1); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 1); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 1); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 1); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + *x1 = dct_const_round_shift_low_8(t0); + *x0 = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_half_butterfly_pos_neon(int16x8_t *const x0, + int16x8_t *const x1, + const int16x4_t c) { + // Don't add/sub before multiply, which will overflow in iadst8. 
+ const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(*x0), c, 0); + const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(*x0), c, 0); + const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(*x1), c, 0); + const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(*x1), c, 0); + int32x4_t t0[2], t1[2]; + + t0[0] = vaddq_s32(x0_lo, x1_lo); + t0[1] = vaddq_s32(x0_hi, x1_hi); + t1[0] = vsubq_s32(x0_lo, x1_lo); + t1[1] = vsubq_s32(x0_hi, x1_hi); + *x1 = dct_const_round_shift_low_8(t0); + *x0 = dct_const_round_shift_low_8(t1); +} + +static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); +} + +static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); +} + +static INLINE void iadst_butterfly_lane_1_0_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); + s0[1] = 
vmull_lane_s16(vget_high_s16(in0), c, 1); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); +} + +static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0, + const int16x8_t in1, + const int16x4_t c, + int32x4_t *const s0, + int32x4_t *const s1) { + s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); + s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); + s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); + s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); + + s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); + s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); + s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); + s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); +} + +static INLINE int16x8_t add_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; + + sum[0] = vaddq_s32(in0[0], in1[0]); + sum[1] = vaddq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} + +static INLINE int16x8_t sub_dct_const_round_shift_low_8( + const int32x4_t *const in0, const int32x4_t *const in1) { + int32x4_t sum[2]; + + sum[0] = vsubq_s32(in0[0], in1[0]); + sum[1] = vsubq_s32(in0[1], in1[1]); + return dct_const_round_shift_low_8(sum); +} + +static INLINE void iadst8(int16x8_t *const io) { + const int16x4_t c0 = + create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); + const int16x4_t c1 = + create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); + const int16x4_t c2 = + create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); + int16x8_t x[8], t[4]; + int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + + x[0] = io[7]; + x[1] = io[0]; + x[2] = io[5]; + x[3] = 
io[2]; + x[4] = io[3]; + x[5] = io[4]; + x[6] = io[1]; + x[7] = io[6]; + + // stage 1 + iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1); + iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3); + iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5); + iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7); + + x[0] = add_dct_const_round_shift_low_8(s0, s4); + x[1] = add_dct_const_round_shift_low_8(s1, s5); + x[2] = add_dct_const_round_shift_low_8(s2, s6); + x[3] = add_dct_const_round_shift_low_8(s3, s7); + x[4] = sub_dct_const_round_shift_low_8(s0, s4); + x[5] = sub_dct_const_round_shift_low_8(s1, s5); + x[6] = sub_dct_const_round_shift_low_8(s2, s6); + x[7] = sub_dct_const_round_shift_low_8(s3, s7); + + // stage 2 + t[0] = x[0]; + t[1] = x[1]; + t[2] = x[2]; + t[3] = x[3]; + iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5); + iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6); + + x[0] = vaddq_s16(t[0], t[2]); + x[1] = vaddq_s16(t[1], t[3]); + x[2] = vsubq_s16(t[0], t[2]); + x[3] = vsubq_s16(t[1], t[3]); + x[4] = add_dct_const_round_shift_low_8(s4, s6); + x[5] = add_dct_const_round_shift_low_8(s5, s7); + x[6] = sub_dct_const_round_shift_low_8(s4, s6); + x[7] = sub_dct_const_round_shift_low_8(s5, s7); + + // stage 3 + iadst_half_butterfly_neon(x + 2, c2); + iadst_half_butterfly_neon(x + 6, c2); + + io[0] = x[0]; + io[1] = vnegq_s16(x[4]); + io[2] = x[6]; + io[3] = vnegq_s16(x[2]); + io[4] = x[3]; + io[5] = vnegq_s16(x[7]); + io[6] = x[5]; + io[7] = vnegq_s16(x[1]); +} + +void vpx_iadst16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +typedef void (*iht_1d)(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +typedef struct { + iht_1d cols, rows; // vertical and horizontal +} iht_2d; + +#endif // VPX_VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_ diff --git a/libvpx/vp9/common/ppc/vp9_idct_vsx.c 
b/libvpx/vp9/common/ppc/vp9_idct_vsx.c new file mode 100644 index 000000000..1b2a93edb --- /dev/null +++ b/libvpx/vp9/common/ppc/vp9_idct_vsx.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/ppc/inv_txfm_vsx.h" +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" + +#include "vp9/common/vp9_enums.h" + +void vp9_iht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[2], out[2]; + + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + + switch (tx_type) { + case DCT_DCT: + vpx_idct4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + case ADST_DCT: + vpx_idct4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst4_vsx(in, out); + vpx_idct4_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst4_vsx(in, out); + vp9_iadst4_vsx(out, in); + break; + } + + vpx_round_store4x4_vsx(in, out, dest, stride); +} + +void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { + int16x8_t in[8], out[8]; + + // load input data + in[0] = load_tran_low(0, input); + in[1] = load_tran_low(8 * sizeof(*input), input); + in[2] = load_tran_low(2 * 8 * sizeof(*input), input); + in[3] = load_tran_low(3 * 8 * sizeof(*input), input); + in[4] = load_tran_low(4 * 8 * sizeof(*input), input); + in[5] = load_tran_low(5 * 8 * sizeof(*input), input); + in[6] = load_tran_low(6 * 8 * sizeof(*input), input); + in[7] = load_tran_low(7 * 8 * sizeof(*input), input); + + switch (tx_type) { + 
case DCT_DCT: + vpx_idct8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + case ADST_DCT: + vpx_idct8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + case DCT_ADST: + vp9_iadst8_vsx(in, out); + vpx_idct8_vsx(out, in); + break; + default: + assert(tx_type == ADST_ADST); + vp9_iadst8_vsx(in, out); + vp9_iadst8_vsx(out, in); + break; + } + + vpx_round_store8x8_vsx(in, dest, stride); +} + +void vp9_iht16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride, int tx_type) { + int16x8_t in0[16], in1[16]; + + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), in0); + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), in1); + + switch (tx_type) { + case DCT_DCT: + vpx_idct16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + case ADST_DCT: + vpx_idct16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + case DCT_ADST: + vpx_iadst16_vsx(in0, in1); + vpx_idct16_vsx(in0, in1); + break; + default: + assert(tx_type == ADST_ADST); + vpx_iadst16_vsx(in0, in1); + vpx_iadst16_vsx(in0, in1); + break; + } + + vpx_round_store16x16_vsx(in0, in1, dest, stride); +} diff --git a/libvpx/vp9/common/vp9_alloccommon.h b/libvpx/vp9/common/vp9_alloccommon.h index a3a163857..5faa4f2be 100644 --- a/libvpx/vp9/common/vp9_alloccommon.h +++ b/libvpx/vp9/common/vp9_alloccommon.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_ -#define VP9_COMMON_VP9_ALLOCCOMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ +#define VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ #define INVALID_IDX -1 // Invalid buffer index. 
@@ -41,4 +41,4 @@ void vp9_swap_current_and_last_seg_map(struct VP9Common *cm); } // extern "C" #endif -#endif // VP9_COMMON_VP9_ALLOCCOMMON_H_ +#endif // VPX_VP9_COMMON_VP9_ALLOCCOMMON_H_ diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h index 780b29208..f0887157e 100644 --- a/libvpx/vp9/common/vp9_blockd.h +++ b/libvpx/vp9/common/vp9_blockd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_BLOCKD_H_ -#define VP9_COMMON_VP9_BLOCKD_H_ +#ifndef VPX_VP9_COMMON_VP9_BLOCKD_H_ +#define VPX_VP9_COMMON_VP9_BLOCKD_H_ #include "./vpx_config.h" @@ -60,6 +60,7 @@ typedef struct { #define GOLDEN_FRAME 2 #define ALTREF_FRAME 3 #define MAX_REF_FRAMES 4 + typedef int8_t MV_REFERENCE_FRAME; // This structure now relates to 8x8 block regions. @@ -130,6 +131,8 @@ struct macroblockd_plane { // encoder const int16_t *dequant; + + int *eob; }; #define BLOCK_OFFSET(x, i) ((x) + (i)*16) @@ -193,6 +196,8 @@ typedef struct macroblockd { int corrupted; struct vpx_internal_error_info *error_info; + + PARTITION_TYPE *partition; } MACROBLOCKD; static INLINE PLANE_TYPE get_plane_type(int plane) { @@ -285,4 +290,4 @@ void vp9_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd, } // extern "C" #endif -#endif // VP9_COMMON_VP9_BLOCKD_H_ +#endif // VPX_VP9_COMMON_VP9_BLOCKD_H_ diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h index 666c3beaf..ae8dad38e 100644 --- a/libvpx/vp9/common/vp9_common.h +++ b/libvpx/vp9/common/vp9_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_COMMON_H_ -#define VP9_COMMON_VP9_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_COMMON_H_ +#define VPX_VP9_COMMON_VP9_COMMON_H_ /* Interface header for common constant data structures and lookup tables */ @@ -75,4 +75,4 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { } // extern "C" #endif -#endif // VP9_COMMON_VP9_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c index 4a1083322..809d7317c 100644 --- a/libvpx/vp9/common/vp9_common_data.c +++ b/libvpx/vp9/common/vp9_common_data.c @@ -28,7 +28,7 @@ const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 1, 2, 2, const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] = { 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8 }; -// VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize))) +// VPXMIN(3, VPXMIN(b_width_log2_lookup(bsize), b_height_log2_lookup(bsize))) const uint8_t size_group_lookup[BLOCK_SIZES] = { 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3 }; diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h index 5c6a7e8ff..a533c5f05 100644 --- a/libvpx/vp9/common/vp9_common_data.h +++ b/libvpx/vp9/common/vp9_common_data.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_COMMON_DATA_H_ -#define VP9_COMMON_VP9_COMMON_DATA_H_ +#ifndef VPX_VP9_COMMON_VP9_COMMON_DATA_H_ +#define VPX_VP9_COMMON_VP9_COMMON_DATA_H_ #include "vp9/common/vp9_enums.h" #include "vpx/vpx_integer.h" @@ -42,4 +42,4 @@ extern const uint8_t need_top_left[INTRA_MODES]; } // extern "C" #endif -#endif // VP9_COMMON_VP9_COMMON_DATA_H_ +#endif // VPX_VP9_COMMON_VP9_COMMON_DATA_H_ diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c index a575bda72..430b917b8 100644 --- a/libvpx/vp9/common/vp9_entropy.c +++ b/libvpx/vp9/common/vp9_entropy.c @@ -42,6 +42,7 @@ const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254, 177, 153, 140, 133, 130, 129 }; #endif +/* clang-format off */ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, // beyond MAXBAND_INDEX+1 all values are filled as 5 @@ -85,6 +86,7 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, }; +/* clang-format on */ const uint8_t vp9_coefband_trans_4x4[16] = { 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h index 1da491166..d026651df 100644 --- a/libvpx/vp9/common/vp9_entropy.h +++ b/libvpx/vp9/common/vp9_entropy.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_ENTROPY_H_ -#define VP9_COMMON_VP9_ENTROPY_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPY_H_ +#define VPX_VP9_COMMON_VP9_ENTROPY_H_ #include "vpx/vpx_integer.h" #include "vpx_dsp/prob.h" @@ -137,7 +137,6 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) { // 128 lists of probabilities are stored for the following ONE node probs: // 1, 3, 5, 7, ..., 253, 255 // In between probabilities are interpolated linearly - #define COEFF_PROB_MODELS 255 #define UNCONSTRAINED_NODES 3 @@ -195,4 +194,4 @@ static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPY_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPY_H_ diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index 47cd63e94..48cad3318 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -186,16 +186,19 @@ const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] = { 93, 24, 99 }, // a split, l not split { 85, 119, 44 }, // l split, a not split { 62, 59, 67 }, // a/l both split + // 16x16 -> 8x8 { 149, 53, 53 }, // a/l both not split { 94, 20, 48 }, // a split, l not split { 83, 53, 24 }, // l split, a not split { 52, 18, 18 }, // a/l both split + // 32x32 -> 16x16 { 150, 40, 39 }, // a/l both not split { 78, 12, 26 }, // a split, l not split { 67, 33, 11 }, // l split, a not split { 24, 7, 5 }, // a/l both split + // 64x64 -> 32x32 { 174, 35, 49 }, // a/l both not split { 68, 11, 27 }, // a split, l not split diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h index 0ee663fe8..a756c8d0b 100644 --- a/libvpx/vp9/common/vp9_entropymode.h +++ b/libvpx/vp9/common/vp9_entropymode.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_ -#define VP9_COMMON_VP9_ENTROPYMODE_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ +#define VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymv.h" @@ -104,4 +104,4 @@ void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPYMODE_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c index a18a290cf..b6f052d08 100644 --- a/libvpx/vp9/common/vp9_entropymv.c +++ b/libvpx/vp9/common/vp9_entropymv.c @@ -22,9 +22,7 @@ const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = { 18, -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10, }; -const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { - -0, -1, -}; +const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 }; const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1, 4, -2, -3 }; diff --git a/libvpx/vp9/common/vp9_entropymv.h b/libvpx/vp9/common/vp9_entropymv.h index e2fe37a32..ee9d37973 100644 --- a/libvpx/vp9/common/vp9_entropymv.h +++ b/libvpx/vp9/common/vp9_entropymv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_ENTROPYMV_H_ -#define VP9_COMMON_VP9_ENTROPYMV_H_ +#ifndef VPX_VP9_COMMON_VP9_ENTROPYMV_H_ +#define VPX_VP9_COMMON_VP9_ENTROPYMV_H_ #include "./vpx_config.h" @@ -25,7 +25,7 @@ struct VP9Common; void vp9_init_mv_probs(struct VP9Common *cm); -void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); +void vp9_adapt_mv_probs(struct VP9Common *cm, int allow_hp); static INLINE int use_mv_hp(const MV *ref) { const int kMvRefThresh = 64; // threshold for use of high-precision 1/8 mv @@ -127,10 +127,10 @@ typedef struct { nmv_component_counts comps[2]; } nmv_context_counts; -void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); +void vp9_inc_mv(const MV *mv, nmv_context_counts *counts); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENTROPYMV_H_ +#endif // VPX_VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h index 056b298b3..bc665534d 100644 --- a/libvpx/vp9/common/vp9_enums.h +++ b/libvpx/vp9/common/vp9_enums.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_ENUMS_H_ -#define VP9_COMMON_VP9_ENUMS_H_ +#ifndef VPX_VP9_COMMON_VP9_ENUMS_H_ +#define VPX_VP9_COMMON_VP9_ENUMS_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -140,4 +140,4 @@ typedef uint8_t PREDICTION_MODE; } // extern "C" #endif -#endif // VP9_COMMON_VP9_ENUMS_H_ +#endif // VPX_VP9_COMMON_VP9_ENUMS_H_ diff --git a/libvpx/vp9/common/vp9_filter.c b/libvpx/vp9/common/vp9_filter.c index 6c43af8ce..adbda6c82 100644 --- a/libvpx/vp9/common/vp9_filter.c +++ b/libvpx/vp9/common/vp9_filter.c @@ -63,6 +63,20 @@ DECLARE_ALIGNED(256, static const InterpKernel, { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 1, 38, 64, 32, -1, -3 } }; -const InterpKernel *vp9_filter_kernels[4] = { - sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters +// 4-tap filter +DECLARE_ALIGNED(256, static const InterpKernel, + sub_pel_filters_4[SUBPEL_SHIFTS]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 }, + { 0, 0, -6, 120, 18, -4, 0, 0 }, { 0, 0, -8, 114, 28, -6, 0, 0 }, + { 0, 0, -10, 108, 36, -6, 0, 0 }, { 0, 0, -12, 102, 46, -8, 0, 0 }, + { 0, 0, -12, 94, 56, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 }, + { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 }, + { 0, 0, -10, 56, 94, -12, 0, 0 }, { 0, 0, -8, 46, 102, -12, 0, 0 }, + { 0, 0, -6, 36, 108, -10, 0, 0 }, { 0, 0, -6, 28, 114, -8, 0, 0 }, + { 0, 0, -4, 18, 120, -6, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 } +}; + +const InterpKernel *vp9_filter_kernels[5] = { + sub_pel_filters_8, sub_pel_filters_8lp, sub_pel_filters_8s, bilinear_filters, + sub_pel_filters_4 }; diff --git a/libvpx/vp9/common/vp9_filter.h b/libvpx/vp9/common/vp9_filter.h index 9d2b8e1db..0382c88e7 100644 --- a/libvpx/vp9/common/vp9_filter.h +++ b/libvpx/vp9/common/vp9_filter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_FILTER_H_ -#define VP9_COMMON_VP9_FILTER_H_ +#ifndef VPX_VP9_COMMON_VP9_FILTER_H_ +#define VPX_VP9_COMMON_VP9_FILTER_H_ #include "./vpx_config.h" #include "vpx/vpx_integer.h" @@ -25,6 +25,7 @@ extern "C" { #define EIGHTTAP_SHARP 2 #define SWITCHABLE_FILTERS 3 /* Number of switchable filters */ #define BILINEAR 3 +#define FOURTAP 4 // The codec can operate in four possible inter prediction filter mode: // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three. #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1) @@ -32,10 +33,10 @@ extern "C" { typedef uint8_t INTERP_FILTER; -extern const InterpKernel *vp9_filter_kernels[4]; +extern const InterpKernel *vp9_filter_kernels[5]; #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_COMMON_VP9_FILTER_H_ +#endif // VPX_VP9_COMMON_VP9_FILTER_H_ diff --git a/libvpx/vp9/common/vp9_frame_buffers.h b/libvpx/vp9/common/vp9_frame_buffers.h index e2cfe61b6..11be838c0 100644 --- a/libvpx/vp9/common/vp9_frame_buffers.h +++ b/libvpx/vp9/common/vp9_frame_buffers.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_FRAME_BUFFERS_H_ -#define VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#ifndef VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#define VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ #include "vpx/vpx_frame_buffer.h" #include "vpx/vpx_integer.h" @@ -50,4 +50,4 @@ int vp9_release_frame_buffer(void *cb_priv, vpx_codec_frame_buffer_t *fb); } // extern "C" #endif -#endif // VP9_COMMON_VP9_FRAME_BUFFERS_H_ +#endif // VPX_VP9_COMMON_VP9_FRAME_BUFFERS_H_ diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h index 3e83b8402..94eeaf599 100644 --- a/libvpx/vp9/common/vp9_idct.h +++ b/libvpx/vp9/common/vp9_idct.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_IDCT_H_ -#define VP9_COMMON_VP9_IDCT_H_ +#ifndef VPX_VP9_COMMON_VP9_IDCT_H_ +#define VPX_VP9_COMMON_VP9_IDCT_H_ #include <assert.h> @@ -78,4 +78,4 @@ void vp9_highbd_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, } // extern "C" #endif -#endif // VP9_COMMON_VP9_IDCT_H_ +#endif // VPX_VP9_COMMON_VP9_IDCT_H_ diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index c7c343aed..95d6029f3 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -880,12 +880,12 @@ void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, - MODE_INFO **mi, const int mode_info_stride, + MODE_INFO **mi8x8, const int mode_info_stride, LOOP_FILTER_MASK *lfm) { int idx_32, idx_16, idx_8; const loop_filter_info_n *const lfi_n = &cm->lf_info; - MODE_INFO **mip = mi; - MODE_INFO **mip2 = mi; + MODE_INFO **mip = mi8x8; + MODE_INFO **mip2 = mi8x8; // These are offsets to the next mi in the 64x64 block. It is what gets // added to the mi ptr as we go through each loop. 
It helps us to avoid @@ -1087,13 +1087,19 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, const int row_step_stride = cm->mi_stride * row_step; struct buf_2d *const dst = &plane->dst; uint8_t *const dst0 = dst->buf; - unsigned int mask_16x16[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_8x8[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_4x4[MI_BLOCK_SIZE] = { 0 }; - unsigned int mask_4x4_int[MI_BLOCK_SIZE] = { 0 }; + unsigned int mask_16x16[MI_BLOCK_SIZE]; + unsigned int mask_8x8[MI_BLOCK_SIZE]; + unsigned int mask_4x4[MI_BLOCK_SIZE]; + unsigned int mask_4x4_int[MI_BLOCK_SIZE]; uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE]; int r, c; + vp9_zero(mask_16x16); + vp9_zero(mask_8x8); + vp9_zero(mask_4x4); + vp9_zero(mask_4x4_int); + vp9_zero(lfl); + for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) { unsigned int mask_16x16_c = 0; unsigned int mask_8x8_c = 0; @@ -1174,7 +1180,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm, } // Disable filtering on the leftmost column - border_mask = ~(mi_col == 0); + border_mask = ~(mi_col == 0 ? 1 : 0); #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { highbd_filter_selectively_vert( @@ -1330,6 +1336,8 @@ void vp9_filter_block_plane_ss11(VP9_COMMON *const cm, uint16_t mask_4x4 = lfm->left_uv[TX_4X4]; uint16_t mask_4x4_int = lfm->int_4x4_uv; + vp9_zero(lfl_uv); + assert(plane->subsampling_x == 1 && plane->subsampling_y == 1); // Vertical pass: do 2 rows at one time diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index 481a6cdc6..39648a72c 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_LOOPFILTER_H_ -#define VP9_COMMON_VP9_LOOPFILTER_H_ +#ifndef VPX_VP9_COMMON_VP9_LOOPFILTER_H_ +#define VPX_VP9_COMMON_VP9_LOOPFILTER_H_ #include "vpx_ports/mem.h" #include "./vpx_config.h" @@ -97,7 +97,7 @@ struct VP9LfSyncData; // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. void vp9_setup_mask(struct VP9Common *const cm, const int mi_row, - const int mi_col, MODE_INFO **mi_8x8, + const int mi_col, MODE_INFO **mi8x8, const int mode_info_stride, LOOP_FILTER_MASK *lfm); void vp9_filter_block_plane_ss00(struct VP9Common *const cm, @@ -120,7 +120,7 @@ void vp9_loop_filter_init(struct VP9Common *cm); void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, - struct macroblockd *mbd, int filter_level, + struct macroblockd *xd, int frame_filter_level, int y_only, int partial_frame); // Get the superblock lfm for a given mi_row, mi_col. @@ -157,4 +157,4 @@ int vp9_loop_filter_worker(void *arg1, void *unused); } // extern "C" #endif -#endif // VP9_COMMON_VP9_LOOPFILTER_H_ +#endif // VPX_VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/libvpx/vp9/common/vp9_mfqe.h b/libvpx/vp9/common/vp9_mfqe.h index dfff8c23d..f53e1c2f9 100644 --- a/libvpx/vp9/common/vp9_mfqe.h +++ b/libvpx/vp9/common/vp9_mfqe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_MFQE_H_ -#define VP9_COMMON_VP9_MFQE_H_ +#ifndef VPX_VP9_COMMON_VP9_MFQE_H_ +#define VPX_VP9_COMMON_VP9_MFQE_H_ #ifdef __cplusplus extern "C" { @@ -28,4 +28,4 @@ void vp9_mfqe(struct VP9Common *cm); } // extern "C" #endif -#endif // VP9_COMMON_VP9_MFQE_H_ +#endif // VPX_VP9_COMMON_VP9_MFQE_H_ diff --git a/libvpx/vp9/common/vp9_mv.h b/libvpx/vp9/common/vp9_mv.h index 4c8eac721..14dde7dd0 100644 --- a/libvpx/vp9/common/vp9_mv.h +++ b/libvpx/vp9/common/vp9_mv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_MV_H_ -#define VP9_COMMON_VP9_MV_H_ +#ifndef VPX_VP9_COMMON_VP9_MV_H_ +#define VPX_VP9_COMMON_VP9_MV_H_ #include "vpx/vpx_integer.h" @@ -52,4 +52,4 @@ static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row, } // extern "C" #endif -#endif // VP9_COMMON_VP9_MV_H_ +#endif // VPX_VP9_COMMON_VP9_MV_H_ diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h index 2b2c1ba9e..ebe5fdad1 100644 --- a/libvpx/vp9/common/vp9_mvref_common.h +++ b/libvpx/vp9/common/vp9_mvref_common.h @@ -7,8 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ -#define VP9_COMMON_VP9_MVREF_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ +#define VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_blockd.h" @@ -320,4 +320,4 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int block, } // extern "C" #endif -#endif // VP9_COMMON_VP9_MVREF_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index 1d96d92c2..662b8ef5e 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_ONYXC_INT_H_ -#define VP9_COMMON_VP9_ONYXC_INT_H_ +#ifndef VPX_VP9_COMMON_VP9_ONYXC_INT_H_ +#define VPX_VP9_COMMON_VP9_ONYXC_INT_H_ #include "./vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" @@ -37,10 +37,9 @@ extern "C" { #define REF_FRAMES_LOG2 3 #define REF_FRAMES (1 << REF_FRAMES_LOG2) -// 1 scratch frame for the new frame, 3 for scaled references on the encoder. -// TODO(jkoleszar): These 3 extra references could probably come from the -// normal reference pool. -#define FRAME_BUFFERS (REF_FRAMES + 4) +// 1 scratch frame for the new frame, REFS_PER_FRAME for scaled references on +// the encoder. +#define FRAME_BUFFERS (REF_FRAMES + 1 + REFS_PER_FRAME) #define FRAME_CONTEXTS_LOG2 2 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) @@ -70,6 +69,7 @@ typedef struct { int mi_rows; int mi_cols; uint8_t released; + int frame_index; vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; } RefCntBuffer; @@ -128,6 +128,8 @@ typedef struct VP9Common { int new_fb_idx; + int cur_show_frame_fb_idx; + #if CONFIG_VP9_POSTPROC YV12_BUFFER_CONFIG post_proc_buffer; YV12_BUFFER_CONFIG post_proc_buffer_int; @@ -256,8 +258,16 @@ typedef struct VP9Common { PARTITION_CONTEXT *above_seg_context; ENTROPY_CONTEXT *above_context; int above_context_alloc_cols; + + int lf_row; } VP9_COMMON; +static INLINE YV12_BUFFER_CONFIG *get_buf_frame(VP9_COMMON *cm, int index) { + if (index < 0 || index >= FRAME_BUFFERS) return NULL; + if (cm->error.error_code != VPX_CODEC_OK) return NULL; + return &cm->buffer_pool->frame_bufs[index].buf; +} + static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; if (cm->ref_frame_map[index] < 0) return NULL; @@ -405,4 +415,4 @@ static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row, } // extern "C" #endif -#endif // VP9_COMMON_VP9_ONYXC_INT_H_ +#endif // VPX_VP9_COMMON_VP9_ONYXC_INT_H_ diff --git 
a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c index dfc315eea..5373b0218 100644 --- a/libvpx/vp9/common/vp9_postproc.c +++ b/libvpx/vp9/common/vp9_postproc.c @@ -293,7 +293,7 @@ static void swap_mi_and_prev_mi(VP9_COMMON *cm) { } int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *ppflags) { + vp9_ppflags_t *ppflags, int unscaled_width) { const int q = VPXMIN(105, cm->lf.filter_level * 2); const int flags = ppflags->post_proc_flag; YV12_BUFFER_CONFIG *const ppbuf = &cm->post_proc_buffer; @@ -359,7 +359,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, if (flags & (VP9D_DEMACROBLOCK | VP9D_DEBLOCK)) { if (!cm->postproc_state.limits) { cm->postproc_state.limits = - vpx_calloc(cm->width, sizeof(*cm->postproc_state.limits)); + vpx_calloc(unscaled_width, sizeof(*cm->postproc_state.limits)); } } diff --git a/libvpx/vp9/common/vp9_postproc.h b/libvpx/vp9/common/vp9_postproc.h index 605909411..67efc1b4e 100644 --- a/libvpx/vp9/common/vp9_postproc.h +++ b/libvpx/vp9/common/vp9_postproc.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_POSTPROC_H_ -#define VP9_COMMON_VP9_POSTPROC_H_ +#ifndef VPX_VP9_COMMON_VP9_POSTPROC_H_ +#define VPX_VP9_COMMON_VP9_POSTPROC_H_ #include "vpx_ports/mem.h" #include "vpx_scale/yv12config.h" @@ -38,7 +38,7 @@ struct VP9Common; #define MFQE_PRECISION 4 int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags); + vp9_ppflags_t *ppflags, int unscaled_width); void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits); @@ -50,4 +50,4 @@ void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, } // extern "C" #endif -#endif // VP9_COMMON_VP9_POSTPROC_H_ +#endif // VPX_VP9_COMMON_VP9_POSTPROC_H_ diff --git a/libvpx/vp9/common/vp9_ppflags.h b/libvpx/vp9/common/vp9_ppflags.h index b8b647bf1..a0e301762 100644 --- a/libvpx/vp9/common/vp9_ppflags.h +++ b/libvpx/vp9/common/vp9_ppflags.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_PPFLAGS_H_ -#define VP9_COMMON_VP9_PPFLAGS_H_ +#ifndef VPX_VP9_COMMON_VP9_PPFLAGS_H_ +#define VPX_VP9_COMMON_VP9_PPFLAGS_H_ #ifdef __cplusplus extern "C" { @@ -33,4 +33,4 @@ typedef struct { } // extern "C" #endif -#endif // VP9_COMMON_VP9_PPFLAGS_H_ +#endif // VPX_VP9_COMMON_VP9_PPFLAGS_H_ diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c index a7ddc0b95..375cb4d76 100644 --- a/libvpx/vp9/common/vp9_pred_common.c +++ b/libvpx/vp9/common/vp9_pred_common.c @@ -13,6 +13,32 @@ #include "vp9/common/vp9_pred_common.h" #include "vp9/common/vp9_seg_common.h" +int vp9_compound_reference_allowed(const VP9_COMMON *cm) { + int i; + for (i = 1; i < REFS_PER_FRAME; ++i) + if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; + + return 0; +} + +void vp9_setup_compound_reference_mode(VP9_COMMON *cm) { + if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[GOLDEN_FRAME]) { + cm->comp_fixed_ref = ALTREF_FRAME; + 
cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = GOLDEN_FRAME; + } else if (cm->ref_frame_sign_bias[LAST_FRAME] == + cm->ref_frame_sign_bias[ALTREF_FRAME]) { + cm->comp_fixed_ref = GOLDEN_FRAME; + cm->comp_var_ref[0] = LAST_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } else { + cm->comp_fixed_ref = LAST_FRAME; + cm->comp_var_ref[0] = GOLDEN_FRAME; + cm->comp_var_ref[1] = ALTREF_FRAME; + } +} + int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd) { int ctx; @@ -229,9 +255,8 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) { else pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME); } else { - pred_context = 1 + - 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || - edge_mi->ref_frame[1] == GOLDEN_FRAME); + pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME || + edge_mi->ref_frame[1] == GOLDEN_FRAME); } } else { // inter/inter const int above_has_second = has_second_ref(above_mi); diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h index 8400bd70f..ee5966935 100644 --- a/libvpx/vp9/common/vp9_pred_common.h +++ b/libvpx/vp9/common/vp9_pred_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_PRED_COMMON_H_ -#define VP9_COMMON_VP9_PRED_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_PRED_COMMON_H_ +#define VPX_VP9_COMMON_VP9_PRED_COMMON_H_ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_onyxc_int.h" @@ -145,6 +145,10 @@ static INLINE vpx_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm, return cm->fc->single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1]; } +int vp9_compound_reference_allowed(const VP9_COMMON *cm); + +void vp9_setup_compound_reference_mode(VP9_COMMON *cm); + // Returns a context number for the given MB prediction signal // The mode info data structure has a one element border above and to the // left of the entries corresponding to real blocks. 
@@ -176,12 +180,6 @@ static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx, } } -static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size, - const MACROBLOCKD *xd, - const struct tx_probs *tx_probs) { - return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs); -} - static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, struct tx_counts *tx_counts) { switch (max_tx_size) { @@ -196,4 +194,4 @@ static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx, } // extern "C" #endif -#endif // VP9_COMMON_VP9_PRED_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_PRED_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_quant_common.h b/libvpx/vp9/common/vp9_quant_common.h index 4bae4a896..ec8b9f4c6 100644 --- a/libvpx/vp9/common/vp9_quant_common.h +++ b/libvpx/vp9/common/vp9_quant_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_QUANT_COMMON_H_ -#define VP9_COMMON_VP9_QUANT_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ +#define VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ #include "vpx/vpx_codec.h" #include "vp9/common/vp9_seg_common.h" @@ -33,4 +33,4 @@ int vp9_get_qindex(const struct segmentation *seg, int segment_id, } // extern "C" #endif -#endif // VP9_COMMON_VP9_QUANT_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_QUANT_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h index bb9291a26..992e30c34 100644 --- a/libvpx/vp9/common/vp9_reconinter.h +++ b/libvpx/vp9/common/vp9_reconinter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_RECONINTER_H_ -#define VP9_COMMON_VP9_RECONINTER_H_ +#ifndef VPX_VP9_COMMON_VP9_RECONINTER_H_ +#define VPX_VP9_COMMON_VP9_RECONINTER_H_ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_onyxc_int.h" @@ -61,15 +61,15 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const MV *mv_q3, + int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, - int do_avg, const InterpKernel *kernel, + int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y); #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, - const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg, + const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd); #endif @@ -103,4 +103,4 @@ void vp9_setup_pre_planes(MACROBLOCKD *xd, int idx, } // extern "C" #endif -#endif // VP9_COMMON_VP9_RECONINTER_H_ +#endif // VPX_VP9_COMMON_VP9_RECONINTER_H_ diff --git a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h index 78e41c881..426a35ebf 100644 --- a/libvpx/vp9/common/vp9_reconintra.h +++ b/libvpx/vp9/common/vp9_reconintra.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_RECONINTRA_H_ -#define VP9_COMMON_VP9_RECONINTRA_H_ +#ifndef VPX_VP9_COMMON_VP9_RECONINTRA_H_ +#define VPX_VP9_COMMON_VP9_RECONINTRA_H_ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" @@ -28,4 +28,4 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, TX_SIZE tx_size, } // extern "C" #endif -#endif // VP9_COMMON_VP9_RECONINTRA_H_ +#endif // VPX_VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl index 22b67ecac..8bb68cfdf 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -62,18 +62,18 @@ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, i add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; -add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; +add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { # Note that there are more specializations appended when # CONFIG_VP9_HIGHBITDEPTH is off. - specialize qw/vp9_iht4x4_16_add sse2/; - specialize qw/vp9_iht8x8_64_add sse2/; - specialize qw/vp9_iht16x16_256_add sse2/; + specialize qw/vp9_iht4x4_16_add neon sse2 vsx/; + specialize qw/vp9_iht8x8_64_add neon sse2 vsx/; + specialize qw/vp9_iht16x16_256_add neon sse2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. 
- specialize qw/vp9_iht4x4_16_add neon dspr2 msa/; - specialize qw/vp9_iht8x8_64_add neon dspr2 msa/; + specialize qw/vp9_iht4x4_16_add dspr2 msa/; + specialize qw/vp9_iht8x8_64_add dspr2 msa/; specialize qw/vp9_iht16x16_256_add dspr2 msa/; } } @@ -100,7 +100,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; + + if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { + specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/; + specialize qw/vp9_highbd_iht8x8_64_add neon sse4_1/; + specialize qw/vp9_highbd_iht16x16_256_add neon sse4_1/; + } } # @@ -123,10 +129,10 @@ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_ add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64"; add_proto 
qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; @@ -135,7 +141,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_block_error_fp avx2 sse2/; - specialize qw/vp9_fdct8x8_quant neon ssse3/; + specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/; add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; specialize qw/vp9_highbd_block_error sse2/; @@ -199,7 +205,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; + add_proto qw/void vp9_highbd_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count"; } # End vp9_high encoder functions diff --git a/libvpx/vp9/common/vp9_scale.h b/libvpx/vp9/common/vp9_scale.h index ada8dbaad..aaafdf867 100644 --- a/libvpx/vp9/common/vp9_scale.h +++ b/libvpx/vp9/common/vp9_scale.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_SCALE_H_ -#define VP9_COMMON_VP9_SCALE_H_ +#ifndef VPX_VP9_COMMON_VP9_SCALE_H_ +#define VPX_VP9_COMMON_VP9_SCALE_H_ #include "vp9/common/vp9_mv.h" #include "vpx_dsp/vpx_convolve.h" @@ -42,7 +42,7 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); #if CONFIG_VP9_HIGHBITDEPTH void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h, - int use_high); + int use_highbd); #else void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); @@ -68,4 +68,4 @@ static INLINE int valid_ref_frame_size(int ref_width, int ref_height, } // extern "C" #endif -#endif // VP9_COMMON_VP9_SCALE_H_ +#endif // VPX_VP9_COMMON_VP9_SCALE_H_ diff --git a/libvpx/vp9/common/vp9_scan.h b/libvpx/vp9/common/vp9_scan.h index b3520e7dc..72a9a5ec4 100644 --- a/libvpx/vp9/common/vp9_scan.h +++ b/libvpx/vp9/common/vp9_scan.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_SCAN_H_ -#define VP9_COMMON_VP9_SCAN_H_ +#ifndef VPX_VP9_COMMON_VP9_SCAN_H_ +#define VPX_VP9_COMMON_VP9_SCAN_H_ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -55,4 +55,4 @@ static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, } // extern "C" #endif -#endif // VP9_COMMON_VP9_SCAN_H_ +#endif // VPX_VP9_COMMON_VP9_SCAN_H_ diff --git a/libvpx/vp9/common/vp9_seg_common.h b/libvpx/vp9/common/vp9_seg_common.h index b9bf75d58..b63e4f499 100644 --- a/libvpx/vp9/common/vp9_seg_common.h +++ b/libvpx/vp9/common/vp9_seg_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_SEG_COMMON_H_ -#define VP9_COMMON_VP9_SEG_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_SEG_COMMON_H_ +#define VPX_VP9_COMMON_VP9_SEG_COMMON_H_ #include "vpx_dsp/prob.h" @@ -78,4 +78,4 @@ extern const vpx_tree_index vp9_segment_tree[TREE_SIZE(MAX_SEGMENTS)]; } // extern "C" #endif -#endif // VP9_COMMON_VP9_SEG_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_SEG_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_thread_common.c b/libvpx/vp9/common/vp9_thread_common.c index 8d44e91f2..b008ed5cf 100644 --- a/libvpx/vp9/common/vp9_thread_common.c +++ b/libvpx/vp9/common/vp9_thread_common.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <limits.h> #include "./vpx_config.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" @@ -38,11 +39,11 @@ static INLINE void sync_read(VP9LfSync *const lf_sync, int r, int c) { const int nsync = lf_sync->sync_range; if (r && !(c & (nsync - 1))) { - pthread_mutex_t *const mutex = &lf_sync->mutex_[r - 1]; + pthread_mutex_t *const mutex = &lf_sync->mutex[r - 1]; mutex_lock(mutex); while (c > lf_sync->cur_sb_col[r - 1] - nsync) { - pthread_cond_wait(&lf_sync->cond_[r - 1], mutex); + pthread_cond_wait(&lf_sync->cond[r - 1], mutex); } pthread_mutex_unlock(mutex); } @@ -69,12 +70,12 @@ static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, } if (sig) { - mutex_lock(&lf_sync->mutex_[r]); + mutex_lock(&lf_sync->mutex[r]); lf_sync->cur_sb_col[r] = cur; - pthread_cond_signal(&lf_sync->cond_[r]); - pthread_mutex_unlock(&lf_sync->mutex_[r]); + pthread_cond_signal(&lf_sync->cond[r]); + pthread_mutex_unlock(&lf_sync->mutex[r]); } #else (void)lf_sync; @@ -91,6 +92,7 @@ static INLINE void thread_loop_filter_rows( int y_only, VP9LfSync *const lf_sync) { const int num_planes = y_only ? 
1 : MAX_MB_PLANE; const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + const int num_active_workers = VPXMIN(lf_sync->num_workers, lf_sync->rows); int mi_row, mi_col; enum lf_path path; if (y_only) @@ -103,7 +105,7 @@ static INLINE void thread_loop_filter_rows( path = LF_PATH_SLOW; for (mi_row = start; mi_row < stop; - mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) { + mi_row += num_active_workers * MI_BLOCK_SIZE) { MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride; LOOP_FILTER_MASK *lfm = get_lfm(&cm->lf, mi_row, 0); @@ -157,10 +159,12 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); // Number of superblock rows and cols const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; - // Decoder may allocate more threads than number of tiles based on user's - // input. - const int tile_cols = 1 << cm->log2_tile_cols; - const int num_workers = VPXMIN(nworkers, tile_cols); + const int num_tile_cols = 1 << cm->log2_tile_cols; + // Limit the number of workers to prevent changes in frame dimensions from + // causing incorrect sync calculations when sb_rows < threads/tile_cols. + // Further restrict them by the number of tile columns should the user + // request more as this implementation doesn't scale well beyond that. 
+ const int num_workers = VPXMIN(nworkers, VPXMIN(num_tile_cols, sb_rows)); int i; if (!lf_sync->sync_range || sb_rows != lf_sync->rows || @@ -231,6 +235,28 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, workers, num_workers, lf_sync); } +void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level, + int num_workers) { + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + + if (!frame_filter_level) return; + + if (!lf_sync->sync_range || sb_rows != lf_sync->rows || + num_workers > lf_sync->num_workers) { + vp9_loop_filter_dealloc(lf_sync); + vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); + } + + // Initialize cur_sb_col to -1 for all SB rows. + memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); + + lf_sync->corrupted = 0; + + memset(lf_sync->num_tiles_done, 0, + sizeof(*lf_sync->num_tiles_done) * sb_rows); + cm->lf_row = 0; +} + // Set up nsync by width. static INLINE int get_sync_range(int width) { // nsync numbers are picked by testing. 
For example, for 4k @@ -253,19 +279,38 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, { int i; - CHECK_MEM_ERROR(cm, lf_sync->mutex_, - vpx_malloc(sizeof(*lf_sync->mutex_) * rows)); - if (lf_sync->mutex_) { + CHECK_MEM_ERROR(cm, lf_sync->mutex, + vpx_malloc(sizeof(*lf_sync->mutex) * rows)); + if (lf_sync->mutex) { for (i = 0; i < rows; ++i) { - pthread_mutex_init(&lf_sync->mutex_[i], NULL); + pthread_mutex_init(&lf_sync->mutex[i], NULL); } } - CHECK_MEM_ERROR(cm, lf_sync->cond_, - vpx_malloc(sizeof(*lf_sync->cond_) * rows)); - if (lf_sync->cond_) { + CHECK_MEM_ERROR(cm, lf_sync->cond, + vpx_malloc(sizeof(*lf_sync->cond) * rows)); + if (lf_sync->cond) { for (i = 0; i < rows; ++i) { - pthread_cond_init(&lf_sync->cond_[i], NULL); + pthread_cond_init(&lf_sync->cond[i], NULL); + } + } + pthread_mutex_init(&lf_sync->lf_mutex, NULL); + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, + vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); + if (lf_sync->recon_done_mutex) { + int i; + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, + vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); + if (lf_sync->recon_done_cond) { + int i; + for (i = 0; i < rows; ++i) { + pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); } } } @@ -278,6 +323,11 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); + CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done, + vpx_malloc(sizeof(*lf_sync->num_tiles_done) * + mi_cols_aligned_to_sb(cm->mi_rows) >> + MI_BLOCK_SIZE_LOG2)); + // Set up nsync. 
lf_sync->sync_range = get_sync_range(width); } @@ -288,27 +338,143 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { #if CONFIG_MULTITHREAD int i; - if (lf_sync->mutex_ != NULL) { + if (lf_sync->mutex != NULL) { + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->mutex[i]); + } + vpx_free(lf_sync->mutex); + } + if (lf_sync->cond != NULL) { for (i = 0; i < lf_sync->rows; ++i) { - pthread_mutex_destroy(&lf_sync->mutex_[i]); + pthread_cond_destroy(&lf_sync->cond[i]); } - vpx_free(lf_sync->mutex_); + vpx_free(lf_sync->cond); } - if (lf_sync->cond_ != NULL) { + if (lf_sync->recon_done_mutex != NULL) { + int i; + for (i = 0; i < lf_sync->rows; ++i) { + pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]); + } + vpx_free(lf_sync->recon_done_mutex); + } + + pthread_mutex_destroy(&lf_sync->lf_mutex); + if (lf_sync->recon_done_cond != NULL) { + int i; for (i = 0; i < lf_sync->rows; ++i) { - pthread_cond_destroy(&lf_sync->cond_[i]); + pthread_cond_destroy(&lf_sync->recon_done_cond[i]); } - vpx_free(lf_sync->cond_); + vpx_free(lf_sync->recon_done_cond); } #endif // CONFIG_MULTITHREAD + vpx_free(lf_sync->lfdata); vpx_free(lf_sync->cur_sb_col); + vpx_free(lf_sync->num_tiles_done); // clear the structure as the source of this call may be a resize in which // case this call will be followed by an _alloc() which may fail. vp9_zero(*lf_sync); } } +static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { + int return_val = -1; + int cur_row; + const int max_rows = cm->mi_rows; + +#if CONFIG_MULTITHREAD + const int tile_cols = 1 << cm->log2_tile_cols; + + pthread_mutex_lock(&lf_sync->lf_mutex); + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. 
+ * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } + pthread_mutex_unlock(&lf_sync->lf_mutex); + + if (return_val == -1) return return_val; + + pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]); + if (lf_sync->num_tiles_done[cur_row] < tile_cols) { + pthread_cond_wait(&lf_sync->recon_done_cond[cur_row], + &lf_sync->recon_done_mutex[cur_row]); + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]); + pthread_mutex_lock(&lf_sync->lf_mutex); + if (lf_sync->corrupted) { + int row = return_val >> MI_BLOCK_SIZE_LOG2; + pthread_mutex_lock(&lf_sync->mutex[row]); + lf_sync->cur_sb_col[row] = INT_MAX; + pthread_cond_signal(&lf_sync->cond[row]); + pthread_mutex_unlock(&lf_sync->mutex[row]); + return_val = -1; + } + pthread_mutex_unlock(&lf_sync->lf_mutex); +#else + (void)lf_sync; + if (cm->lf_row < max_rows) { + cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; + return_val = cm->lf_row; + cm->lf_row += MI_BLOCK_SIZE; + if (cm->lf_row < max_rows) { + /* If this is not the last row, make sure the next row is also decoded. 
+ * This is because the intra predict has to happen before loop filter */ + cur_row += 1; + } + } +#endif // CONFIG_MULTITHREAD + + return return_val; +} + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync) { + int mi_row; + VP9_COMMON *cm = lf_data->cm; + + while ((mi_row = get_next_row(cm, lf_sync)) != -1 && mi_row < cm->mi_rows) { + lf_data->start = mi_row; + lf_data->stop = mi_row + MI_BLOCK_SIZE; + + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); + } +} + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted) { +#if CONFIG_MULTITHREAD + pthread_mutex_lock(&lf_sync->lf_mutex); + lf_sync->corrupted |= corrupted; + pthread_mutex_unlock(&lf_sync->lf_mutex); + pthread_mutex_lock(&lf_sync->recon_done_mutex[row]); + lf_sync->num_tiles_done[row] += 1; + if (num_tiles == lf_sync->num_tiles_done[row]) { + if (is_last_row) { + /* The last 2 rows wait on the last row to be done. + * So, we have to broadcast the signal in this case. + */ + pthread_cond_broadcast(&lf_sync->recon_done_cond[row]); + } else { + pthread_cond_signal(&lf_sync->recon_done_cond[row]); + } + } + pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]); +#else + (void)lf_sync; + (void)num_tiles; + (void)row; + (void)is_last_row; + (void)corrupted; +#endif // CONFIG_MULTITHREAD +} + // Accumulate frame counts. void vp9_accumulate_frame_counts(FRAME_COUNTS *accum, const FRAME_COUNTS *counts, int is_dec) { diff --git a/libvpx/vp9/common/vp9_thread_common.h b/libvpx/vp9/common/vp9_thread_common.h index 0f7c3ff74..b97e9ee13 100644 --- a/libvpx/vp9/common/vp9_thread_common.h +++ b/libvpx/vp9/common/vp9_thread_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_COMMON_VP9_THREAD_COMMON_H_ -#define VP9_COMMON_VP9_THREAD_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ +#define VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ #include "./vpx_config.h" #include "vp9/common/vp9_loopfilter.h" #include "vpx_util/vpx_thread.h" @@ -24,8 +24,8 @@ struct FRAME_COUNTS; // Loopfilter row synchronization typedef struct VP9LfSyncData { #if CONFIG_MULTITHREAD - pthread_mutex_t *mutex_; - pthread_cond_t *cond_; + pthread_mutex_t *mutex; + pthread_cond_t *cond; #endif // Allocate memory to store the loop-filtered superblock index in each row. int *cur_sb_col; @@ -37,6 +37,14 @@ typedef struct VP9LfSyncData { // Row-based parallel loopfilter data LFWorkerData *lfdata; int num_workers; + +#if CONFIG_MULTITHREAD + pthread_mutex_t lf_mutex; + pthread_mutex_t *recon_done_mutex; + pthread_cond_t *recon_done_cond; +#endif + int *num_tiles_done; + int corrupted; } VP9LfSync; // Allocate memory for loopfilter row synchronization. @@ -53,6 +61,17 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, int partial_frame, VPxWorker *workers, int num_workers, VP9LfSync *lf_sync); +// Multi-threaded loopfilter initialisations +void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm, + int frame_filter_level, int num_workers); + +void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync); + +void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row, + int corrupted); + +void vp9_set_last_decoded_row(struct VP9Common *cm, int tile_col, int mi_row); + void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, const struct FRAME_COUNTS *counts, int is_dec); @@ -60,4 +79,4 @@ void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum, } // extern "C" #endif -#endif // VP9_COMMON_VP9_THREAD_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_THREAD_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_tile_common.h b/libvpx/vp9/common/vp9_tile_common.h index 1b11c2680..4ccf0a3d5 100644 --- 
a/libvpx/vp9/common/vp9_tile_common.h +++ b/libvpx/vp9/common/vp9_tile_common.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_COMMON_VP9_TILE_COMMON_H_ -#define VP9_COMMON_VP9_TILE_COMMON_H_ +#ifndef VPX_VP9_COMMON_VP9_TILE_COMMON_H_ +#define VPX_VP9_COMMON_VP9_TILE_COMMON_H_ #ifdef __cplusplus extern "C" { @@ -37,4 +37,4 @@ void vp9_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols, } // extern "C" #endif -#endif // VP9_COMMON_VP9_TILE_COMMON_H_ +#endif // VPX_VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c new file mode 100644 index 000000000..57b79a732 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c @@ -0,0 +1,419 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = _mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2], + s10[2], s11[2], s12[2], s13[2], s14[2], s15[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2], + x10[2], x11[2], x12[2], x13[2], x14[2], x15[2]; + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, 
s4, s5); + highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7); + highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9); + highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14, + s15); + + x0[0] = _mm_add_epi64(s0[0], s8[0]); + x0[1] = _mm_add_epi64(s0[1], s8[1]); + x1[0] = _mm_add_epi64(s1[0], s9[0]); + x1[1] = _mm_add_epi64(s1[1], s9[1]); + x2[0] = _mm_add_epi64(s2[0], s10[0]); + x2[1] = _mm_add_epi64(s2[1], s10[1]); + x3[0] = _mm_add_epi64(s3[0], s11[0]); + x3[1] = _mm_add_epi64(s3[1], s11[1]); + x4[0] = _mm_add_epi64(s4[0], s12[0]); + x4[1] = _mm_add_epi64(s4[1], s12[1]); + x5[0] = _mm_add_epi64(s5[0], s13[0]); + x5[1] = _mm_add_epi64(s5[1], s13[1]); + x6[0] = _mm_add_epi64(s6[0], s14[0]); + x6[1] = _mm_add_epi64(s6[1], s14[1]); + x7[0] = _mm_add_epi64(s7[0], s15[0]); + x7[1] = _mm_add_epi64(s7[1], s15[1]); + x8[0] = _mm_sub_epi64(s0[0], s8[0]); + x8[1] = _mm_sub_epi64(s0[1], s8[1]); + x9[0] = _mm_sub_epi64(s1[0], s9[0]); + x9[1] = _mm_sub_epi64(s1[1], s9[1]); + x10[0] = _mm_sub_epi64(s2[0], s10[0]); + x10[1] = _mm_sub_epi64(s2[1], s10[1]); + x11[0] = _mm_sub_epi64(s3[0], s11[0]); + x11[1] = _mm_sub_epi64(s3[1], s11[1]); + x12[0] = _mm_sub_epi64(s4[0], s12[0]); + x12[1] = _mm_sub_epi64(s4[1], s12[1]); + x13[0] = _mm_sub_epi64(s5[0], s13[0]); + x13[1] = _mm_sub_epi64(s5[1], s13[1]); + x14[0] = _mm_sub_epi64(s6[0], s14[0]); + x14[1] = _mm_sub_epi64(s6[1], s14[1]); + x15[0] = _mm_sub_epi64(s7[0], s15[0]); + x15[1] = _mm_sub_epi64(s7[1], s15[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] 
= dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x0[0] = pack_4(x0[0], x0[1]); + x1[0] = pack_4(x1[0], x1[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 2 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + s4[0] = x4[0]; + s5[0] = x5[0]; + s6[0] = x6[0]; + s7[0] = x7[0]; + x0[0] = _mm_add_epi32(s0[0], s4[0]); + x1[0] = _mm_add_epi32(s1[0], s5[0]); + x2[0] = 
_mm_add_epi32(s2[0], s6[0]); + x3[0] = _mm_add_epi32(s3[0], s7[0]); + x4[0] = _mm_sub_epi32(s0[0], s4[0]); + x5[0] = _mm_sub_epi32(s1[0], s5[0]); + x6[0] = _mm_sub_epi32(s2[0], s6[0]); + x7[0] = _mm_sub_epi32(s3[0], s7[0]); + + highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9); + highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10, + s11); + highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13, + s12); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15, + s14); + + x8[0] = _mm_add_epi64(s8[0], s12[0]); + x8[1] = _mm_add_epi64(s8[1], s12[1]); + x9[0] = _mm_add_epi64(s9[0], s13[0]); + x9[1] = _mm_add_epi64(s9[1], s13[1]); + x10[0] = _mm_add_epi64(s10[0], s14[0]); + x10[1] = _mm_add_epi64(s10[1], s14[1]); + x11[0] = _mm_add_epi64(s11[0], s15[0]); + x11[1] = _mm_add_epi64(s11[1], s15[1]); + x12[0] = _mm_sub_epi64(s8[0], s12[0]); + x12[1] = _mm_sub_epi64(s8[1], s12[1]); + x13[0] = _mm_sub_epi64(s9[0], s13[0]); + x13[1] = _mm_sub_epi64(s9[1], s13[1]); + x14[0] = _mm_sub_epi64(s10[0], s14[0]); + x14[1] = _mm_sub_epi64(s10[1], s14[1]); + x15[0] = _mm_sub_epi64(s11[0], s15[0]); + x15[1] = _mm_sub_epi64(s11[1], s15[1]); + x8[0] = dct_const_round_shift_64bit(x8[0]); + x8[1] = dct_const_round_shift_64bit(x8[1]); + x9[0] = dct_const_round_shift_64bit(x9[0]); + x9[1] = dct_const_round_shift_64bit(x9[1]); + x10[0] = dct_const_round_shift_64bit(x10[0]); + x10[1] = dct_const_round_shift_64bit(x10[1]); + x11[0] = dct_const_round_shift_64bit(x11[0]); + x11[1] = dct_const_round_shift_64bit(x11[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x8[0] = 
pack_4(x8[0], x8[1]); + x9[0] = pack_4(x9[0], x9[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 3 + s0[0] = x0[0]; + s1[0] = x1[0]; + s2[0] = x2[0]; + s3[0] = x3[0]; + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + s8[0] = x8[0]; + s9[0] = x9[0]; + s10[0] = x10[0]; + s11[0] = x11[0]; + highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12, + s13); + highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15, + s14); + + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x8[0] = _mm_add_epi32(s8[0], s10[0]); + x9[0] = _mm_add_epi32(s9[0], s11[0]); + x10[0] = _mm_sub_epi32(s8[0], s10[0]); + x11[0] = _mm_sub_epi32(s9[0], s11[0]); + x12[0] = _mm_add_epi64(s12[0], s14[0]); + x12[1] = _mm_add_epi64(s12[1], s14[1]); + x13[0] = _mm_add_epi64(s13[0], 
s15[0]); + x13[1] = _mm_add_epi64(s13[1], s15[1]); + x14[0] = _mm_sub_epi64(s12[0], s14[0]); + x14[1] = _mm_sub_epi64(s12[1], s14[1]); + x15[0] = _mm_sub_epi64(s13[0], s15[0]); + x15[1] = _mm_sub_epi64(s13[1], s15[1]); + x12[0] = dct_const_round_shift_64bit(x12[0]); + x12[1] = dct_const_round_shift_64bit(x12[1]); + x13[0] = dct_const_round_shift_64bit(x13[0]); + x13[1] = dct_const_round_shift_64bit(x13[1]); + x14[0] = dct_const_round_shift_64bit(x14[0]); + x14[1] = dct_const_round_shift_64bit(x14[1]); + x15[0] = dct_const_round_shift_64bit(x15[0]); + x15[1] = dct_const_round_shift_64bit(x15[1]); + x12[0] = pack_4(x12[0], x12[1]); + x13[0] = pack_4(x13[0], x13[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + // stage 4 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x7[0], x6[0]); + s7[0] = _mm_sub_epi32(x7[0], x6[0]); + s10[0] = _mm_add_epi32(x11[0], x10[0]); + s11[0] = _mm_sub_epi32(x11[0], x10[0]); + s14[0] = _mm_add_epi32(x14[0], x15[0]); + s15[0] = _mm_sub_epi32(x14[0], x15[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10); + highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11); + highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14); + highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15); + + x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x10[0] = 
dct_const_round_shift_64bit(s10[0]); + x10[1] = dct_const_round_shift_64bit(s10[1]); + x11[0] = dct_const_round_shift_64bit(s11[0]); + x11[1] = dct_const_round_shift_64bit(s11[1]); + x14[0] = dct_const_round_shift_64bit(s14[0]); + x14[1] = dct_const_round_shift_64bit(s14[1]); + x15[0] = dct_const_round_shift_64bit(s15[0]); + x15[1] = dct_const_round_shift_64bit(s15[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + x10[0] = pack_4(x10[0], x10[1]); + x11[0] = pack_4(x11[0], x11[1]); + x14[0] = pack_4(x14[0], x14[1]); + x15[0] = pack_4(x15[0], x15[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]); + io[2] = x12[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[4] = x6[0]; + io[5] = x14[0]; + io[6] = x10[0]; + io[7] = x2[0]; + io[8] = x3[0]; + io[9] = x11[0]; + io[10] = x15[0]; + io[11] = x7[0]; + io[12] = x5[0]; + io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]); + io[14] = x9[0]; + io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct16_8col(in, in); + } else { + vpx_iadst16_8col_sse2(in); + } + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct16_8col(out, out); + } else { + vpx_iadst16_8col_sse2(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in 
= all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct16_4col_sse4_1(in); + } else { + highbd_iadst16_4col_sse4_1(in); + } + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct16_4col_sse4_1(out); + } else { + highbd_iadst16_4col_sse4_1(out); + } + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c b/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c new file mode 100644 index 000000000..af158536f --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_highbd_iht4x4_add_sse4.c @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst4_sse4_1(__m128i *const io) { + const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0); + const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0); + const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0); + const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0); + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2]; + __m128i temp[2]; + + transpose_32bit_4x4(io, io); + + extend_64bit(io[0], temp); + s0[0] = _mm_mul_epi32(pair_c1, temp[0]); + s0[1] = _mm_mul_epi32(pair_c1, temp[1]); + s1[0] = _mm_mul_epi32(pair_c2, temp[0]); + s1[1] = _mm_mul_epi32(pair_c2, temp[1]); + + extend_64bit(io[1], temp); + s2[0] = _mm_mul_epi32(pair_c3, temp[0]); + s2[1] = _mm_mul_epi32(pair_c3, temp[1]); + + extend_64bit(io[2], temp); + s3[0] = _mm_mul_epi32(pair_c4, temp[0]); + s3[1] = _mm_mul_epi32(pair_c4, temp[1]); + s4[0] = _mm_mul_epi32(pair_c1, temp[0]); + s4[1] = _mm_mul_epi32(pair_c1, temp[1]); + + extend_64bit(io[3], temp); + s5[0] = _mm_mul_epi32(pair_c2, temp[0]); + s5[1] = _mm_mul_epi32(pair_c2, temp[1]); + s6[0] = _mm_mul_epi32(pair_c4, temp[0]); + s6[1] = _mm_mul_epi32(pair_c4, temp[1]); + + t0[0] = _mm_add_epi64(s0[0], s3[0]); + t0[1] = _mm_add_epi64(s0[1], s3[1]); + t0[0] = _mm_add_epi64(t0[0], s5[0]); + t0[1] = _mm_add_epi64(t0[1], s5[1]); + t1[0] = _mm_sub_epi64(s1[0], s4[0]); + t1[1] = _mm_sub_epi64(s1[1], s4[1]); + t1[0] = _mm_sub_epi64(t1[0], s6[0]); + t1[1] = _mm_sub_epi64(t1[1], s6[1]); + temp[0] = _mm_sub_epi32(io[0], io[2]); + temp[0] = _mm_add_epi32(temp[0], io[3]); + extend_64bit(temp[0], temp); + t2[0] = _mm_mul_epi32(pair_c3, temp[0]); + t2[1] = _mm_mul_epi32(pair_c3, temp[1]); + + s0[0] = _mm_add_epi64(t0[0], s2[0]); + s0[1] = _mm_add_epi64(t0[1], s2[1]); + 
s1[0] = _mm_add_epi64(t1[0], s2[0]); + s1[1] = _mm_add_epi64(t1[1], s2[1]); + s3[0] = _mm_add_epi64(t0[0], t1[0]); + s3[1] = _mm_add_epi64(t0[1], t1[1]); + s3[0] = _mm_sub_epi64(s3[0], s2[0]); + s3[1] = _mm_sub_epi64(s3[1], s2[1]); + + s0[0] = dct_const_round_shift_64bit(s0[0]); + s0[1] = dct_const_round_shift_64bit(s0[1]); + s1[0] = dct_const_round_shift_64bit(s1[0]); + s1[1] = dct_const_round_shift_64bit(s1[1]); + s2[0] = dct_const_round_shift_64bit(t2[0]); + s2[1] = dct_const_round_shift_64bit(t2[1]); + s3[0] = dct_const_round_shift_64bit(s3[0]); + s3[1] = dct_const_round_shift_64bit(s3[1]); + io[0] = pack_4(s0[0], s0[1]); + io[1] = pack_4(s1[0], s1[1]); + io[2] = pack_4(s2[0], s2[1]); + io[3] = pack_4(s3[0], s3[1]); +} + +void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[4]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + if (bd == 8) { + __m128i io_short[2]; + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + idct4_sse2(io_short); + } else { + iadst4_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + idct4_sse2(io_short); + } else { + iadst4_sse2(io_short); + } + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + highbd_idct4_sse4_1(io); + } else { + highbd_iadst4_sse4_1(io); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + highbd_idct4_sse4_1(io); + } else { + highbd_iadst4_sse4_1(io); + } + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] 
= wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} diff --git a/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c new file mode 100644 index 000000000..7d949b6db --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_idct.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in, + const int c, + __m128i *const s) { + const __m128i pair_c = pair_set_epi32(4 * c, 0); + __m128i x[2]; + + extend_64bit(in, x); + s[0] = _mm_mul_epi32(pair_c, x[0]); + s[1] = _mm_mul_epi32(pair_c, x[1]); +} + +static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0, + const __m128i in1, + const int c0, const int c1, + __m128i *const s0, + __m128i *const s1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i t00[2], t01[2], t10[2], t11[2]; + __m128i x0[2], x1[2]; + + extend_64bit(in0, x0); + extend_64bit(in1, x1); + t00[0] = _mm_mul_epi32(pair_c0, x0[0]); + t00[1] = _mm_mul_epi32(pair_c0, x0[1]); + t01[0] = _mm_mul_epi32(pair_c0, x1[0]); + t01[1] = _mm_mul_epi32(pair_c0, x1[1]); + t10[0] = _mm_mul_epi32(pair_c1, x0[0]); + t10[1] = _mm_mul_epi32(pair_c1, x0[1]); + t11[0] = _mm_mul_epi32(pair_c1, x1[0]); + t11[1] = 
_mm_mul_epi32(pair_c1, x1[1]); + + s0[0] = _mm_add_epi64(t00[0], t11[0]); + s0[1] = _mm_add_epi64(t00[1], t11[1]); + s1[0] = _mm_sub_epi64(t10[0], t01[0]); + s1[1] = _mm_sub_epi64(t10[1], t01[1]); +} + +static void highbd_iadst8_sse4_1(__m128i *const io) { + __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; + __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1); + highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5); + x0[0] = _mm_add_epi64(s0[0], s4[0]); + x0[1] = _mm_add_epi64(s0[1], s4[1]); + x1[0] = _mm_add_epi64(s1[0], s5[0]); + x1[1] = _mm_add_epi64(s1[1], s5[1]); + x4[0] = _mm_sub_epi64(s0[0], s4[0]); + x4[1] = _mm_sub_epi64(s0[1], s4[1]); + x5[0] = _mm_sub_epi64(s1[0], s5[0]); + x5[1] = _mm_sub_epi64(s1[1], s5[1]); + + highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3); + highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7); + x2[0] = _mm_add_epi64(s2[0], s6[0]); + x2[1] = _mm_add_epi64(s2[1], s6[1]); + x3[0] = _mm_add_epi64(s3[0], s7[0]); + x3[1] = _mm_add_epi64(s3[1], s7[1]); + x6[0] = _mm_sub_epi64(s2[0], s6[0]); + x6[1] = _mm_sub_epi64(s2[1], s6[1]); + x7[0] = _mm_sub_epi64(s3[0], s7[0]); + x7[1] = _mm_sub_epi64(s3[1], s7[1]); + + x0[0] = dct_const_round_shift_64bit(x0[0]); + x0[1] = dct_const_round_shift_64bit(x0[1]); + x1[0] = dct_const_round_shift_64bit(x1[0]); + x1[1] = dct_const_round_shift_64bit(x1[1]); + x2[0] = dct_const_round_shift_64bit(x2[0]); + x2[1] = dct_const_round_shift_64bit(x2[1]); + x3[0] = dct_const_round_shift_64bit(x3[0]); + x3[1] = dct_const_round_shift_64bit(x3[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = 
dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + s0[0] = pack_4(x0[0], x0[1]); // s0 = x0; + s1[0] = pack_4(x1[0], x1[1]); // s1 = x1; + s2[0] = pack_4(x2[0], x2[1]); // s2 = x2; + s3[0] = pack_4(x3[0], x3[1]); // s3 = x3; + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 2 + x0[0] = _mm_add_epi32(s0[0], s2[0]); + x1[0] = _mm_add_epi32(s1[0], s3[0]); + x2[0] = _mm_sub_epi32(s0[0], s2[0]); + x3[0] = _mm_sub_epi32(s1[0], s3[0]); + + highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5); + highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6); + + x4[0] = _mm_add_epi64(s4[0], s6[0]); + x4[1] = _mm_add_epi64(s4[1], s6[1]); + x5[0] = _mm_add_epi64(s5[0], s7[0]); + x5[1] = _mm_add_epi64(s5[1], s7[1]); + x6[0] = _mm_sub_epi64(s4[0], s6[0]); + x6[1] = _mm_sub_epi64(s4[1], s6[1]); + x7[0] = _mm_sub_epi64(s5[0], s7[0]); + x7[1] = _mm_sub_epi64(s5[1], s7[1]); + x4[0] = dct_const_round_shift_64bit(x4[0]); + x4[1] = dct_const_round_shift_64bit(x4[1]); + x5[0] = dct_const_round_shift_64bit(x5[0]); + x5[1] = dct_const_round_shift_64bit(x5[1]); + x6[0] = dct_const_round_shift_64bit(x6[0]); + x6[1] = dct_const_round_shift_64bit(x6[1]); + x7[0] = dct_const_round_shift_64bit(x7[0]); + x7[1] = dct_const_round_shift_64bit(x7[1]); + x4[0] = pack_4(x4[0], x4[1]); + x5[0] = pack_4(x5[0], x5[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + // stage 3 + s2[0] = _mm_add_epi32(x2[0], x3[0]); + s3[0] = _mm_sub_epi32(x2[0], x3[0]); + s6[0] = _mm_add_epi32(x6[0], x7[0]); + s7[0] = _mm_sub_epi32(x6[0], x7[0]); + highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2); + highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3); + highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6); + highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7); + + 
x2[0] = dct_const_round_shift_64bit(s2[0]); + x2[1] = dct_const_round_shift_64bit(s2[1]); + x3[0] = dct_const_round_shift_64bit(s3[0]); + x3[1] = dct_const_round_shift_64bit(s3[1]); + x6[0] = dct_const_round_shift_64bit(s6[0]); + x6[1] = dct_const_round_shift_64bit(s6[1]); + x7[0] = dct_const_round_shift_64bit(s7[0]); + x7[1] = dct_const_round_shift_64bit(s7[1]); + x2[0] = pack_4(x2[0], x2[1]); + x3[0] = pack_4(x3[0], x3[1]); + x6[0] = pack_4(x6[0], x6[1]); + x7[0] = pack_4(x7[0], x7[1]); + + io[0] = x0[0]; + io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]); + io[2] = x6[0]; + io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]); + io[4] = x3[0]; + io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]); + io[6] = x5[0]; + io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]); +} + +void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int tx_type, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + 
io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_idct8_sse2(io_short); + } else { + iadst8_sse2(io_short); + } + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + if (tx_type == DCT_DCT || tx_type == ADST_DCT) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + highbd_iadst8_sse4_1(&io[8]); + } + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + + if (tx_type == DCT_DCT || tx_type == DCT_ADST) { + vpx_highbd_idct8x8_half1d_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + vpx_highbd_idct8x8_half1d_sse4_1(&io[8]); + } else { + highbd_iadst8_sse4_1(io); + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_iadst8_sse4_1(&io[8]); + } + highbd_idct8x8_final_round(io); + } + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index 6996260e2..ad693718c 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -10,8 +10,6 @@ #include "./vp9_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" -#include "vpx_ports/mem.h" void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { @@ -22,23 
+20,23 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[1] = load_input_data8(input + 8); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct4_sse2(in); idct4_sse2(in); break; - case 1: // ADST_DCT + case ADST_DCT: idct4_sse2(in); iadst4_sse2(in); break; - case 2: // DCT_ADST + case DCT_ADST: iadst4_sse2(in); idct4_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst4_sse2(in); iadst4_sse2(in); break; - default: assert(0); break; } // Final round and shift @@ -67,23 +65,23 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { - case 0: // DCT_DCT - idct8_sse2(in); - idct8_sse2(in); + case DCT_DCT: + vpx_idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 1: // ADST_DCT - idct8_sse2(in); + case ADST_DCT: + vpx_idct8_sse2(in); iadst8_sse2(in); break; - case 2: // DCT_ADST + case DCT_ADST: iadst8_sse2(in); - idct8_sse2(in); + vpx_idct8_sse2(in); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst8_sse2(in); iadst8_sse2(in); break; - default: assert(0); break; } // Final rounding and shift @@ -201,23 +199,23 @@ void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, load_buffer_8x16(input, in1); switch (tx_type) { - case 0: // DCT_DCT + case DCT_DCT: idct16_sse2(in0, in1); idct16_sse2(in0, in1); break; - case 1: // ADST_DCT + case ADST_DCT: idct16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - case 2: // DCT_ADST + case DCT_ADST: iadst16_sse2(in0, in1); idct16_sse2(in0, in1); break; - case 3: // ADST_ADST + default: + assert(tx_type == ADST_ADST); iadst16_sse2(in0, in1); iadst16_sse2(in0, in1); break; - default: assert(0); break; } write_buffer_8x16(dest, in0, stride); diff --git a/libvpx/vp9/decoder/vp9_decodeframe.c b/libvpx/vp9/decoder/vp9_decodeframe.c index 497a22459..c9c85053d 100644 --- a/libvpx/vp9/decoder/vp9_decodeframe.c +++ 
b/libvpx/vp9/decoder/vp9_decodeframe.c @@ -45,31 +45,11 @@ #define MAX_VP9_HEADER_SIZE 80 -static int is_compound_reference_allowed(const VP9_COMMON *cm) { - int i; - for (i = 1; i < REFS_PER_FRAME; ++i) - if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1]) return 1; - - return 0; -} - -static void setup_compound_reference_mode(VP9_COMMON *cm) { - if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) { - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; - } else if (cm->ref_frame_sign_bias[LAST_FRAME] == - cm->ref_frame_sign_bias[ALTREF_FRAME]) { - cm->comp_fixed_ref = GOLDEN_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } else { - cm->comp_fixed_ref = LAST_FRAME; - cm->comp_var_ref[0] = GOLDEN_FRAME; - cm->comp_var_ref[1] = ALTREF_FRAME; - } -} +typedef int (*predict_recon_func)(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size); + +typedef void (*intra_recon_func)(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, TX_SIZE tx_size); static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) { return len != 0 && len <= (size_t)(end - start); @@ -118,7 +98,7 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) { static REFERENCE_MODE read_frame_reference_mode(const VP9_COMMON *cm, vpx_reader *r) { - if (is_compound_reference_allowed(cm)) { + if (vp9_compound_reference_allowed(cm)) { return vpx_read_bit(r) ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT : COMPOUND_REFERENCE) : SINGLE_REFERENCE; @@ -351,6 +331,59 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd, } } +static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + PREDICTION_MODE mode = (plane == 0) ? 
mi->mode : mi->uv_mode; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + if (!mi->skip) { + struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_TYPE tx_type = + (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + const scan_order *sc = (plane || xd->lossless) + ? &vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; + *pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + /* Keep the alignment to 16 */ + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + } +} + +static void predict_and_reconstruct_intra_block_row_mt(TileWorkerData *twd, + MODE_INFO *const mi, + int plane, int row, + int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + PREDICTION_MODE mode = (plane == 0) ? mi->mode : mi->uv_mode; + uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; + + if (mi->sb_type < BLOCK_8X8) + if (plane == 0) mode = xd->mi[0]->bmi[(row << 1) + col].as_mode; + + vp9_predict_intra_block(xd, pd->n4_wl, tx_size, mode, dst, pd->dst.stride, + dst, pd->dst.stride, col, row, plane); + + if (!mi->skip) { + const TX_TYPE tx_type = + (plane || xd->lossless) ? 
DCT_DCT : intra_mode_to_tx_type_lookup[mode]; + if (*pd->eob > 0) { + inverse_transform_block_intra(xd, plane, tx_type, tx_size, dst, + pd->dst.stride, *pd->eob); + } + /* Keep the alignment to 16 */ + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + } +} + static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, int plane, int row, int col, TX_SIZE tx_size) { @@ -368,6 +401,41 @@ static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, return eob; } +static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, + int plane, int row, int col, + TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, + mi->segment_id); + + *pd->eob = eob; + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + + return eob; +} + +static int reconstruct_inter_block_row_mt(TileWorkerData *twd, + MODE_INFO *const mi, int plane, + int row, int col, TX_SIZE tx_size) { + MACROBLOCKD *const xd = &twd->xd; + struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = *pd->eob; + + (void)mi; + if (eob > 0) { + inverse_transform_block_inter( + xd, plane, tx_size, &pd->dst.buf[4 * row * pd->dst.stride + 4 * col], + pd->dst.stride, eob); + } + pd->dqcoeff += (16 << (tx_size << 1)); + pd->eob++; + + return eob; +} + static void build_mc_border(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int x, int y, int b_w, int b_h, int w, int h) { @@ -715,6 +783,25 @@ static void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh, int bwl, } } +static MODE_INFO *set_offsets_recon(VP9_COMMON *const cm, MACROBLOCKD *const xd, + int mi_row, int mi_col, int bw, int bh, + int bwl, int bhl) { + const int offset = mi_row * cm->mi_stride + mi_col; + const TileInfo *const tile = &xd->tile; + xd->mi = cm->mi_grid_visible + offset; + 
+ set_plane_n4(xd, bw, bh, bwl, bhl); + + set_skip_context(xd, mi_row, mi_col); + + // Distance of Mb to the various image edges. These are specified to 8th pel + // as they are always compared to values that are in 1/8th pel units + set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols); + + vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); + return xd->mi[0]; +} + static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, BLOCK_SIZE bsize, int mi_row, int mi_col, int bw, int bh, int x_mis, int y_mis, int bwl, int bhl) { @@ -744,6 +831,66 @@ static MODE_INFO *set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd, return xd->mi[0]; } +static INLINE int predict_recon_inter(MACROBLOCKD *xd, MODE_INFO *mi, + TileWorkerData *twd, + predict_recon_func func) { + int eobtotal = 0; + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 
0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + eobtotal += func(twd, mi, plane, row, col, tx_size); + } + return eobtotal; +} + +static INLINE void predict_recon_intra(MACROBLOCKD *xd, MODE_INFO *mi, + TileWorkerData *twd, + intra_recon_func func) { + int plane; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; + const int num_4x4_w = pd->n4_w; + const int num_4x4_h = pd->n4_h; + const int step = (1 << tx_size); + int row, col; + const int max_blocks_wide = + num_4x4_w + (xd->mb_to_right_edge >= 0 + ? 0 + : xd->mb_to_right_edge >> (5 + pd->subsampling_x)); + const int max_blocks_high = + num_4x4_h + (xd->mb_to_bottom_edge >= 0 + ? 0 + : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + xd->max_blocks_wide = xd->mb_to_right_edge >= 0 ? 0 : max_blocks_wide; + xd->max_blocks_high = xd->mb_to_bottom_edge >= 0 ? 
0 : max_blocks_high; + + for (row = 0; row < max_blocks_high; row += step) + for (col = 0; col < max_blocks_wide; col += step) + func(twd, mi, plane, row, col, tx_size); + } +} + static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { VP9_COMMON *const cm = &pbi->common; @@ -844,6 +991,81 @@ static void decode_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, } } +static void recon_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + MACROBLOCKD *const xd = &twd->xd; + + MODE_INFO *mi = set_offsets_recon(cm, xd, mi_row, mi_col, bw, bh, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } + + if (!is_inter_block(mi)) { + predict_recon_intra(xd, mi, twd, + predict_and_reconstruct_intra_block_row_mt); + } else { + // Prediction + dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col); + + // Reconstruction + if (!mi->skip) { + predict_recon_inter(xd, mi, twd, reconstruct_inter_block_row_mt); + } + } + + vp9_build_mask(cm, mi, mi_row, mi_col, bw, bh); +} + +static void parse_block(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, + int mi_col, BLOCK_SIZE bsize, int bwl, int bhl) { + VP9_COMMON *const cm = &pbi->common; + const int less8x8 = bsize < BLOCK_8X8; + const int bw = 1 << (bwl - 1); + const int bh = 1 << (bhl - 1); + const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col); + const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row); + vpx_reader *r = &twd->bit_reader; + MACROBLOCKD *const xd = &twd->xd; + + MODE_INFO *mi = set_offsets(cm, xd, bsize, mi_row, mi_col, 
bw, bh, x_mis, + y_mis, bwl, bhl); + + if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) { + const BLOCK_SIZE uv_subsize = + ss_size_lookup[bsize][cm->subsampling_x][cm->subsampling_y]; + if (uv_subsize == BLOCK_INVALID) + vpx_internal_error(xd->error_info, VPX_CODEC_CORRUPT_FRAME, + "Invalid block size."); + } + + vp9_read_mode_info(twd, pbi, mi_row, mi_col, x_mis, y_mis); + + if (mi->skip) { + dec_reset_skip_context(xd); + } + + if (!is_inter_block(mi)) { + predict_recon_intra(xd, mi, twd, parse_intra_block_row_mt); + } else { + if (!mi->skip) { + const int eobtotal = + predict_recon_inter(xd, mi, twd, parse_inter_block_row_mt); + + if (!less8x8 && eobtotal == 0) mi->skip = 1; // skip loopfilter + } + } + + xd->corrupted |= vpx_reader_has_error(r); +} + static INLINE int dec_partition_plane_context(TileWorkerData *twd, int mi_row, int mi_col, int bsl) { const PARTITION_CONTEXT *above_ctx = twd->xd.above_seg_context + mi_col; @@ -950,6 +1172,118 @@ static void decode_partition(TileWorkerData *twd, VP9Decoder *const pbi, dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); } +static void recon_partition(TileWorkerData *twd, VP9Decoder *const pbi, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int n4x4_l2) { + VP9_COMMON *const cm = &pbi->common; + const int n8x8_l2 = n4x4_l2 - 1; + const int num_8x8_wh = 1 << n8x8_l2; + const int hbs = num_8x8_wh >> 1; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + MACROBLOCKD *const xd = &twd->xd; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + partition = *xd->partition; + xd->partition++; + + subsize = get_subsize(bsize, partition); + if (!hbs) { + // calculate bmode block dimensions (log 2) + xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); + xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); + recon_block(twd, pbi, mi_row, mi_col, subsize, 1, 
1); + } else { + switch (partition) { + case PARTITION_NONE: + recon_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2); + break; + case PARTITION_HORZ: + recon_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2); + if (has_rows) + recon_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2, + n8x8_l2); + break; + case PARTITION_VERT: + recon_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2); + if (has_cols) + recon_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + n4x4_l2); + break; + case PARTITION_SPLIT: + recon_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2); + recon_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2); + recon_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2); + recon_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, n8x8_l2); + break; + default: assert(0 && "Invalid partition type"); + } + } +} + +static void parse_partition(TileWorkerData *twd, VP9Decoder *const pbi, + int mi_row, int mi_col, BLOCK_SIZE bsize, + int n4x4_l2) { + VP9_COMMON *const cm = &pbi->common; + const int n8x8_l2 = n4x4_l2 - 1; + const int num_8x8_wh = 1 << n8x8_l2; + const int hbs = num_8x8_wh >> 1; + PARTITION_TYPE partition; + BLOCK_SIZE subsize; + const int has_rows = (mi_row + hbs) < cm->mi_rows; + const int has_cols = (mi_col + hbs) < cm->mi_cols; + MACROBLOCKD *const xd = &twd->xd; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + *xd->partition = + read_partition(twd, mi_row, mi_col, has_rows, has_cols, n8x8_l2); + + partition = *xd->partition; + xd->partition++; + + subsize = get_subsize(bsize, partition); + if (!hbs) { + // calculate bmode block dimensions (log 2) + xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT); + xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ); + parse_block(twd, pbi, mi_row, mi_col, subsize, 1, 1); + } else { + switch (partition) { + case PARTITION_NONE: + parse_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n4x4_l2); + break; + case 
PARTITION_HORZ: + parse_block(twd, pbi, mi_row, mi_col, subsize, n4x4_l2, n8x8_l2); + if (has_rows) + parse_block(twd, pbi, mi_row + hbs, mi_col, subsize, n4x4_l2, + n8x8_l2); + break; + case PARTITION_VERT: + parse_block(twd, pbi, mi_row, mi_col, subsize, n8x8_l2, n4x4_l2); + if (has_cols) + parse_block(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2, + n4x4_l2); + break; + case PARTITION_SPLIT: + parse_partition(twd, pbi, mi_row, mi_col, subsize, n8x8_l2); + parse_partition(twd, pbi, mi_row, mi_col + hbs, subsize, n8x8_l2); + parse_partition(twd, pbi, mi_row + hbs, mi_col, subsize, n8x8_l2); + parse_partition(twd, pbi, mi_row + hbs, mi_col + hbs, subsize, n8x8_l2); + break; + default: assert(0 && "Invalid partition type"); + } + } + + // update partition context + if (bsize >= BLOCK_8X8 && + (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) + dec_update_partition_context(twd, mi_row, mi_col, subsize, num_8x8_wh); +} + static void setup_token_decoder(const uint8_t *data, const uint8_t *data_end, size_t read_size, struct vpx_internal_error_info *error_info, @@ -1148,9 +1482,15 @@ static void resize_context_buffers(VP9_COMMON *cm, int width, int height) { // Allocations in vp9_alloc_context_buffers() depend on individual // dimensions as well as the overall size. if (new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows) { - if (vp9_alloc_context_buffers(cm, width, height)) + if (vp9_alloc_context_buffers(cm, width, height)) { + // The cm->mi_* values have been cleared and any existing context + // buffers have been freed. Clear cm->width and cm->height to be + // consistent and to force a realloc next time. 
+ cm->width = 0; + cm->height = 0; vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate context buffers"); + } } else { vp9_set_mb_mi(cm, width, height); } @@ -1426,7 +1766,27 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, vp9_zero(tile_data->xd.left_seg_context); for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + if (pbi->row_mt == 1) { + int plane; + RowMTWorkerData *const row_mt_worker_data = pbi->row_mt_worker_data; + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane]; + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane]; + } + tile_data->xd.partition = row_mt_worker_data->partition; + parse_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + + for (plane = 0; plane < MAX_MB_PLANE; ++plane) { + tile_data->xd.plane[plane].eob = row_mt_worker_data->eob[plane]; + tile_data->xd.plane[plane].dqcoeff = + row_mt_worker_data->dqcoeff[plane]; + } + tile_data->xd.partition = row_mt_worker_data->partition; + recon_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + } else { + decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); + } } pbi->mb.corrupted |= tile_data->xd.corrupted; if (pbi->mb.corrupted) @@ -1471,6 +1831,25 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, return vpx_reader_find_end(&tile_data->bit_reader); } +static void set_rows_after_error(VP9LfSync *lf_sync, int start_row, int mi_rows, + int num_tiles_left, int total_num_tiles) { + do { + int mi_row; + const int aligned_rows = mi_cols_aligned_to_sb(mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int corrupted = 1; + for (mi_row = start_row; mi_row < mi_rows; mi_row += MI_BLOCK_SIZE) { + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, 
total_num_tiles, mi_row >> MI_BLOCK_SIZE_LOG2, + is_last_row, corrupted); + } + /* If there are multiple tiles, the second tile should start marking row + * progress from row 0. + */ + start_row = 0; + } while (num_tiles_left--); +} + // On entry 'tile_data->data_end' points to the end of the input frame, on exit // it is updated to reflect the bitreader position of the final tile column if // present in the tile buffer group or NULL otherwise. @@ -1481,6 +1860,12 @@ static int tile_worker_hook(void *arg1, void *arg2) { TileInfo *volatile tile = &tile_data->xd.tile; const int final_col = (1 << pbi->common.log2_tile_cols) - 1; const uint8_t *volatile bit_reader_end = NULL; + VP9_COMMON *cm = &pbi->common; + + LFWorkerData *lf_data = tile_data->lf_data; + VP9LfSync *lf_sync = tile_data->lf_sync; + + volatile int mi_row = 0; volatile int n = tile_data->buf_start; tile_data->error_info.setjmp = 1; @@ -1488,14 +1873,26 @@ static int tile_worker_hook(void *arg1, void *arg2) { tile_data->error_info.setjmp = 0; tile_data->xd.corrupted = 1; tile_data->data_end = NULL; + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int num_tiles_left = tile_data->buf_end - n; + const int mi_row_start = mi_row; + set_rows_after_error(lf_sync, mi_row_start, cm->mi_rows, num_tiles_left, + 1 << cm->log2_tile_cols); + } return 0; } tile_data->xd.corrupted = 0; do { - int mi_row, mi_col; + int mi_col; const TileBuffer *const buf = pbi->tile_buffers + n; + + /* Initialize to 0 is safe since we do not deal with streams that have + * more than one row of tiles. 
(So tile->mi_row_start will be 0) + */ + assert(cm->log2_tile_rows == 0); + mi_row = 0; vp9_zero(tile_data->dqcoeff); vp9_tile_init(tile, &pbi->common, 0, buf->col); setup_token_decoder(buf->data, tile_data->data_end, buf->size, @@ -1513,6 +1910,14 @@ static int tile_worker_hook(void *arg1, void *arg2) { mi_col += MI_BLOCK_SIZE) { decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4); } + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2); + const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2); + vp9_set_row(lf_sync, 1 << cm->log2_tile_cols, + mi_row >> MI_BLOCK_SIZE_LOG2, is_last_row, + tile_data->xd.corrupted); + } } if (buf->col == final_col) { @@ -1520,6 +1925,21 @@ static int tile_worker_hook(void *arg1, void *arg2) { } } while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end); + if (pbi->lpf_mt_opt && n < tile_data->buf_end && cm->lf.filter_level && + !cm->skip_loop_filter) { + /* This was not incremented in the tile loop, so increment before tiles left + * calculation + */ + ++n; + set_rows_after_error(lf_sync, 0, cm->mi_rows, tile_data->buf_end - n, + 1 << cm->log2_tile_cols); + } + + if (pbi->lpf_mt_opt && !tile_data->xd.corrupted && cm->lf.filter_level && + !cm->skip_loop_filter) { + vp9_loopfilter_rows(lf_data, lf_sync); + } + tile_data->data_end = bit_reader_end; return !tile_data->xd.corrupted; } @@ -1536,6 +1956,8 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, VP9_COMMON *const cm = &pbi->common; const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); const uint8_t *bit_reader_end = NULL; + VP9LfSync *lf_row_sync = &pbi->lf_row_sync; + YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm); const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << 
cm->log2_tile_rows; @@ -1562,12 +1984,26 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, } } + // Initialize LPF + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level, + pbi->num_tile_workers); + } + // Reset tile decoding hook for (n = 0; n < num_workers; ++n) { VPxWorker *const worker = &pbi->tile_workers[n]; TileWorkerData *const tile_data = &pbi->tile_worker_data[n + pbi->total_tiles]; winterface->sync(worker); + + if (pbi->lpf_mt_opt && cm->lf.filter_level && !cm->skip_loop_filter) { + tile_data->lf_sync = lf_row_sync; + tile_data->lf_data = &tile_data->lf_sync->lfdata[n]; + vp9_loop_filter_data_reset(tile_data->lf_data, new_fb, cm, pbi->mb.plane); + tile_data->lf_data->y_only = 0; + } + tile_data->xd = pbi->mb; tile_data->xd.counts = cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; @@ -1724,6 +2160,22 @@ static void read_bitdepth_colorspace_sampling(VP9_COMMON *cm, } } +static INLINE void flush_all_fb_on_key(VP9_COMMON *cm) { + if (cm->frame_type == KEY_FRAME && cm->current_video_frame > 0) { + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + BufferPool *const pool = cm->buffer_pool; + int i; + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (i == cm->new_fb_idx) continue; + frame_bufs[i].ref_count = 0; + if (!frame_bufs[i].released) { + pool->release_fb_cb(pool->cb_priv, &frame_bufs[i].raw_frame_buffer); + frame_bufs[i].released = 1; + } + } + } +} + static size_t read_uncompressed_header(VP9Decoder *pbi, struct vpx_read_bit_buffer *rb) { VP9_COMMON *const cm = &pbi->common; @@ -1788,6 +2240,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, setup_frame_size(cm, rb); if (pbi->need_resync) { memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map)); + flush_all_fb_on_key(cm); pbi->need_resync = 0; } } else { @@ -1911,6 +2364,28 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, setup_segmentation_dequant(cm); 
setup_tile_info(cm, rb); + if (pbi->row_mt == 1) { + int num_sbs = 1; + + if (pbi->row_mt_worker_data == NULL) { + CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data, + vpx_calloc(1, sizeof(*pbi->row_mt_worker_data))); + } + + if (pbi->max_threads > 1) { + const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int sb_cols = aligned_cols >> MI_BLOCK_SIZE_LOG2; + const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows); + const int sb_rows = aligned_rows >> MI_BLOCK_SIZE_LOG2; + + num_sbs = sb_cols * sb_rows; + } + + if (num_sbs > pbi->row_mt_worker_data->num_sbs) { + vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); + vp9_dec_alloc_row_mt_mem(pbi->row_mt_worker_data, cm, num_sbs); + } + } sz = vpx_rb_read_literal(rb, 16); if (sz == 0) @@ -1953,7 +2428,7 @@ static int read_compressed_header(VP9Decoder *pbi, const uint8_t *data, cm->reference_mode = read_frame_reference_mode(cm, &r); if (cm->reference_mode != SINGLE_REFERENCE) - setup_compound_reference_mode(cm); + vp9_setup_compound_reference_mode(cm); read_frame_reference_mode_probs(cm, &r); for (j = 0; j < BLOCK_SIZE_GROUPS; j++) @@ -2072,17 +2547,19 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) { // Multi-threaded tile decoder *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end); - if (!xd->corrupted) { - if (!cm->skip_loop_filter) { - // If multiple threads are used to decode tiles, then we use those - // threads to do parallel loopfiltering. - vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level, - 0, 0, pbi->tile_workers, pbi->num_tile_workers, - &pbi->lf_row_sync); + if (!pbi->lpf_mt_opt) { + if (!xd->corrupted) { + if (!cm->skip_loop_filter) { + // If multiple threads are used to decode tiles, then we use those + // threads to do parallel loopfiltering. 
+ vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, + cm->lf.filter_level, 0, 0, pbi->tile_workers, + pbi->num_tile_workers, &pbi->lf_row_sync); + } + } else { + vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, + "Decode failed. Frame data is corrupted."); } - } else { - vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, - "Decode failed. Frame data is corrupted."); } } else { *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end); diff --git a/libvpx/vp9/decoder/vp9_decodeframe.h b/libvpx/vp9/decoder/vp9_decodeframe.h index 44717f546..ba95e7234 100644 --- a/libvpx/vp9/decoder/vp9_decodeframe.h +++ b/libvpx/vp9/decoder/vp9_decodeframe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DECODEFRAME_H_ -#define VP9_DECODER_VP9_DECODEFRAME_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODEFRAME_H_ +#define VPX_VP9_DECODER_VP9_DECODEFRAME_H_ #ifdef __cplusplus extern "C" { @@ -32,4 +32,4 @@ void vp9_decode_frame(struct VP9Decoder *pbi, const uint8_t *data, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODEFRAME_H_ +#endif // VPX_VP9_DECODER_VP9_DECODEFRAME_H_ diff --git a/libvpx/vp9/decoder/vp9_decodemv.h b/libvpx/vp9/decoder/vp9_decodemv.h index b460cb8fb..11b45ace0 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.h +++ b/libvpx/vp9/decoder/vp9_decodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_DECODER_VP9_DECODEMV_H_ -#define VP9_DECODER_VP9_DECODEMV_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODEMV_H_ +#define VPX_VP9_DECODER_VP9_DECODEMV_H_ #include "vpx_dsp/bitreader.h" @@ -26,4 +26,4 @@ void vp9_read_mode_info(TileWorkerData *twd, VP9Decoder *const pbi, int mi_row, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODEMV_H_ +#endif // VPX_VP9_DECODER_VP9_DECODEMV_H_ diff --git a/libvpx/vp9/decoder/vp9_decoder.c b/libvpx/vp9/decoder/vp9_decoder.c index a913fa560..7fde0b07f 100644 --- a/libvpx/vp9/decoder/vp9_decoder.c +++ b/libvpx/vp9/decoder/vp9_decoder.c @@ -55,6 +55,43 @@ static void vp9_dec_setup_mi(VP9_COMMON *cm) { cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base)); } +void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, + VP9_COMMON *cm, int num_sbs) { + int plane; + const size_t dqcoeff_size = (num_sbs << DQCOEFFS_PER_SB_LOG2) * + sizeof(*row_mt_worker_data->dqcoeff[0]); + row_mt_worker_data->num_sbs = num_sbs; + for (plane = 0; plane < 3; ++plane) { + CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane], + vpx_memalign(16, dqcoeff_size)); + memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size); + CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane], + vpx_calloc(num_sbs << EOBS_PER_SB_LOG2, + sizeof(*row_mt_worker_data->eob[plane]))); + } + CHECK_MEM_ERROR(cm, row_mt_worker_data->partition, + vpx_calloc(num_sbs * PARTITIONS_PER_SB, + sizeof(*row_mt_worker_data->partition))); + CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map, + vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map))); +} + +void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data) { + if (row_mt_worker_data != NULL) { + int plane; + for (plane = 0; plane < 3; ++plane) { + vpx_free(row_mt_worker_data->eob[plane]); + row_mt_worker_data->eob[plane] = NULL; + vpx_free(row_mt_worker_data->dqcoeff[plane]); + row_mt_worker_data->dqcoeff[plane] = NULL; + } + vpx_free(row_mt_worker_data->partition); + row_mt_worker_data->partition 
= NULL; + vpx_free(row_mt_worker_data->recon_map); + row_mt_worker_data->recon_map = NULL; + } +} + static int vp9_dec_alloc_mi(VP9_COMMON *cm, int mi_size) { cm->mip = vpx_calloc(mi_size, sizeof(*cm->mip)); if (!cm->mip) return 1; @@ -69,6 +106,7 @@ static void vp9_dec_free_mi(VP9_COMMON *cm) { cm->mip = NULL; vpx_free(cm->mi_grid_base); cm->mi_grid_base = NULL; + cm->mi_alloc_size = 0; } VP9Decoder *vp9_decoder_create(BufferPool *const pool) { @@ -139,6 +177,10 @@ void vp9_decoder_remove(VP9Decoder *pbi) { vp9_loop_filter_dealloc(&pbi->lf_row_sync); } + if (pbi->row_mt == 1) { + vp9_dec_free_row_mt_mem(pbi->row_mt_worker_data); + vpx_free(pbi->row_mt_worker_data); + } vp9_remove_common(&pbi->common); vpx_free(pbi); } @@ -260,6 +302,44 @@ static void swap_frame_buffers(VP9Decoder *pbi) { cm->frame_refs[ref_index].idx = -1; } +static void release_fb_on_decoder_exit(VP9Decoder *pbi) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VP9_COMMON *volatile const cm = &pbi->common; + BufferPool *volatile const pool = cm->buffer_pool; + RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs; + int i; + + // Synchronize all threads immediately as a subsequent decode call may + // cause a resize invalidating some allocations. + winterface->sync(&pbi->lf_worker); + for (i = 0; i < pbi->num_tile_workers; ++i) { + winterface->sync(&pbi->tile_workers[i]); + } + + // Release all the reference buffers if worker thread is holding them. + if (pbi->hold_ref_buf == 1) { + int ref_index = 0, mask; + for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { + const int old_idx = cm->ref_frame_map[ref_index]; + // Current thread releases the holding of reference frame. + decrease_ref_count(old_idx, frame_bufs, pool); + + // Release the reference frame in reference map. + if (mask & 1) { + decrease_ref_count(old_idx, frame_bufs, pool); + } + ++ref_index; + } + + // Current thread releases the holding of reference frame. 
+ for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { + const int old_idx = cm->ref_frame_map[ref_index]; + decrease_ref_count(old_idx, frame_bufs, pool); + } + pbi->hold_ref_buf = 0; + } +} + int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, const uint8_t **psource) { VP9_COMMON *volatile const cm = &pbi->common; @@ -297,6 +377,9 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, // Find a free frame buffer. Return error if can not find any. cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) { + pbi->ready_for_new_data = 1; + release_fb_on_decoder_exit(pbi); + vpx_clear_system_state(); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Unable to find free frame buffer"); return cm->error.error_code; @@ -309,44 +392,11 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; if (setjmp(cm->error.jmp)) { - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - int i; - cm->error.setjmp = 0; pbi->ready_for_new_data = 1; - - // Synchronize all threads immediately as a subsequent decode call may - // cause a resize invalidating some allocations. - winterface->sync(&pbi->lf_worker); - for (i = 0; i < pbi->num_tile_workers; ++i) { - winterface->sync(&pbi->tile_workers[i]); - } - - // Release all the reference buffers if worker thread is holding them. - if (pbi->hold_ref_buf == 1) { - int ref_index = 0, mask; - for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { - const int old_idx = cm->ref_frame_map[ref_index]; - // Current thread releases the holding of reference frame. - decrease_ref_count(old_idx, frame_bufs, pool); - - // Release the reference frame in reference map. - if (mask & 1) { - decrease_ref_count(old_idx, frame_bufs, pool); - } - ++ref_index; - } - - // Current thread releases the holding of reference frame. 
- for (; ref_index < REF_FRAMES && !cm->show_existing_frame; ++ref_index) { - const int old_idx = cm->ref_frame_map[ref_index]; - decrease_ref_count(old_idx, frame_bufs, pool); - } - pbi->hold_ref_buf = 0; - } + release_fb_on_decoder_exit(pbi); // Release current frame. decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); - vpx_clear_system_state(); return -1; } @@ -364,6 +414,8 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, if (cm->seg.enabled) vp9_swap_current_and_last_seg_map(cm); } + if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx; + // Update progress in frame parallel decode. cm->last_width = cm->width; cm->last_height = cm->height; @@ -394,7 +446,7 @@ int vp9_get_raw_frame(VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, #if CONFIG_VP9_POSTPROC if (!cm->show_existing_frame) { - ret = vp9_post_proc_frame(cm, sd, flags); + ret = vp9_post_proc_frame(cm, sd, flags, cm->width); } else { *sd = *cm->frame_to_show; ret = 0; diff --git a/libvpx/vp9/decoder/vp9_decoder.h b/libvpx/vp9/decoder/vp9_decoder.h index 4b26c314d..9a582fffb 100644 --- a/libvpx/vp9/decoder/vp9_decoder.h +++ b/libvpx/vp9/decoder/vp9_decoder.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DECODER_H_ -#define VP9_DECODER_VP9_DECODER_H_ +#ifndef VPX_VP9_DECODER_VP9_DECODER_H_ +#define VPX_VP9_DECODER_VP9_DECODER_H_ #include "./vpx_config.h" @@ -26,6 +26,10 @@ extern "C" { #endif +#define EOBS_PER_SB_LOG2 8 +#define DQCOEFFS_PER_SB_LOG2 12 +#define PARTITIONS_PER_SB 85 + typedef struct TileBuffer { const uint8_t *data; size_t size; @@ -37,12 +41,22 @@ typedef struct TileWorkerData { int buf_start, buf_end; // pbi->tile_buffers to decode, inclusive vpx_reader bit_reader; FRAME_COUNTS counts; + LFWorkerData *lf_data; + VP9LfSync *lf_sync; DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. 
So planes must be decoded serially */ DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); struct vpx_internal_error_info error_info; } TileWorkerData; +typedef struct RowMTWorkerData { + int num_sbs; + int *eob[MAX_MB_PLANE]; + PARTITION_TYPE *partition; + tran_low_t *dqcoeff[MAX_MB_PLANE]; + int8_t *recon_map; +} RowMTWorkerData; + typedef struct VP9Decoder { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -72,10 +86,14 @@ typedef struct VP9Decoder { int inv_tile_order; int need_resync; // wait for key/intra-only frame. int hold_ref_buf; // hold the reference buffer. + + int row_mt; + int lpf_mt_opt; + RowMTWorkerData *row_mt_worker_data; } VP9Decoder; int vp9_receive_compressed_data(struct VP9Decoder *pbi, size_t size, - const uint8_t **dest); + const uint8_t **psource); int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, vp9_ppflags_t *flags); @@ -109,6 +127,10 @@ struct VP9Decoder *vp9_decoder_create(BufferPool *const pool); void vp9_decoder_remove(struct VP9Decoder *pbi); +void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, + VP9_COMMON *cm, int num_sbs); +void vp9_dec_free_row_mt_mem(RowMTWorkerData *row_mt_worker_data); + static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, BufferPool *const pool) { if (idx >= 0 && frame_bufs[idx].ref_count > 0) { @@ -129,4 +151,4 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DECODER_H_ +#endif // VPX_VP9_DECODER_VP9_DECODER_H_ diff --git a/libvpx/vp9/decoder/vp9_detokenize.h b/libvpx/vp9/decoder/vp9_detokenize.h index 7b0d87601..a32052fff 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.h +++ b/libvpx/vp9/decoder/vp9_detokenize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_DECODER_VP9_DETOKENIZE_H_ -#define VP9_DECODER_VP9_DETOKENIZE_H_ +#ifndef VPX_VP9_DECODER_VP9_DETOKENIZE_H_ +#define VPX_VP9_DECODER_VP9_DETOKENIZE_H_ #include "vpx_dsp/bitreader.h" #include "vp9/decoder/vp9_decoder.h" @@ -27,4 +27,4 @@ int vp9_decode_block_tokens(TileWorkerData *twd, int plane, } // extern "C" #endif -#endif // VP9_DECODER_VP9_DETOKENIZE_H_ +#endif // VPX_VP9_DECODER_VP9_DETOKENIZE_H_ diff --git a/libvpx/vp9/decoder/vp9_dsubexp.h b/libvpx/vp9/decoder/vp9_dsubexp.h index 5a8ec8300..b0c775073 100644 --- a/libvpx/vp9/decoder/vp9_dsubexp.h +++ b/libvpx/vp9/decoder/vp9_dsubexp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_DECODER_VP9_DSUBEXP_H_ -#define VP9_DECODER_VP9_DSUBEXP_H_ +#ifndef VPX_VP9_DECODER_VP9_DSUBEXP_H_ +#define VPX_VP9_DECODER_VP9_DSUBEXP_H_ #include "vpx_dsp/bitreader.h" @@ -23,4 +23,4 @@ void vp9_diff_update_prob(vpx_reader *r, vpx_prob *p); } // extern "C" #endif -#endif // VP9_DECODER_VP9_DSUBEXP_H_ +#endif // VPX_VP9_DECODER_VP9_DSUBEXP_H_ diff --git a/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c index 513718e7c..f8dd0a6f7 100644 --- a/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -23,13 +23,13 @@ void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { tran_low_t temp_buffer[64]; (void)coeff_ptr; vpx_fdct8x8_neon(input, temp_buffer, stride); vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, round_ptr, quant_ptr, - qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan_ptr, - iscan_ptr); + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan); } diff --git 
a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c index 97a09bdff..8b62b450c 100644 --- a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -97,6 +97,9 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); } +#ifdef __aarch64__ + *eob_ptr = vmaxvq_s16(v_eobmax_76543210); +#else { const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); @@ -111,6 +114,7 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); } +#endif // __aarch64__ } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -122,7 +126,7 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan_ptr) { + const int16_t *scan, const int16_t *iscan) { const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); @@ -134,8 +138,8 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); // Process dc and the first seven ac coeffs. 
- const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); const int16x8_t coeff_abs = vabsq_s16(coeff); @@ -169,12 +173,12 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); - eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); store_s16q_to_tran_low(qcoeff_ptr, qcoeff); store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); - iscan_ptr += 8; + iscan += 8; coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; @@ -188,8 +192,8 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, // Process the rest of the ac coeffs. for (i = 8; i < 32 * 32; i += 8) { - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); const int16x8_t coeff_abs = vabsq_s16(coeff); @@ -215,17 +219,20 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); eob_max = - vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); store_s16q_to_tran_low(qcoeff_ptr, qcoeff); store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); - iscan_ptr += 8; + iscan += 8; coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; } +#ifdef __aarch64__ + *eob_ptr = vmaxvq_u16(eob_max); +#else { const uint16x4_t eob_max_0 = vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); @@ -233,5 +240,6 @@ void 
vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } +#endif // __aarch64__ } } diff --git a/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h b/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h index 794bec70b..fa1af2fc5 100644 --- a/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h +++ b/libvpx/vp9/encoder/mips/msa/vp9_fdct_msa.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ -#define VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ +#ifndef VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ +#define VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ #include "vpx_dsp/mips/fwd_txfm_msa.h" #include "vpx_dsp/mips/txfm_macros_msa.h" @@ -113,4 +113,4 @@ PCKEV_H4_SH(in0_r_m, in0_r_m, in1_r_m, in1_r_m, s2_m, s2_m, s3_m, s3_m, \ out0, out1, out2, out3); \ } -#endif /* VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ */ +#endif // VPX_VP9_ENCODER_MIPS_MSA_VP9_FDCT_MSA_H_ diff --git a/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c b/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c new file mode 100644 index 000000000..4f88b8fff --- /dev/null +++ b/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" + +#include "./vp9_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and return the high 16 bits of the intermediate integers. 
+// (a * b) >> 16 +// Note: Because this is done in 2 operations, a and b cannot both be UINT16_MIN +static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) { + // madds does ((A * B) >> 15) + C, we need >> 16, so we perform an extra right + // shift. + return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_u16); +} + +// Negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vec_sra(b, vec_shift_sign_s16); + return vec_xor(vec_add(a, mask), mask); +} + +// Compare packed 16-bit integers across a, and return the maximum value in +// every element. Returns a vector containing the biggest value across vector a. +static INLINE int16x8_t vec_max_across(int16x8_t a) { + a = vec_max(a, vec_perm(a, a, vec_perm64)); + a = vec_max(a, vec_perm(a, a, vec_perm32)); + return vec_max(a, vec_perm(a, a, vec_perm16)); +} + +void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t zero_coeff0, zero_coeff1; + + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); + + (void)scan; + (void)skip_block; + assert(!skip_block); + + // First set of 8 coeff starts with DC + 7 AC + qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + + dqcoeff0 = vec_mladd(qcoeff0, 
dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + + // Remove DC value from round and quant + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + + // Remove DC value from dequant + dequant = vec_splat(dequant, 1); + + // Second set of 8 coeff starts with (all AC) + qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1)); + + // We quantize 16 coeff up front (enough for a 4x4) and process 24 coeff per + // loop iteration. + // for 8x8: 16 + 2 x 24 = 64 + // for 16x16: 16 + 10 x 24 = 256 + if (n_coeffs > 16) { + int16x8_t coeff2, qcoeff2, dqcoeff2, eob2, scan2; + bool16x8_t zero_coeff2; + + int index = 16; + int off0 = 32; + int off1 = 48; + int off2 = 64; + + do { + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + scan0 = vec_vsx_ld(off0, iscan); + scan1 = vec_vsx_ld(off1, iscan); + scan2 = vec_vsx_ld(off2, iscan); + + qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + + qcoeff1 = vec_mulhi(vec_vaddshs(vec_abs(coeff1), round), quant); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + + qcoeff2 = vec_mulhi(vec_vaddshs(vec_abs(coeff2), round), quant); + zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16); + qcoeff2 = vec_sign(qcoeff2, coeff2); + 
vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = vec_max(eob, vec_or(scan0, zero_coeff0)); + eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2)); + eob = vec_max(eob, eob2); + + index += 24; + off0 += 48; + off1 += 48; + off2 += 48; + } while (index < n_coeffs); + } + + eob = vec_max_across(eob); + *eob_ptr = eob[0] + 1; +} + +// Sets the value of a 32-bit integers to 1 when the corresponding value in a is +// negative. +static INLINE int32x4_t vec_is_neg(int32x4_t a) { + return vec_sr(a, vec_shift_sign_s32); +} + +// DeQuantization function used for 32x32 blocks. Quantized coeff of 32x32 +// blocks are twice as big as for other block sizes. As such, using +// vec_mladd results in overflow. +static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff, + int16x8_t dequant) { + int32x4_t dqcoeffe = vec_mule(qcoeff, dequant); + int32x4_t dqcoeffo = vec_mulo(qcoeff, dequant); + // Add 1 if negative to round towards zero because the C uses division. 
+ dqcoeffe = vec_add(dqcoeffe, vec_is_neg(dqcoeffe)); + dqcoeffo = vec_add(dqcoeffo, vec_is_neg(dqcoeffo)); + dqcoeffe = vec_sra(dqcoeffe, vec_ones_u32); + dqcoeffo = vec_sra(dqcoeffo, vec_ones_u32); + return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack); +} + +void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + // In stage 1, we quantize 16 coeffs (DC + 15 AC) + // In stage 2, we loop 42 times and quantize 24 coeffs per iteration + // (32 * 32 - 16) / 24 = 42 + int num_itr = 42; + // Offsets are in bytes, 16 coeffs = 32 bytes + int off0 = 32; + int off1 = 48; + int off2 = 64; + + int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; + bool16x8_t mask0, mask1, zero_coeff0, zero_coeff1; + + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); + int16x8_t thres = vec_sra(dequant, vec_splats((uint16_t)2)); + int16x8_t abs_coeff0 = vec_abs(coeff0); + int16x8_t abs_coeff1 = vec_abs(coeff1); + + (void)scan; + (void)skip_block; + (void)n_coeffs; + assert(!skip_block); + + mask0 = vec_cmpge(abs_coeff0, thres); + round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16); + // First set of 8 coeff starts with DC + 7 AC + qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16); + qcoeff0 = vec_and(qcoeff0, mask0); + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + qcoeff0 = vec_sign(qcoeff0, coeff0); + vec_vsx_st(qcoeff0, 0, qcoeff_ptr); + + dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant); + vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr); + 
+ // Remove DC value from thres, round, quant and dequant + thres = vec_splat(thres, 1); + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + dequant = vec_splat(dequant, 1); + + mask1 = vec_cmpge(abs_coeff1, thres); + + // Second set of 8 coeff starts with (all AC) + qcoeff1 = + vec_madds(vec_vaddshs(vec_abs(coeff1), round), quant, vec_zeros_s16); + qcoeff1 = vec_and(qcoeff1, mask1); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + qcoeff1 = vec_sign(qcoeff1, coeff1); + vec_vsx_st(qcoeff1, 16, qcoeff_ptr); + + dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant); + vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr); + + eob = vec_max(vec_or(scan0, zero_coeff0), vec_or(scan1, zero_coeff1)); + + do { + int16x8_t coeff2, abs_coeff2, qcoeff2, dqcoeff2, eob2, scan2; + bool16x8_t zero_coeff2, mask2; + coeff0 = vec_vsx_ld(off0, coeff_ptr); + coeff1 = vec_vsx_ld(off1, coeff_ptr); + coeff2 = vec_vsx_ld(off2, coeff_ptr); + scan0 = vec_vsx_ld(off0, iscan); + scan1 = vec_vsx_ld(off1, iscan); + scan2 = vec_vsx_ld(off2, iscan); + + abs_coeff0 = vec_abs(coeff0); + abs_coeff1 = vec_abs(coeff1); + abs_coeff2 = vec_abs(coeff2); + + qcoeff0 = vec_madds(vec_vaddshs(abs_coeff0, round), quant, vec_zeros_s16); + qcoeff1 = vec_madds(vec_vaddshs(abs_coeff1, round), quant, vec_zeros_s16); + qcoeff2 = vec_madds(vec_vaddshs(abs_coeff2, round), quant, vec_zeros_s16); + + mask0 = vec_cmpge(abs_coeff0, thres); + mask1 = vec_cmpge(abs_coeff1, thres); + mask2 = vec_cmpge(abs_coeff2, thres); + + qcoeff0 = vec_and(qcoeff0, mask0); + qcoeff1 = vec_and(qcoeff1, mask1); + qcoeff2 = vec_and(qcoeff2, mask2); + + zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); + zero_coeff1 = vec_cmpeq(qcoeff1, vec_zeros_s16); + zero_coeff2 = vec_cmpeq(qcoeff2, vec_zeros_s16); + + qcoeff0 = vec_sign(qcoeff0, coeff0); + qcoeff1 = vec_sign(qcoeff1, coeff1); + qcoeff2 = vec_sign(qcoeff2, coeff2); + + vec_vsx_st(qcoeff0, off0, qcoeff_ptr); + vec_vsx_st(qcoeff1, off1, qcoeff_ptr); + vec_vsx_st(qcoeff2, off2, qcoeff_ptr); + + 
dqcoeff0 = dequantize_coeff_32(qcoeff0, dequant); + dqcoeff1 = dequantize_coeff_32(qcoeff1, dequant); + dqcoeff2 = dequantize_coeff_32(qcoeff2, dequant); + + vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr); + vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr); + vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr); + + eob = vec_max(eob, vec_or(scan0, zero_coeff0)); + eob2 = vec_max(vec_or(scan1, zero_coeff1), vec_or(scan2, zero_coeff2)); + eob = vec_max(eob, eob2); + + off0 += 48; + off1 += 48; + off2 += 48; + num_itr--; + } while (num_itr != 0); + + eob = vec_max_across(eob); + *eob_ptr = eob[0] + 1; +} diff --git a/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/libvpx/vp9/encoder/vp9_alt_ref_aq.h index e508cb44a..22a657e03 100644 --- a/libvpx/vp9/encoder/vp9_alt_ref_aq.h +++ b/libvpx/vp9/encoder/vp9_alt_ref_aq.h @@ -15,8 +15,8 @@ * for altref frames. Go to alt_ref_aq_private.h for implmentation details. */ -#ifndef VP9_ENCODER_VP9_ALT_REF_AQ_H_ -#define VP9_ENCODER_VP9_ALT_REF_AQ_H_ +#ifndef VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ +#define VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ #include "vpx/vpx_integer.h" @@ -124,4 +124,4 @@ void vp9_alt_ref_aq_destroy(struct ALT_REF_AQ *const self); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ALT_REF_AQ_H_ +#endif // VPX_VP9_ENCODER_VP9_ALT_REF_AQ_H_ diff --git a/libvpx/vp9/encoder/vp9_aq_360.h b/libvpx/vp9/encoder/vp9_aq_360.h index b1b56561d..749d3c198 100644 --- a/libvpx/vp9/encoder/vp9_aq_360.h +++ b/libvpx/vp9/encoder/vp9_aq_360.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_AQ_360_H_ -#define VP9_ENCODER_VP9_AQ_360_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_360_H_ +#define VPX_VP9_ENCODER_VP9_AQ_360_H_ #include "vp9/encoder/vp9_encoder.h" @@ -24,4 +24,4 @@ void vp9_360aq_frame_setup(VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_360_H_ diff --git a/libvpx/vp9/encoder/vp9_aq_complexity.h b/libvpx/vp9/encoder/vp9_aq_complexity.h index a00d34e70..d3cb34c01 100644 --- a/libvpx/vp9/encoder/vp9_aq_complexity.h +++ b/libvpx/vp9/encoder/vp9_aq_complexity.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ -#define VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#define VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ #ifdef __cplusplus extern "C" { @@ -33,4 +33,4 @@ void vp9_setup_in_frame_q_adj(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_COMPLEXITY_H_ diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 2f2f0055a..a2a742493 100644 --- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -21,6 +21,14 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_segmentation.h" +static const uint8_t VP9_VAR_OFFS[64] = { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 +}; + CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { size_t last_coded_q_map_size; CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr)); @@ -39,13 +47,16 @@ CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) { } 
assert(MAXQ <= 255); memset(cr->last_coded_q_map, MAXQ, last_coded_q_map_size); + cr->counter_encode_maxq_scene_change = 0; return cr; } void vp9_cyclic_refresh_free(CYCLIC_REFRESH *cr) { - vpx_free(cr->map); - vpx_free(cr->last_coded_q_map); - vpx_free(cr); + if (cr != NULL) { + vpx_free(cr->map); + vpx_free(cr->last_coded_q_map); + vpx_free(cr); + } } // Check if this coding block, of size bsize, should be considered for refresh @@ -318,6 +329,28 @@ void vp9_cyclic_refresh_set_golden_update(VP9_COMP *const cpi) { rc->baseline_gf_interval = 10; } +static int is_superblock_flat_static(VP9_COMP *const cpi, int sb_row_index, + int sb_col_index) { + unsigned int source_variance; + const uint8_t *src_y = cpi->Source->y_buffer; + const int ystride = cpi->Source->y_stride; + unsigned int sse; + const BLOCK_SIZE bsize = BLOCK_64X64; + src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6); + source_variance = + cpi->fn_ptr[bsize].vf(src_y, ystride, VP9_VAR_OFFS, 0, &sse); + if (source_variance == 0) { + uint64_t block_sad; + const uint8_t *last_src_y = cpi->Last_Source->y_buffer; + const int last_ystride = cpi->Last_Source->y_stride; + last_src_y += (sb_row_index << 6) * ystride + (sb_col_index << 6); + block_sad = + cpi->fn_ptr[bsize].sdf(src_y, ystride, last_src_y, last_ystride); + if (block_sad == 0) return 1; + } + return 0; +} + // Update the segmentation map, and related quantities: cyclic refresh map, // refresh sb_index, and target number of blocks to be refreshed. 
// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to @@ -368,8 +401,17 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { int sb_col_index = i - sb_row_index * sb_cols; int mi_row = sb_row_index * MI_BLOCK_SIZE; int mi_col = sb_col_index * MI_BLOCK_SIZE; + int flat_static_blocks = 0; + int compute_content = 1; assert(mi_row >= 0 && mi_row < cm->mi_rows); assert(mi_col >= 0 && mi_col < cm->mi_cols); +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) compute_content = 0; +#endif + if (cpi->Last_Source == NULL || + cpi->Last_Source->y_width != cpi->Source->y_width || + cpi->Last_Source->y_height != cpi->Source->y_height) + compute_content = 0; bl_index = mi_row * cm->mi_cols + mi_col; // Loop through all 8x8 blocks in superblock and update map. xmis = @@ -400,11 +442,21 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { // Enforce constant segment over superblock. // If segment is at least half of superblock, set to 1. if (sum_map >= xmis * ymis / 2) { - for (y = 0; y < ymis; y++) - for (x = 0; x < xmis; x++) { - seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1; - } - cr->target_num_seg_blocks += xmis * ymis; + // This superblock is a candidate for refresh: + // compute spatial variance and exclude blocks that are spatially flat + // and stationary. Note: this is currently only done for screne content + // mode. + if (compute_content && cr->skip_flat_static_blocks) + flat_static_blocks = + is_superblock_flat_static(cpi, sb_row_index, sb_col_index); + if (!flat_static_blocks) { + // Label this superblock as segment 1. 
+ for (y = 0; y < ymis; y++) + for (x = 0; x < xmis; x++) { + seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1; + } + cr->target_num_seg_blocks += xmis * ymis; + } } i++; if (i == sbs_in_frame) { @@ -413,7 +465,8 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index); cr->sb_index = i; cr->reduce_refresh = 0; - if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1; + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) + if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1; } // Set cyclic refresh parameters. @@ -426,8 +479,13 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { double weight_segment_target = 0; double weight_segment = 0; int thresh_low_motion = (cm->width < 720) ? 55 : 20; + int qp_thresh = VPXMIN(20, rc->best_quality << 1); cr->apply_cyclic_refresh = 1; - if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 || + if (frame_is_intra_only(cm) || cpi->svc.temporal_layer_id > 0 || + is_lossless_requested(&cpi->oxcf) || + rc->avg_frame_qindex[INTER_FRAME] < qp_thresh || + (cpi->use_svc && + cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && rc->frames_since_key > 40)) { cr->apply_cyclic_refresh = 0; @@ -454,6 +512,22 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_boost_fac = 13; } } + // For screen-content: keep rate_ratio_qdelta to 2.0 (segment#1 boost) and + // percent_refresh (refresh rate) to 10. But reduce rate boost for segment#2 + // (rate_boost_fac = 10 disables segment#2). + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) { + // Only enable feature of skipping flat_static blocks for top layer + // under screen content mode. + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cr->skip_flat_static_blocks = 1; + cr->percent_refresh = (cr->skip_flat_static_blocks) ? 
5 : 10; + // Increase the amount of refresh on scene change that is encoded at max Q, + // increase for a few cycles of the refresh period (~100 / percent_refresh). + if (cr->counter_encode_maxq_scene_change < 30) + cr->percent_refresh = (cr->skip_flat_static_blocks) ? 10 : 15; + cr->rate_ratio_qdelta = 2.0; + cr->rate_boost_fac = 10; + } // Adjust some parameters for low resolutions. if (cm->width <= 352 && cm->height <= 288) { if (rc->avg_frame_bandwidth < 3000) { @@ -464,10 +538,6 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5); } } - if (cpi->svc.spatial_layer_id > 0) { - cr->motion_thresh = 4; - cr->rate_boost_fac = 12; - } if (cpi->oxcf.rc_mode == VPX_VBR) { // To be adjusted for VBR mode, e.g., based on gf period and boost. // For now use smaller qp-delta (than CBR), no second boosted seg, and @@ -492,6 +562,13 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { num8x8bl; if (weight_segment_target < 7 * weight_segment / 8) weight_segment = weight_segment_target; + // For screen-content: don't include target for the weight segment, + // since for all flat areas the segment is reset, so its more accurate + // to just use the previous actual number of seg blocks for the weight. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + weight_segment = + (double)(cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) / + num8x8bl; cr->weight_segment = weight_segment; } @@ -501,23 +578,31 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; struct segmentation *const seg = &cm->seg; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); if (cm->current_video_frame == 0) cr->low_content_avg = 0.0; - if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation)) { + // Reset if resoluton change has occurred. 
+ if (cpi->resize_pending != 0) vp9_cyclic_refresh_reset_resize(cpi); + if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation) || + scene_change_detected) { // Set segmentation map to 0 and disable. unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); vp9_disable_segmentation(&cm->seg); - if (cm->frame_type == KEY_FRAME) { + if (cm->frame_type == KEY_FRAME || scene_change_detected) { memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); cr->sb_index = 0; cr->reduce_refresh = 0; + cr->counter_encode_maxq_scene_change = 0; } return; } else { int qindex_delta = 0; int qindex2; const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth); + cr->counter_encode_maxq_scene_change++; vpx_clear_system_state(); // Set rate threshold to some multiple (set to 2 for now) of the target // rate (target is given by sb64_target_rate and scaled by 256). @@ -567,9 +652,6 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { cr->qindex_delta[2] = qindex_delta; vp9_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta); - // Reset if resoluton change has occurred. - if (cpi->resize_pending != 0) vp9_cyclic_refresh_reset_resize(cpi); - // Update the segmentation and refresh map. 
cyclic_refresh_update_map(cpi); } @@ -583,8 +665,19 @@ void vp9_cyclic_refresh_reset_resize(VP9_COMP *const cpi) { const VP9_COMMON *const cm = &cpi->common; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; memset(cr->map, 0, cm->mi_rows * cm->mi_cols); - memset(cr->last_coded_q_map, MAXQ, cm->mi_rows * cm->mi_cols); + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); cr->sb_index = 0; cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; + cr->counter_encode_maxq_scene_change = 0; +} + +void vp9_cyclic_refresh_limit_q(const VP9_COMP *cpi, int *q) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; + // For now apply hard limit to frame-level decrease in q, if the cyclic + // refresh is active (percent_refresh > 0). + if (cr->percent_refresh > 0 && cpi->rc.q_1_frame - *q > 8) { + *q = cpi->rc.q_1_frame - 8; + } } diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h index 77fa67c9e..a4a9f1c98 100644 --- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h +++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ -#define VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#define VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" @@ -68,6 +68,8 @@ struct CYCLIC_REFRESH { int reduce_refresh; double weight_segment; int apply_cyclic_refresh; + int counter_encode_maxq_scene_change; + int skip_flat_static_blocks; }; struct VP9_COMP; @@ -139,8 +141,10 @@ static INLINE int cyclic_refresh_segment_id(int segment_id) { return CR_SEGMENT_ID_BASE; } +void vp9_cyclic_refresh_limit_q(const struct VP9_COMP *cpi, int *q); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_CYCLICREFRESH_H_ diff --git a/libvpx/vp9/encoder/vp9_aq_variance.c b/libvpx/vp9/encoder/vp9_aq_variance.c index 477f62ba5..9cd8819c3 100644 --- a/libvpx/vp9/encoder/vp9_aq_variance.c +++ b/libvpx/vp9/encoder/vp9_aq_variance.c @@ -19,6 +19,7 @@ #include "vp9/encoder/vp9_ratectrl.h" #include "vp9/encoder/vp9_rd.h" +#include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_segmentation.h" #define ENERGY_MIN (-4) @@ -192,6 +193,40 @@ double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { return log(var + 1.0); } +// Get the range of sub block energy values; +void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *min_e, + int *max_e) { + VP9_COMMON *const cm = &cpi->common; + const int bw = num_8x8_blocks_wide_lookup[bsize]; + const int bh = num_8x8_blocks_high_lookup[bsize]; + const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); + const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); + int x, y; + + if (xmis < bw || ymis < bh) { + vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col); + *min_e = vp9_block_energy(cpi, mb, bsize); + *max_e = *min_e; + } else { + int energy; + *min_e = ENERGY_MAX; + *max_e = ENERGY_MIN; + + for (y = 0; y < ymis; 
++y) { + for (x = 0; x < xmis; ++x) { + vp9_setup_src_planes(mb, cpi->Source, mi_row + y, mi_col + x); + energy = vp9_block_energy(cpi, mb, BLOCK_8X8); + *min_e = VPXMIN(*min_e, energy); + *max_e = VPXMAX(*max_e, energy); + } + } + } + + // Re-instate source pointers back to what they should have been on entry. + vp9_setup_src_planes(mb, cpi->Source, mi_row, mi_col); +} + #define DEFAULT_E_MIDPOINT 10.0 int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) { double energy; diff --git a/libvpx/vp9/encoder/vp9_aq_variance.h b/libvpx/vp9/encoder/vp9_aq_variance.h index 211a69f39..a4f872879 100644 --- a/libvpx/vp9/encoder/vp9_aq_variance.h +++ b/libvpx/vp9/encoder/vp9_aq_variance.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_AQ_VARIANCE_H_ -#define VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#ifndef VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#define VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ #include "vp9/encoder/vp9_encoder.h" @@ -20,11 +20,15 @@ extern "C" { unsigned int vp9_vaq_segment_id(int energy); void vp9_vaq_frame_setup(VP9_COMP *cpi); +void vp9_get_sub_block_energy(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, + int mi_col, BLOCK_SIZE bsize, int *min_e, + int *max_e); int vp9_block_energy(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); + double vp9_log_block_var(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs); #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_AQ_VARIANCE_H_ +#endif // VPX_VP9_ENCODER_VP9_AQ_VARIANCE_H_ diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index d346cd57a..76b7b123d 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -86,7 +86,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm, BLOCK_SIZE bsize = xd->mi[0]->sb_type; const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; const vpx_prob *const tx_probs = - get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs); + get_tx_probs(max_tx_size, 
get_tx_size_context(xd), &cm->fc->tx_probs); vpx_write(w, tx_size != TX_4X4, tx_probs[0]); if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) { vpx_write(w, tx_size != TX_8X8, tx_probs[1]); @@ -217,7 +217,8 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd, } if (is_compound) { - vpx_write(w, mi->ref_frame[0] == GOLDEN_FRAME, + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + vpx_write(w, mi->ref_frame[!idx] == cm->comp_var_ref[1], vp9_get_pred_prob_comp_ref_p(cm, xd)); } else { const int bit0 = mi->ref_frame[0] != LAST_FRAME; @@ -459,7 +460,8 @@ static void write_modes_sb( write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, max_mv_magnitude, interp_filter_selected); break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize, max_mv_magnitude, interp_filter_selected); write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, @@ -469,7 +471,6 @@ static void write_modes_sb( write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, subsize, max_mv_magnitude, interp_filter_selected); break; - default: assert(0); } } @@ -618,9 +619,10 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, return; } - case ONE_LOOP_REDUCED: { + default: { int updates = 0; int noupdates_before_first = 0; + assert(cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED); for (i = 0; i < PLANE_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { for (k = 0; k < COEF_BANDS; ++k) { @@ -670,7 +672,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, } return; } - default: assert(0); } } @@ -909,10 +910,24 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) { (cpi->refresh_golden_frame << cpi->alt_fb_idx); } else { int arf_idx = cpi->alt_fb_idx; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = 
gf_group->arf_update_idx[gf_group->index]; + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + + if (cpi->multi_layer_arf) { + for (arf_idx = 0; arf_idx < REF_FRAMES; ++arf_idx) { + if (arf_idx != cpi->alt_fb_idx && arf_idx != cpi->lst_fb_idx && + arf_idx != cpi->gld_fb_idx) { + int idx; + for (idx = 0; idx < gf_group->stack_size; ++idx) + if (arf_idx == gf_group->arf_index_stack[idx]) break; + if (idx == gf_group->stack_size) break; + } + } } + cpi->twopass.gf_group.top_arf_idx = arf_idx; + + if (cpi->use_svc && cpi->svc.use_set_ref_frame_config && + cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) + return cpi->svc.update_buffer_slot[cpi->svc.spatial_layer_id]; return (cpi->refresh_last_frame << cpi->lst_fb_idx) | (cpi->refresh_golden_frame << cpi->gld_fb_idx) | (cpi->refresh_alt_ref_frame << arf_idx); @@ -1117,11 +1132,7 @@ static void write_frame_size_with_refs(VP9_COMP *cpi, ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || (cpi->svc.number_spatial_layers > 1 && - cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame) || - (is_two_pass_svc(cpi) && - cpi->svc.encode_empty_frame_state == ENCODING && - cpi->svc.layer_context[0].frames_from_key_frame < - cpi->svc.number_temporal_layers + 1))) { + cpi->svc.layer_context[cpi->svc.spatial_layer_id].is_key_frame))) { found = 0; } else if (cfg != NULL) { found = @@ -1153,8 +1164,10 @@ static void write_profile(BITSTREAM_PROFILE profile, case PROFILE_0: vpx_wb_write_literal(wb, 0, 2); break; case PROFILE_1: vpx_wb_write_literal(wb, 2, 2); break; case PROFILE_2: vpx_wb_write_literal(wb, 1, 2); break; - case PROFILE_3: vpx_wb_write_literal(wb, 6, 3); break; - default: assert(0); + default: + assert(profile == PROFILE_3); + vpx_wb_write_literal(wb, 6, 3); + break; } } @@ -1191,7 +1204,13 @@ static void write_uncompressed_header(VP9_COMP *cpi, write_profile(cm->profile, wb); - vpx_wb_write_bit(wb, 0); // show_existing_frame + // If to use show existing frame. 
+ vpx_wb_write_bit(wb, cm->show_existing_frame); + if (cm->show_existing_frame) { + vpx_wb_write_literal(wb, cpi->alt_fb_idx, 3); + return; + } + vpx_wb_write_bit(wb, cm->frame_type); vpx_wb_write_bit(wb, cm->show_frame); vpx_wb_write_bit(wb, cm->error_resilient_mode); @@ -1201,14 +1220,6 @@ static void write_uncompressed_header(VP9_COMP *cpi, write_bitdepth_colorspace_sampling(cm, wb); write_frame_size(cm, wb); } else { - // In spatial svc if it's not error_resilient_mode then we need to code all - // visible frames as invisible. But we need to keep the show_frame flag so - // that the publisher could know whether it is supposed to be visible. - // So we will code the show_frame flag as it is. Then code the intra_only - // bit here. This will make the bitstream incompatible. In the player we - // will change to show_frame flag to 0, then add an one byte frame with - // show_existing_frame flag which tells the decoder which frame we want to - // show. if (!cm->show_frame) vpx_wb_write_bit(wb, cm->intra_only); if (!cm->error_resilient_mode) @@ -1341,6 +1352,10 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { struct vpx_write_bit_buffer saved_wb; write_uncompressed_header(cpi, &wb); + + // Skip the rest coding process if use show existing frame. + if (cpi->common.show_existing_frame) return; + saved_wb = wb; vpx_wb_write_literal(&wb, 0, 16); // don't know in advance first part. size diff --git a/libvpx/vp9/encoder/vp9_bitstream.h b/libvpx/vp9/encoder/vp9_bitstream.h index 339c3fecb..208651dc2 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.h +++ b/libvpx/vp9/encoder/vp9_bitstream.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_BITSTREAM_H_ -#define VP9_ENCODER_VP9_BITSTREAM_H_ +#ifndef VPX_VP9_ENCODER_VP9_BITSTREAM_H_ +#define VPX_VP9_ENCODER_VP9_BITSTREAM_H_ #ifdef __cplusplus extern "C" { @@ -38,16 +38,12 @@ void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi); void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) { - return !cpi->multi_arf_allowed && cpi->refresh_golden_frame && - cpi->rc.is_src_frame_alt_ref && - (!cpi->use_svc || // Add spatial svc base layer case here - (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id == 0 && - cpi->svc.layer_context[0].gold_ref_idx >= 0 && - cpi->oxcf.ss_enable_auto_arf[0])); + return cpi->refresh_golden_frame && cpi->rc.is_src_frame_alt_ref && + !cpi->use_svc; } #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_BITSTREAM_H_ +#endif // VPX_VP9_ENCODER_VP9_BITSTREAM_H_ diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index 724205dd5..563fdbbde 100644 --- a/libvpx/vp9/encoder/vp9_block.h +++ b/libvpx/vp9/encoder/vp9_block.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_BLOCK_H_ -#define VP9_ENCODER_VP9_BLOCK_H_ +#ifndef VPX_VP9_ENCODER_VP9_BLOCK_H_ +#define VPX_VP9_ENCODER_VP9_BLOCK_H_ #include "vpx_util/vpx_thread.h" @@ -92,6 +92,7 @@ struct macroblock { int sadperbit4; int rddiv; int rdmult; + int cb_rdmult; int mb_energy; // These are set to their default values at the beginning, and then adjusted @@ -115,6 +116,12 @@ struct macroblock { int *nmvsadcost_hp[2]; int **mvsadcost; + // sharpness is used to disable skip mode and change rd_mult + int sharpness; + + // aq mode is used to adjust rd based on segment. 
+ int adjust_rdmult_by_segment; + // These define limits to motion vector components to prevent them // from extending outside the UMV borders MvLimits mv_limits; @@ -180,6 +187,8 @@ struct macroblock { int sb_pickmode_part; + int zero_temp_sad_source; + // For each superblock: saves the content value (e.g., low/high sad/sumdiff) // based on source sad, prior to encoding the frame. uint8_t content_state_sb; @@ -199,10 +208,15 @@ struct macroblock { void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest, int stride, int eob, int bd); #endif +#if CONFIG_ML_VAR_PARTITION + DECLARE_ALIGNED(16, uint8_t, est_pred[64 * 64]); +#endif // CONFIG_ML_VAR_PARTITION + + struct scale_factors *me_sf; }; #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_BLOCK_H_ +#endif // VPX_VP9_ENCODER_VP9_BLOCK_H_ diff --git a/libvpx/vp9/encoder/vp9_blockiness.c b/libvpx/vp9/encoder/vp9_blockiness.c index 9ab57b57c..da68a3c3c 100644 --- a/libvpx/vp9/encoder/vp9_blockiness.c +++ b/libvpx/vp9/encoder/vp9_blockiness.c @@ -11,6 +11,7 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/system_state.h" +#include "vp9/encoder/vp9_blockiness.h" static int horizontal_filter(const uint8_t *s) { return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6; diff --git a/libvpx/vp9/encoder/vp9_blockiness.h b/libvpx/vp9/encoder/vp9_blockiness.h new file mode 100644 index 000000000..e840cb251 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_blockiness.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VP9_ENCODER_VP9_BLOCKINESS_H_ +#define VPX_VP9_ENCODER_VP9_BLOCKINESS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +double vp9_get_blockiness(const uint8_t *img1, int img1_pitch, + const uint8_t *img2, int img2_pitch, int width, + int height); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_BLOCKINESS_H_ diff --git a/libvpx/vp9/encoder/vp9_context_tree.c b/libvpx/vp9/encoder/vp9_context_tree.c index 2f7e54433..b74b9027c 100644 --- a/libvpx/vp9/encoder/vp9_context_tree.c +++ b/libvpx/vp9/encoder/vp9_context_tree.c @@ -12,7 +12,10 @@ #include "vp9/encoder/vp9_encoder.h" static const BLOCK_SIZE square[] = { - BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, + BLOCK_8X8, + BLOCK_16X16, + BLOCK_32X32, + BLOCK_64X64, }; static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, @@ -136,17 +139,22 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { } void vp9_free_pc_tree(ThreadData *td) { - const int tree_nodes = 64 + 16 + 4 + 1; int i; - // Set up all 4x4 mode contexts - for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]); + if (td == NULL) return; - // Sets up all the leaf nodes in the tree. - for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]); + if (td->leaf_tree != NULL) { + // Set up all 4x4 mode contexts + for (i = 0; i < 64; ++i) free_mode_context(&td->leaf_tree[i]); + vpx_free(td->leaf_tree); + td->leaf_tree = NULL; + } - vpx_free(td->pc_tree); - td->pc_tree = NULL; - vpx_free(td->leaf_tree); - td->leaf_tree = NULL; + if (td->pc_tree != NULL) { + const int tree_nodes = 64 + 16 + 4 + 1; + // Sets up all the leaf nodes in the tree. 
+ for (i = 0; i < tree_nodes; ++i) free_tree_contexts(&td->pc_tree[i]); + vpx_free(td->pc_tree); + td->pc_tree = NULL; + } } diff --git a/libvpx/vp9/encoder/vp9_context_tree.h b/libvpx/vp9/encoder/vp9_context_tree.h index 73423c075..d2cdb1010 100644 --- a/libvpx/vp9/encoder/vp9_context_tree.h +++ b/libvpx/vp9/encoder/vp9_context_tree.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_CONTEXT_TREE_H_ -#define VP9_ENCODER_VP9_CONTEXT_TREE_H_ +#ifndef VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ +#define VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ #include "vp9/common/vp9_blockd.h" #include "vp9/encoder/vp9_block.h" @@ -56,6 +56,7 @@ typedef struct { // scope of refactoring. int rate; int64_t dist; + int64_t rdcost; #if CONFIG_VP9_TEMPORAL_DENOISING unsigned int newmv_sse; @@ -75,6 +76,8 @@ typedef struct { // Used for the machine learning-based early termination int32_t sum_y_eobs; + // Skip certain ref frames during RD search of rectangular partitions. + uint8_t skip_ref_frame_mask; } PICK_MODE_CONTEXT; typedef struct PC_TREE { @@ -97,4 +100,4 @@ void vp9_free_pc_tree(struct ThreadData *td); } // extern "C" #endif -#endif /* VP9_ENCODER_VP9_CONTEXT_TREE_H_ */ +#endif // VPX_VP9_ENCODER_VP9_CONTEXT_TREE_H_ diff --git a/libvpx/vp9/encoder/vp9_cost.h b/libvpx/vp9/encoder/vp9_cost.h index 70a1a2e0e..638d72a91 100644 --- a/libvpx/vp9/encoder/vp9_cost.h +++ b/libvpx/vp9/encoder/vp9_cost.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_COST_H_ -#define VP9_ENCODER_VP9_COST_H_ +#ifndef VPX_VP9_ENCODER_VP9_COST_H_ +#define VPX_VP9_ENCODER_VP9_COST_H_ #include "vpx_dsp/prob.h" #include "vpx/vpx_integer.h" @@ -55,4 +55,4 @@ void vp9_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_COST_H_ +#endif // VPX_VP9_ENCODER_VP9_COST_H_ diff --git a/libvpx/vp9/encoder/vp9_denoiser.c b/libvpx/vp9/encoder/vp9_denoiser.c index b08ccaa66..2820b71b4 100644 --- a/libvpx/vp9/encoder/vp9_denoiser.c +++ b/libvpx/vp9/encoder/vp9_denoiser.c @@ -189,7 +189,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, int motion_magnitude, int is_skin, int *zeromv_filter, int consec_zeromv, int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx, - int use_svc, int spatial_layer) { + int use_svc, int spatial_layer, int use_gf_temporal_ref) { const int sse_diff = (ctx->newmv_sse == UINT_MAX) ? 0 : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); @@ -220,7 +220,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation( // If the best reference frame uses inter-prediction and there is enough of a // difference in sum-squared-error, use it. if (frame != INTRA_FRAME && frame != ALTREF_FRAME && - (frame != GOLDEN_FRAME || num_spatial_layers == 1) && + (frame != GOLDEN_FRAME || num_spatial_layers == 1 || + use_gf_temporal_ref) && sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { mi->ref_frame[0] = ctx->best_reference_frame; mi->mode = ctx->best_sse_inter_mode; @@ -230,7 +231,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation( frame = ctx->best_zeromv_reference_frame; ctx->newmv_sse = ctx->zeromv_sse; // Bias to last reference. 
- if (num_spatial_layers > 1 || frame == ALTREF_FRAME || + if ((num_spatial_layers > 1 && !use_gf_temporal_ref) || + frame == ALTREF_FRAME || (frame != LAST_FRAME && ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || denoiser->denoising_level >= kDenHigh))) { @@ -261,6 +263,14 @@ static VP9_DENOISER_DECISION perform_motion_compensation( denoise_layer_idx = num_spatial_layers - spatial_layer - 1; } + // Force copy (no denoise, copy source in denoised buffer) if + // running_avg_y[frame] is NULL. + if (denoiser->running_avg_y[frame].buffer_alloc == NULL) { + // Restore everything to its original state + *mi = saved_mi; + return COPY_BLOCK; + } + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { // Restore everything to its original state *mi = saved_mi; @@ -326,7 +336,8 @@ static VP9_DENOISER_DECISION perform_motion_compensation( void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, - VP9_DENOISER_DECISION *denoiser_decision) { + VP9_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref) { int mv_col, mv_row; int motion_magnitude = 0; int zeromv_filter = 0; @@ -349,6 +360,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, int is_skin = 0; int increase_denoising = 0; int consec_zeromv = 0; + int last_is_reference = cpi->ref_frame_flags & VP9_LAST_FLAG; mv_col = ctx->best_sse_mv.as_mv.col; mv_row = ctx->best_sse_mv.as_mv.row; motion_magnitude = mv_row * mv_row + mv_col * mv_col; @@ -379,7 +391,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, // zero/small motion in skin detection is high, i.e, > 4). 
if (consec_zeromv < 4) { i = ymis; - j = xmis; + break; } } } @@ -392,12 +404,18 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, } if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1; - if (denoiser->denoising_level >= kDenLow && !ctx->sb_skip_denoising) + // Copy block if LAST_FRAME is not a reference. + // Last doesn't always exist when SVC layers are dynamically changed, e.g. top + // spatial layer doesn't have last reference when it's brought up for the + // first time on the fly. + if (last_is_reference && denoiser->denoising_level >= kDenLow && + !ctx->sb_skip_denoising) decision = perform_motion_compensation( &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, motion_magnitude, is_skin, &zeromv_filter, consec_zeromv, cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx, - cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id); + cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id, + use_gf_temporal_ref); if (decision == FILTER_BLOCK) { decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start, @@ -445,16 +463,16 @@ static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest, } void vp9_denoiser_update_frame_info( - VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, - int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, - int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, - int svc_base_is_key, int second_spatial_layer) { + VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc, + FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, + int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, + int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer) { const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0; // Copy source into denoised reference buffers on KEY_FRAME or // if the just encoded frame was resized. 
For SVC, copy source if the base // spatial layer was key frame. if (frame_type == KEY_FRAME || resized != 0 || denoiser->reset || - svc_base_is_key) { + svc_refresh_denoiser_buffers) { int i; // Start at 1 so as not to overwrite the INTRA_FRAME for (i = 1; i < denoiser->num_ref_frames; ++i) { @@ -465,32 +483,43 @@ void vp9_denoiser_update_frame_info( return; } - // If more than one refresh occurs, must copy frame buffer. - if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) { - if (refresh_alt_ref_frame) { - copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); - } - if (refresh_golden_frame) { - copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); - } - if (refresh_last_frame) { - copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) + copy_frame(&denoiser->running_avg_y[i + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } } else { - if (refresh_alt_ref_frame) { - swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); - } - if (refresh_golden_frame) { - swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); - } - if (refresh_last_frame) { - swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], - &denoiser->running_avg_y[INTRA_FRAME + shift]); + // If more than one refresh occurs, must copy frame buffer. 
+ if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > + 1) { + if (refresh_alt_ref_frame) { + copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + } else { + if (refresh_alt_ref_frame) { + swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_golden_frame) { + swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } + if (refresh_last_frame) { + swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); + } } } } @@ -539,26 +568,38 @@ static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm, } int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, - int svc_buf_shift, int refresh_alt, - int refresh_gld, int refresh_lst, int alt_fb_idx, - int gld_fb_idx, int lst_fb_idx) { + struct SVC *svc, int svc_buf_shift, + int refresh_alt, int refresh_gld, int refresh_lst, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx) { int fail = 0; - if (refresh_alt) { - // Increase the frame buffer index by 1 to map it to the buffer index in the - // denoiser. 
- fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, - alt_fb_idx + 1 + svc_buf_shift); - if (fail) return 1; - } - if (refresh_gld) { - fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, - gld_fb_idx + 1 + svc_buf_shift); - if (fail) return 1; - } - if (refresh_lst) { - fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, - lst_fb_idx + 1 + svc_buf_shift); - if (fail) return 1; + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (cm->frame_type == KEY_FRAME || + svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + i + 1 + svc_buf_shift); + } + } + } else { + if (refresh_alt) { + // Increase the frame buffer index by 1 to map it to the buffer index in + // the denoiser. + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + alt_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_gld) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + gld_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_lst) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + lst_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } } return 0; } @@ -651,6 +692,7 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, denoiser->denoising_level = kDenLow; denoiser->prev_denoising_level = kDenLow; denoiser->reset = 0; + denoiser->current_denoiser_frame = 0; return 0; } @@ -675,13 +717,29 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) { vpx_free_frame_buffer(&denoiser->last_source); } -void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) { +static void force_refresh_longterm_ref(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // If long term reference is used, force refresh of that slot, so + // denoiser buffer for long term reference stays in sync. 
+ if (svc->use_gf_temporal_ref_current_layer) { + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->refresh_alt_ref_frame = 1; + } +} + +void vp9_denoiser_set_noise_level(VP9_COMP *const cpi, int noise_level) { + VP9_DENOISER *const denoiser = &cpi->denoiser; denoiser->denoising_level = noise_level; if (denoiser->denoising_level > kDenLowLow && - denoiser->prev_denoising_level == kDenLowLow) + denoiser->prev_denoising_level == kDenLowLow) { denoiser->reset = 1; - else + force_refresh_longterm_ref(cpi); + } else { denoiser->reset = 0; + } denoiser->prev_denoising_level = denoiser->denoising_level; } @@ -713,6 +771,56 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold, return threshold; } +void vp9_denoiser_reset_on_first_frame(VP9_COMP *const cpi) { + if (vp9_denoise_svc_non_key(cpi) && + cpi->denoiser.current_denoiser_frame == 0) { + cpi->denoiser.reset = 1; + force_refresh_longterm_ref(cpi); + } +} + +void vp9_denoiser_update_ref_frame(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + + if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + cpi->denoiser.denoising_level > kDenLowLow) { + int svc_refresh_denoiser_buffers = 0; + int denoise_svc_second_layer = 0; + FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type; + cpi->denoiser.current_denoiser_frame++; + if (cpi->use_svc) { + const int svc_buf_shift = + svc->number_spatial_layers - svc->spatial_layer_id == 2 + ? 
cpi->denoiser.num_ref_frames + : 0; + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + svc_refresh_denoiser_buffers = + lc->is_key_frame || svc->spatial_layer_sync[svc->spatial_layer_id]; + denoise_svc_second_layer = + svc->number_spatial_layers - svc->spatial_layer_id == 2 ? 1 : 0; + // Check if we need to allocate extra buffers in the denoiser + // for refreshed frames. + if (vp9_denoiser_realloc_svc(cm, &cpi->denoiser, svc, svc_buf_shift, + cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, + cpi->refresh_last_frame, cpi->alt_fb_idx, + cpi->gld_fb_idx, cpi->lst_fb_idx)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to re-allocate denoiser for SVC"); + } + vp9_denoiser_update_frame_info( + &cpi->denoiser, *cpi->Source, svc, frame_type, + cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, + cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, + cpi->lst_fb_idx, cpi->resize_pending, svc_refresh_denoiser_buffers, + denoise_svc_second_layer); + } +} + #ifdef OUTPUT_YUV_DENOISED static void make_grayscale(YV12_BUFFER_CONFIG *yuv) { int r, c; diff --git a/libvpx/vp9/encoder/vp9_denoiser.h b/libvpx/vp9/encoder/vp9_denoiser.h index f4da24cbf..1973e9898 100644 --- a/libvpx/vp9/encoder/vp9_denoiser.h +++ b/libvpx/vp9/encoder/vp9_denoiser.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_DENOISER_H_ -#define VP9_ENCODER_DENOISER_H_ +#ifndef VPX_VP9_ENCODER_VP9_DENOISER_H_ +#define VPX_VP9_ENCODER_VP9_DENOISER_H_ #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_skin_detection.h" @@ -50,6 +50,7 @@ typedef struct vp9_denoiser { int reset; int num_ref_frames; int num_layers; + unsigned int current_denoiser_frame; VP9_DENOISER_LEVEL denoising_level; VP9_DENOISER_LEVEL prev_denoising_level; } VP9_DENOISER; @@ -70,14 +71,15 @@ struct VP9_COMP; struct SVC; void vp9_denoiser_update_frame_info( - VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, - int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, - int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, - int svc_base_is_key, int second_spatial_layer); + VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, struct SVC *svc, + FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, + int refresh_last_frame, int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, + int resized, int svc_refresh_denoiser_buffers, int second_spatial_layer); void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, - VP9_DENOISER_DECISION *denoiser_decision); + VP9_DENOISER_DECISION *denoiser_decision, + int use_gf_temporal_ref); void vp9_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx); @@ -86,9 +88,9 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, PICK_MODE_CONTEXT *ctx); int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, - int svc_buf_shift, int refresh_alt, - int refresh_gld, int refresh_lst, int alt_fb_idx, - int gld_fb_idx, int lst_fb_idx); + struct SVC *svc, int svc_buf_shift, + int refresh_alt, int refresh_gld, int refresh_lst, + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx); int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, int use_svc, int noise_sen, int width, int height, @@ -110,7 
+112,9 @@ static INLINE int total_adj_strong_thresh(BLOCK_SIZE bs, void vp9_denoiser_free(VP9_DENOISER *denoiser); -void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level); +void vp9_denoiser_set_noise_level(struct VP9_COMP *const cpi, int noise_level); + +void vp9_denoiser_reset_on_first_frame(struct VP9_COMP *const cpi); int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, int content_state, int temporal_layer_id); @@ -119,8 +123,10 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, int temporal_layer_id); +void vp9_denoiser_update_ref_frame(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_DENOISER_H_ +#endif // VPX_VP9_ENCODER_VP9_DENOISER_H_ diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index 682477df1..5adefac1a 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -42,6 +42,8 @@ #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" +#include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_partition_models.h" #include "vp9/encoder/vp9_pickmode.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_rdopt.h" @@ -52,33 +54,6 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, int output_enabled, int mi_row, int mi_col, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx); -// Machine learning-based early termination parameters. 
-static const double train_mean[24] = { - 303501.697372, 3042630.372158, 24.694696, 1.392182, - 689.413511, 162.027012, 1.478213, 0.0, - 135382.260230, 912738.513263, 28.845217, 1.515230, - 544.158492, 131.807995, 1.436863, 0.0, - 43682.377587, 208131.711766, 28.084737, 1.356677, - 138.254122, 119.522553, 1.252322, 0.0 -}; - -static const double train_stdm[24] = { - 673689.212982, 5996652.516628, 0.024449, 1.989792, - 985.880847, 0.014638, 2.001898, 0.0, - 208798.775332, 1812548.443284, 0.018693, 1.838009, - 396.986910, 0.015657, 1.332541, 0.0, - 55888.847031, 448587.962714, 0.017900, 1.904776, - 98.652832, 0.016598, 1.320992, 0.0 -}; - -// Error tolerance: 0.01%-0.0.05%-0.1% -static const double classifiers[24] = { - 0.111736, 0.289977, 0.042219, 0.204765, 0.120410, -0.143863, - 0.282376, 0.847811, 0.637161, 0.131570, 0.018636, 0.202134, - 0.112797, 0.028162, 0.182450, 1.124367, 0.386133, 0.083700, - 0.050028, 0.150873, 0.061119, 0.109318, 0.127255, 0.625211 -}; - // This is used as a reference when computing the source variance for the // purpose of activity masking. // Eventually this should be replaced by custom no-reference routines, @@ -205,6 +180,64 @@ static BLOCK_SIZE get_rd_var_based_fixed_partition(VP9_COMP *cpi, MACROBLOCK *x, return BLOCK_8X8; } +static void set_segment_index(VP9_COMP *cpi, MACROBLOCK *const x, int mi_row, + int mi_col, BLOCK_SIZE bsize, int segment_index) { + VP9_COMMON *const cm = &cpi->common; + const struct segmentation *const seg = &cm->seg; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + + const AQ_MODE aq_mode = cpi->oxcf.aq_mode; + const uint8_t *const map = + seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + + // Initialize the segmentation index as 0. + mi->segment_id = 0; + + // Skip the rest if AQ mode is disabled. 
+ if (!seg->enabled) return; + + switch (aq_mode) { + case CYCLIC_REFRESH_AQ: + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; + case VARIANCE_AQ: + if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || + cpi->force_update_segmentation || + (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { + int min_energy; + int max_energy; + // Get sub block energy range + if (bsize >= BLOCK_32X32) { + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + } else { + min_energy = bsize <= BLOCK_16X16 ? x->mb_energy + : vp9_block_energy(cpi, x, bsize); + } + mi->segment_id = vp9_vaq_segment_id(min_energy); + } else { + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + } + break; + case LOOKAHEAD_AQ: + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; + case EQUATOR360_AQ: + if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation) + mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows); + else + mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); + break; + case PSNR_AQ: mi->segment_id = segment_index; break; + default: + // NO_AQ or PSNR_AQ + break; + } + + vp9_init_plane_quantizers(cpi, x); +} + // Lighter version of set_offsets that only sets the mode info // pointers. static INLINE void set_mode_info_offsets(VP9_COMMON *const cm, @@ -222,18 +255,14 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, BLOCK_SIZE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - MODE_INFO *mi; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const struct segmentation *const seg = &cm->seg; MvLimits *const mv_limits = &x->mv_limits; set_skip_context(xd, mi_row, mi_col); set_mode_info_offsets(cm, x, xd, mi_row, mi_col); - mi = xd->mi[0]; - // Set up destination pointers. 
vp9_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col); @@ -256,22 +285,6 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile, x->rddiv = cpi->rd.RDDIV; x->rdmult = cpi->rd.RDMULT; - // Setup segment ID. - if (seg->enabled) { - if (cpi->oxcf.aq_mode != VARIANCE_AQ && cpi->oxcf.aq_mode != LOOKAHEAD_AQ && - cpi->oxcf.aq_mode != EQUATOR360_AQ) { - const uint8_t *const map = - seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - vp9_init_plane_quantizers(cpi, x); - - x->encode_breakout = cpi->segment_encode_breakout[mi->segment_id]; - } else { - mi->segment_id = 0; - x->encode_breakout = cpi->encode_breakout; - } - // required by vp9_append_sub8x8_mvs_for_idx() and vp9_find_best_ref_mvs() xd->tile = *tile; } @@ -385,16 +398,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { node->split[i] = &vt->split[i].part_variances.none; break; } - case BLOCK_4X4: { + default: { v4x4 *vt = (v4x4 *)data; + assert(bsize == BLOCK_4X4); node->part_variances = &vt->part_variances; for (i = 0; i < 4; i++) node->split[i] = &vt->split[i]; break; } - default: { - assert(0); - break; - } } } @@ -408,7 +418,8 @@ static void fill_variance(uint32_t s2, int32_t s, int c, var *v) { static void get_variance(var *v) { v->variance = (int)(256 * (v->sum_square_error - - ((v->sum_error * v->sum_error) >> v->log2_count)) >> + (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> + v->log2_count)) >> v->log2_count); } @@ -450,7 +461,7 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, // No check for vert/horiz split as too few samples for variance. if (bsize == bsize_min) { // Variance already computed to set the force_split. 
- if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none); + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); if (mi_col + block_width / 2 < cm->mi_cols && mi_row + block_height / 2 < cm->mi_rows && vt.part_variances->none.variance < threshold) { @@ -460,9 +471,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, return 0; } else if (bsize > bsize_min) { // Variance already computed to set the force_split. - if (cm->frame_type == KEY_FRAME) get_variance(&vt.part_variances->none); + if (frame_is_intra_only(cm)) get_variance(&vt.part_variances->none); // For key frame: take split for bsize above 32X32 or very high variance. - if (cm->frame_type == KEY_FRAME && + if (frame_is_intra_only(cm) && (bsize > BLOCK_32X32 || vt.part_variances->none.variance > (threshold << 4))) { return 0; @@ -534,7 +545,7 @@ static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, int content_state) { VP9_COMMON *const cm = &cpi->common; - const int is_key_frame = (cm->frame_type == KEY_FRAME); + const int is_key_frame = frame_is_intra_only(cm); const int threshold_multiplier = is_key_frame ? 
20 : 1; int64_t threshold_base = (int64_t)(threshold_multiplier * cpi->y_dequant[q][1]); @@ -586,6 +597,7 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, } else { thresholds[1] = (5 * threshold_base) >> 1; } + if (cpi->sf.disable_16x16part_nonkey) thresholds[2] = INT64_MAX; } } @@ -593,7 +605,7 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q, int content_state) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; - const int is_key_frame = (cm->frame_type == KEY_FRAME); + const int is_key_frame = frame_is_intra_only(cm); if (sf->partition_search_type != VAR_BASED_PARTITION && sf->partition_search_type != REFERENCE_PARTITION) { return; @@ -620,6 +632,11 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q, cpi->vbp_threshold_copy = (cpi->y_dequant[q][1] << 3) > 8000 ? (cpi->y_dequant[q][1] << 3) : 8000; + if (cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe)) { + cpi->vbp_threshold_sad = 0; + cpi->vbp_threshold_copy = 0; + } } cpi->vbp_threshold_minmax = 15 + (q >> 3); } @@ -885,13 +902,13 @@ static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x, set_block_size(cpi, x, xd, mi_row, mi_col, subsize); set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize); break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col); copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col); copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs); copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -951,7 +968,9 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, PARTITION_TYPE partition_high; if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return 0; - if (mi_row >= (cm->mi_rows >> 1) || mi_col >= (cm->mi_cols >> 1)) return 0; + if (mi_row >= 
svc->mi_rows[svc->spatial_layer_id - 1] || + mi_col >= svc->mi_cols[svc->spatial_layer_id - 1]) + return 0; // Find corresponding (mi_col/mi_row) block down-scaled by 2x2. start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col; @@ -1004,7 +1023,8 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high, subsize_high); break; - case PARTITION_SPLIT: + default: + assert(partition_high == PARTITION_SPLIT); if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col, mi_row_high, mi_col_high)) return 1; @@ -1020,7 +1040,6 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, mi_col_high + bs_high)) return 1; break; - default: assert(0); } } @@ -1067,13 +1086,13 @@ static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, prev_part[start_pos] = subsize; if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); update_partition_svc(cpi, subsize, mi_row, mi_col); update_partition_svc(cpi, subsize, mi_row + bs, mi_col); update_partition_svc(cpi, subsize, mi_row, mi_col + bs); update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -1108,13 +1127,13 @@ static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, prev_part[start_pos] = subsize; if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); update_prev_partition_helper(cpi, subsize, mi_row, mi_col); update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col); update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs); update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -1206,6 +1225,7 @@ static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int 
shift, cpi->content_state_sb_fd[sb_offset] = 0; } } + if (tmp_sad == 0) x->zero_temp_sad_source = 1; return tmp_sad; } @@ -1241,21 +1261,38 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, int pixels_wide = 64, pixels_high = 64; int64_t thresholds[4] = { cpi->vbp_thresholds[0], cpi->vbp_thresholds[1], cpi->vbp_thresholds[2], cpi->vbp_thresholds[3] }; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); // For the variance computation under SVC mode, we treat the frame as key if // the reference (base layer frame) is key frame (i.e., is_key_frame == 1). - const int is_key_frame = - (cm->frame_type == KEY_FRAME || + int is_key_frame = + (frame_is_intra_only(cm) || (is_one_pass_cbr_svc(cpi) && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); // Always use 4x4 partition for key frame. - const int use_4x4_partition = cm->frame_type == KEY_FRAME; + const int use_4x4_partition = frame_is_intra_only(cm); const int low_res = (cm->width <= 352 && cm->height <= 288); int variance4x4downsample[16]; int segment_id; int sb_offset = (cm->mi_stride >> 3) * (mi_row >> 3) + (mi_col >> 3); + // For SVC: check if LAST frame is NULL or if the resolution of LAST is + // different than the current frame resolution, and if so, treat this frame + // as a key frame, for the purpose of the superblock partitioning. + // LAST == NULL can happen in some cases where enhancement spatial layers are + // enabled dyanmically in the stream and the only reference is the spatial + // reference (GOLDEN). 
+ if (cpi->use_svc) { + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, LAST_FRAME); + if (ref == NULL || ref->y_crop_height != cm->height || + ref->y_crop_width != cm->width) + is_key_frame = 1; + } + set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0); segment_id = xd->mi[0]->segment_id; if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame)) @@ -1289,6 +1326,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } // If source_sad is low copy the partition without computing the y_sad. if (x->skip_low_source_sad && cpi->sf.copy_partition_flag && + !scene_change_detected && copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { x->sb_use_mv_part = 1; if (cpi->sf.svc_use_lowres_part && @@ -1317,7 +1355,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks, // 5-20 for the 16x16 blocks. - force_split[0] = 0; + force_split[0] = scene_change_detected; if (!is_key_frame) { // In the case of spatial/temporal scalable coding, the assumption here is @@ -1333,7 +1371,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, assert(yv12 != NULL); - if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id)) { + if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + cpi->svc.use_gf_temporal_ref_current_layer) { // For now, GOLDEN will not be used for non-zero spatial layers, since // it may not be a temporal reference. 
yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); @@ -1374,10 +1413,26 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); } else { - y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); x->sb_use_mv_part = 1; x->sb_mvcol_part = mi->mv[0].as_mv.col; x->sb_mvrow_part = mi->mv[0].as_mv.row; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->svc.spatial_layer_id == 0 && + cpi->svc.high_num_blocks_with_motion && !x->zero_temp_sad_source && + cm->width > 640 && cm->height > 480) { + // Disable split below 16x16 block size when scroll motion is detected. + // TODO(marpan/jianj): Improve this condition: issue is that search + // range is hard-coded/limited in vp9_int_pro_motion_estimation() so + // scroll motion may not be detected here. + if ((abs(x->sb_mvrow_part) >= 48 && abs(x->sb_mvcol_part) <= 8) || + y_sad < 100000) { + compute_minmax_variance = 0; + thresholds[2] = INT64_MAX; + } + } } y_sad_last = y_sad; @@ -1513,9 +1568,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } } - if (is_key_frame || (low_res && - vt.split[i].split[j].part_variances.none.variance > - threshold_4x4avg)) { + if (is_key_frame || + (low_res && vt.split[i].split[j].part_variances.none.variance > + threshold_4x4avg)) { force_split[split_index] = 0; // Go down to 4x4 down-sampling for variance. 
variance4x4downsample[i2 + j] = 1; @@ -1648,11 +1703,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } - if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) { + if (!frame_is_intra_only(cm) && cpi->sf.copy_partition_flag) { update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); } - if (cm->frame_type != KEY_FRAME && cpi->sf.svc_use_lowres_part && + if (!frame_is_intra_only(cm) && cpi->sf.svc_use_lowres_part && cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); @@ -1836,14 +1891,33 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, vp9_rd_cost_init(rd_cost); } -static int set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, - int8_t segment_id) { +static void set_segment_rdmult(VP9_COMP *const cpi, MACROBLOCK *const x, + int mi_row, int mi_col, BLOCK_SIZE bsize, + AQ_MODE aq_mode) { int segment_qindex; VP9_COMMON *const cm = &cpi->common; + const uint8_t *const map = + cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; + vp9_init_plane_quantizers(cpi, x); vpx_clear_system_state(); - segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex); - return vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); + segment_qindex = + vp9_get_qindex(&cm->seg, x->e_mbd.mi[0]->segment_id, cm->base_qindex); + + if (aq_mode == NO_AQ || aq_mode == PSNR_AQ) { + if (cpi->sf.enable_tpl_model) x->rdmult = x->cb_rdmult; + return; + } + + if (aq_mode == CYCLIC_REFRESH_AQ) { + // If segment is boosted, use rdmult for that segment. 
+ if (cyclic_refresh_segment_id_boosted( + get_segment_id(cm, map, bsize, mi_row, mi_col))) + x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); + return; + } + + x->rdmult = vp9_compute_rd_mult(cpi, segment_qindex + cm->y_dc_delta_q); } static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, @@ -1914,44 +1988,8 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, x->block_qcoeff_opt = cpi->sf.allow_quant_coeff_opt; } - if (aq_mode == VARIANCE_AQ) { - const int energy = - bsize <= BLOCK_16X16 ? x->mb_energy : vp9_block_energy(cpi, x, bsize); - - if (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame || - cpi->force_update_segmentation || - (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { - mi->segment_id = vp9_vaq_segment_id(energy); - } else { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == LOOKAHEAD_AQ) { - const uint8_t *const map = cpi->segmentation_map; - - // I do not change rdmult here consciously. - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } else if (aq_mode == EQUATOR360_AQ) { - if (cm->frame_type == KEY_FRAME || cpi->force_update_segmentation) { - mi->segment_id = vp9_360aq_segment_id(mi_row, cm->mi_rows); - } else { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col); - } - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == COMPLEXITY_AQ) { - x->rdmult = set_segment_rdmult(cpi, x, mi->segment_id); - } else if (aq_mode == CYCLIC_REFRESH_AQ) { - const uint8_t *const map = - cm->seg.update_map ? cpi->segmentation_map : cm->last_frame_seg_map; - // If segment is boosted, use rdmult for that segment. 
- if (cyclic_refresh_segment_id_boosted( - get_segment_id(cm, map, bsize, mi_row, mi_col))) - x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); - } + set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + set_segment_rdmult(cpi, x, mi_row, mi_col, bsize, aq_mode); // Find best coding mode & reconstruct the MB so it is available // as a predictor for MBs that follow in the SB @@ -1979,11 +2017,14 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate); } - x->rdmult = orig_rdmult; - // TODO(jingning) The rate-distortion optimization flow needs to be // refactored to provide proper exit/return handle. - if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX; + if (rd_cost->rate == INT_MAX) + rd_cost->rdcost = INT64_MAX; + else + rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist); + + x->rdmult = orig_rdmult; ctx->rate = rd_cost->rate; ctx->dist = rd_cost->dist; @@ -2013,8 +2054,10 @@ static void update_stats(VP9_COMMON *cm, ThreadData *td) { [has_second_ref(mi)]++; if (has_second_ref(mi)) { - counts->comp_ref[vp9_get_pred_context_comp_ref_p(cm, xd)] - [ref0 == GOLDEN_FRAME]++; + const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd); + const int bit = mi->ref_frame[!idx] == cm->comp_var_ref[1]; + counts->comp_ref[ctx][bit]++; } else { counts->single_ref[vp9_get_pred_context_single_ref_p1(xd)][0] [ref0 != LAST_FRAME]++; @@ -2110,6 +2153,10 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile, ThreadData *td, PICK_MODE_CONTEXT *ctx) { MACROBLOCK *const x = &td->mb; set_offsets(cpi, tile, x, mi_row, mi_col, bsize); + + if (cpi->sf.enable_tpl_model && cpi->oxcf.aq_mode == NO_AQ) + x->rdmult = x->cb_rdmult; + update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled); encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx); @@ -2168,7 +2215,8 @@ 
static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, pc_tree->leaf_split[0]); @@ -2183,7 +2231,6 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, subsize, pc_tree->split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -2441,7 +2488,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, } x->skip = ctx->skip; - x->skip_txfm[0] = mi->segment_id ? 0 : ctx->skip_txfm[0]; + x->skip_txfm[0] = (mi->segment_id || xd->lossless) ? 0 : ctx->skip_txfm[0]; } static void encode_b_rt(VP9_COMP *cpi, ThreadData *td, @@ -2509,7 +2556,8 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, pc_tree->split[0]); @@ -2520,7 +2568,6 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, subsize, pc_tree->split[3]); break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -2617,6 +2664,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, ctx, INT64_MAX); break; case PARTITION_HORZ: + pc_tree->horizontal[0].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, subsize, &pc_tree->horizontal[0], INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && @@ -2626,6 +2674,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, vp9_rd_cost_init(&tmp_rdc); update_state(cpi, 
td, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + pc_tree->horizontal[1].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col, &tmp_rdc, subsize, &pc_tree->horizontal[1], INT64_MAX); if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) { @@ -2638,6 +2687,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, } break; case PARTITION_VERT: + pc_tree->vertical[0].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, subsize, &pc_tree->vertical[0], INT64_MAX); if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && @@ -2647,6 +2697,7 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, vp9_rd_cost_init(&tmp_rdc); update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc, subsize, &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX); @@ -2659,7 +2710,8 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.rdcost += tmp_rdc.rdcost; } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, subsize, pc_tree->leaf_split[0], INT64_MAX); @@ -2689,7 +2741,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.dist += tmp_rdc.dist; } break; - default: assert(0); break; } pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -3018,14 +3069,59 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv, } #endif -// Calculate the score used in machine-learning based partition search early -// termination. 
-static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, - PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, - BLOCK_SIZE bsize) { - const double *clf; - const double *mean; - const double *sd; +// Calculate prediction based on the given input features and neural net config. +// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden +// layer. +static void nn_predict(const float *features, const NN_CONFIG *nn_config, + float *output) { + int num_input_nodes = nn_config->num_inputs; + int buf_index = 0; + float buf[2][NN_MAX_NODES_PER_LAYER]; + const float *input_nodes = features; + + // Propagate hidden layers. + const int num_layers = nn_config->num_hidden_layers; + int layer, node, i; + assert(num_layers <= NN_MAX_HIDDEN_LAYERS); + for (layer = 0; layer < num_layers; ++layer) { + const float *weights = nn_config->weights[layer]; + const float *bias = nn_config->bias[layer]; + float *output_nodes = buf[buf_index]; + const int num_output_nodes = nn_config->num_hidden_nodes[layer]; + assert(num_output_nodes < NN_MAX_NODES_PER_LAYER); + for (node = 0; node < num_output_nodes; ++node) { + float val = 0.0f; + for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i]; + val += bias[node]; + // ReLU as activation function. + val = VPXMAX(val, 0.0f); + output_nodes[node] = val; + weights += num_input_nodes; + } + num_input_nodes = num_output_nodes; + input_nodes = output_nodes; + buf_index = 1 - buf_index; + } + + // Final output layer. + { + const float *weights = nn_config->weights[num_layers]; + for (node = 0; node < nn_config->num_outputs; ++node) { + const float *bias = nn_config->bias[num_layers]; + float val = 0.0f; + for (i = 0; i < num_input_nodes; ++i) val += weights[i] * input_nodes[i]; + output[node] = val + bias[node]; + weights += num_input_nodes; + } + } +} + +#define FEATURES 7 +// Machine-learning based partition search early termination. +// Return 1 to skip split and rect partitions. 
+static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, + PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col, + BLOCK_SIZE bsize) { const int mag_mv = abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row); const int left_in_image = !!xd->left_mi; @@ -3035,11 +3131,32 @@ static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, int above_par = 0; // above_partitioning int left_par = 0; // left_partitioning int last_par = 0; // last_partitioning - BLOCK_SIZE context_size; - double score; int offset = 0; + int i; + BLOCK_SIZE context_size; + const NN_CONFIG *nn_config = NULL; + const float *mean, *sd, *linear_weights; + float nn_score, linear_score; + float features[FEATURES]; assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]); + vpx_clear_system_state(); + + switch (bsize) { + case BLOCK_64X64: + offset = 0; + nn_config = &vp9_partition_nnconfig_64x64; + break; + case BLOCK_32X32: + offset = 8; + nn_config = &vp9_partition_nnconfig_32x32; + break; + case BLOCK_16X16: + offset = 16; + nn_config = &vp9_partition_nnconfig_16x16; + break; + default: assert(0 && "Unexpected block size."); return 0; + } if (above_in_image) { context_size = xd->above_mi->sb_type; @@ -3065,25 +3182,458 @@ static double compute_score(VP9_COMMON *const cm, MACROBLOCKD *const xd, last_par = 1; } - if (bsize == BLOCK_64X64) - offset = 0; - else if (bsize == BLOCK_32X32) - offset = 8; - else if (bsize == BLOCK_16X16) - offset = 16; - - // early termination score calculation - clf = &classifiers[offset]; - mean = &train_mean[offset]; - sd = &train_stdm[offset]; - score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) + - clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) + - clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) + - clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) * sd[3]) + - clf[4] * (((double)ctx->sum_y_eobs - mean[4]) / sd[4]) + - clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) + - clf[6] * (((double)last_par - 
mean[6]) * sd[6]) + clf[7]; - return score; + mean = &vp9_partition_feature_mean[offset]; + sd = &vp9_partition_feature_std[offset]; + features[0] = ((float)ctx->rate - mean[0]) / sd[0]; + features[1] = ((float)ctx->dist - mean[1]) / sd[1]; + features[2] = ((float)mag_mv / 2 - mean[2]) * sd[2]; + features[3] = ((float)(left_par + above_par) / 2 - mean[3]) * sd[3]; + features[4] = ((float)ctx->sum_y_eobs - mean[4]) / sd[4]; + features[5] = ((float)cm->base_qindex - mean[5]) * sd[5]; + features[6] = ((float)last_par - mean[6]) * sd[6]; + + // Predict using linear model. + linear_weights = &vp9_partition_linear_weights[offset]; + linear_score = linear_weights[FEATURES]; + for (i = 0; i < FEATURES; ++i) + linear_score += linear_weights[i] * features[i]; + if (linear_score > 0.1f) return 0; + + // Predict using neural net model. + nn_predict(features, nn_config, &nn_score); + + if (linear_score < -0.0f && nn_score < 0.1f) return 1; + if (nn_score < -0.0f && linear_score < 0.1f) return 1; + return 0; +} +#undef FEATURES + +#define FEATURES 4 +// ML-based partition search breakout. +static int ml_predict_breakout(VP9_COMP *const cpi, BLOCK_SIZE bsize, + const MACROBLOCK *const x, + const RD_COST *const rd_cost) { + DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 }; + const VP9_COMMON *const cm = &cpi->common; + float features[FEATURES]; + const float *linear_weights = NULL; // Linear model weights. + float linear_score = 0.0f; + const int qindex = cm->base_qindex; + const int q_ctx = qindex >= 200 ? 0 : (qindex >= 150 ? 1 : 2); + const int is_720p_or_larger = VPXMIN(cm->width, cm->height) >= 720; + const int resolution_ctx = is_720p_or_larger ? 
1 : 0; + + switch (bsize) { + case BLOCK_64X64: + linear_weights = vp9_partition_breakout_weights_64[resolution_ctx][q_ctx]; + break; + case BLOCK_32X32: + linear_weights = vp9_partition_breakout_weights_32[resolution_ctx][q_ctx]; + break; + case BLOCK_16X16: + linear_weights = vp9_partition_breakout_weights_16[resolution_ctx][q_ctx]; + break; + case BLOCK_8X8: + linear_weights = vp9_partition_breakout_weights_8[resolution_ctx][q_ctx]; + break; + default: assert(0 && "Unexpected block size."); return 0; + } + if (!linear_weights) return 0; + + { // Generate feature values. +#if CONFIG_VP9_HIGHBITDEPTH + const int ac_q = + vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8); +#else + const int ac_q = vp9_ac_quant(qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + const int num_pels_log2 = num_pels_log2_lookup[bsize]; + int feature_index = 0; + unsigned int var, sse; + float rate_f, dist_f; + +#if CONFIG_VP9_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + var = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, x->e_mbd.bd); + } else { + var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); + } +#else + var = cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + vp9_64_zeros, 0, &sse); +#endif + var = var >> num_pels_log2; + + vpx_clear_system_state(); + + rate_f = (float)VPXMIN(rd_cost->rate, INT_MAX); + dist_f = (float)(VPXMIN(rd_cost->dist, INT_MAX) >> num_pels_log2); + rate_f = + ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) * + rate_f; + + features[feature_index++] = rate_f; + features[feature_index++] = dist_f; + features[feature_index++] = (float)var; + features[feature_index++] = (float)ac_q; + assert(feature_index == FEATURES); + } + + { // Calculate the output score. 
+ int i; + linear_score = linear_weights[FEATURES]; + for (i = 0; i < FEATURES; ++i) + linear_score += linear_weights[i] * features[i]; + } + + return linear_score >= cpi->sf.ml_partition_search_breakout_thresh[q_ctx]; +} +#undef FEATURES + +#define FEATURES 17 +#define LABELS 4 +static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x, + BLOCK_SIZE bsize, + const PC_TREE *const pc_tree, + int *allow_horz, int *allow_vert, + int64_t ref_rd, int mi_row, int mi_col) { + const NN_CONFIG *nn_config = NULL; + float score[LABELS] = { + 0.0f, + }; + int thresh = -1; + int i; + + if (ref_rd <= 0 || ref_rd > 1000000000) return; + + switch (bsize) { + case BLOCK_8X8: break; + case BLOCK_16X16: + nn_config = &vp9_rect_part_nnconfig_16; + thresh = cpi->sf.ml_prune_rect_partition_threhold[1]; + break; + case BLOCK_32X32: + nn_config = &vp9_rect_part_nnconfig_32; + thresh = cpi->sf.ml_prune_rect_partition_threhold[2]; + break; + case BLOCK_64X64: + nn_config = &vp9_rect_part_nnconfig_64; + thresh = cpi->sf.ml_prune_rect_partition_threhold[3]; + break; + default: assert(0 && "Unexpected block size."); return; + } + if (!nn_config || thresh < 0) return; + + // Feature extraction and model score calculation. + { + const int64_t none_rdcost = pc_tree->none.rdcost; + const VP9_COMMON *const cm = &cpi->common; +#if CONFIG_VP9_HIGHBITDEPTH + const int dc_q = + vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (x->e_mbd.bd - 8); +#else + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + int feature_index = 0; + unsigned int block_var = 0; + unsigned int sub_block_var[4] = { 0 }; + float features[FEATURES]; + + features[feature_index++] = + (float)(pc_tree->partitioning == PARTITION_NONE); + features[feature_index++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); + + // Calculate source pixel variance. 
+ { + struct buf_2d buf; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const MACROBLOCKD *const xd = &x->e_mbd; + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + + (void)xd; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + block_var = vp9_high_get_sby_perpixel_variance(cpi, &x->plane[0].src, + bsize, xd->bd); + } else { + block_var = vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + } +#else + block_var = vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); +#endif // CONFIG_VP9_HIGHBITDEPTH + + buf.stride = x->plane[0].src.stride; + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + buf.buf = x->plane[0].src.buf + x_idx + y_idx * buf.stride; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + sub_block_var[i] = + vp9_high_get_sby_perpixel_variance(cpi, &buf, subsize, xd->bd); + } else { + sub_block_var[i] = vp9_get_sby_perpixel_variance(cpi, &buf, subsize); + } +#else + sub_block_var[i] = vp9_get_sby_perpixel_variance(cpi, &buf, subsize); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + + features[feature_index++] = logf((float)block_var + 1.0f); + features[feature_index++] = logf((float)ref_rd + 1.0f); + features[feature_index++] = (none_rdcost > 0 && none_rdcost < 1000000000) + ? (float)pc_tree->none.skippable + : 0.0f; + + for (i = 0; i < 4; ++i) { + const int64_t this_rd = pc_tree->split[i]->none.rdcost; + const int rd_valid = this_rd > 0 && this_rd < 1000000000; + // Ratio between sub-block RD and whole block RD. + features[feature_index++] = + rd_valid ? ((float)this_rd / (float)ref_rd) : 1.0f; + // Sub-block skippable. + features[feature_index++] = + rd_valid ? 
((float)pc_tree->split[i]->none.skippable) : 0.0f; + } + + { + const float denom = (float)(block_var + 1); + const float low_b = 0.1f; + const float high_b = 10.0f; + for (i = 0; i < 4; ++i) { + // Ratio between the quarter sub-block variance and the + // whole-block variance. + float var_ratio = (float)(sub_block_var[i] + 1) / denom; + if (var_ratio < low_b) var_ratio = low_b; + if (var_ratio > high_b) var_ratio = high_b; + features[feature_index++] = var_ratio; + } + } + assert(feature_index == FEATURES); + nn_predict(features, nn_config, score); + } + + // Make decisions based on the model score. + { + int max_score = -1000; + int horz = 0, vert = 0; + int int_score[LABELS]; + for (i = 0; i < LABELS; ++i) { + int_score[i] = (int)(100 * score[i]); + max_score = VPXMAX(int_score[i], max_score); + } + thresh = max_score - thresh; + for (i = 0; i < LABELS; ++i) { + if (int_score[i] >= thresh) { + if ((i >> 0) & 1) horz = 1; + if ((i >> 1) & 1) vert = 1; + } + } + *allow_horz = *allow_horz && horz; + *allow_vert = *allow_vert && vert; + } +} +#undef FEATURES +#undef LABELS + +// Use a neural net model to prune partition-none and partition-split search. +// The model uses prediction residue variance and quantization step size as +// input features. +#define FEATURES 6 +static void ml_predict_var_rd_paritioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int *none, int *split) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + const NN_CONFIG *nn_config = NULL; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]); + uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? 
(CONVERT_TO_BYTEPTR(pred_buffer)) + : pred_buffer; +#else + DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64]); + uint8_t *const pred_buf = pred_buffer; +#endif // CONFIG_VP9_HIGHBITDEPTH + const int speed = cpi->oxcf.speed; + int i; + float thresh = 0.0f; + + switch (bsize) { + case BLOCK_64X64: + nn_config = &vp9_var_rd_part_nnconfig_64; + thresh = speed > 0 ? 3.5f : 3.0f; + break; + case BLOCK_32X32: + nn_config = &vp9_var_rd_part_nnconfig_32; + thresh = speed > 0 ? 3.5f : 3.0f; + break; + case BLOCK_16X16: + nn_config = &vp9_var_rd_part_nnconfig_16; + thresh = speed > 0 ? 3.5f : 4.0f; + break; + case BLOCK_8X8: + nn_config = &vp9_var_rd_part_nnconfig_8; + if (cm->width >= 720 && cm->height >= 720) + thresh = speed > 0 ? 2.5f : 2.0f; + else + thresh = speed > 0 ? 3.5f : 2.0f; + break; + default: assert(0 && "Unexpected block size."); return; + } + + if (!nn_config) return; + + mi->ref_frame[1] = NONE; + mi->sb_type = bsize; + // Do a simple single motion search to find a prediction for current block. + // The variance of the residue will be used as input features. + { + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? 
ALTREF_FRAME : LAST_FRAME; + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref); + MV ref_mv = { 0, 0 }; + MV ref_mv_full = { 0, 0 }; + const int step_param = 1; + const MvLimits tmp_mv_limits = x->mv_limits; + const SEARCH_METHODS search_method = NSTEP; + const int sadpb = x->sadperbit16; + MV best_mv = { 0, 0 }; + int cost_list[5]; + + assert(yv12 != NULL); + if (!yv12) return; + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ref - 1].sf); + mi->ref_frame[0] = ref; + vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &ref_mv, &best_mv, 0, 0); + best_mv.row *= 8; + best_mv.col *= 8; + x->mv_limits = tmp_mv_limits; + mi->mv[0].as_mv = best_mv; + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = pred_buf; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + } + + vpx_clear_system_state(); + + { + float features[FEATURES] = { 0.0f }; +#if CONFIG_VP9_HIGHBITDEPTH + const int dc_q = + vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth) >> (xd->bd - 8); +#else + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); +#endif // CONFIG_VP9_HIGHBITDEPTH + int feature_idx = 0; + float score; + + // Generate model input features. + features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + // Get the variance of the residue as input features. + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const uint8_t *pred = pred_buf; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + // Variance of whole block. 
+ const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + assert(feature_idx == FEATURES); + + // Feed the features into the model to get the confidence score. + nn_predict(features, nn_config, &score); + + // Higher score means that the model has higher confidence that the split + // partition is better than the non-split partition. So if the score is + // high enough, we skip the none-split partition search; if the score is + // low enough, we skip the split partition search. 
+ if (score > thresh) *none = 0; + if (score < -thresh) *split = 0; + } +} +#undef FEATURES +#undef LABELS + +static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col, int orig_rdmult) { + const int gf_group_index = cpi->twopass.gf_group.index; + TplDepFrame *tpl_frame = &cpi->tpl_stats[gf_group_index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + int tpl_stride = tpl_frame->stride; + int64_t intra_cost = 0; + int64_t mc_dep_cost = 0; + int mi_wide = num_8x8_blocks_wide_lookup[bsize]; + int mi_high = num_8x8_blocks_high_lookup[bsize]; + int row, col; + + int dr = 0; + int count = 0; + double r0, rk, beta; + + if (tpl_frame->is_valid == 0) return orig_rdmult; + + if (cpi->twopass.gf_group.layer_depth[gf_group_index] > 1) return orig_rdmult; + + if (gf_group_index >= MAX_ARF_GOP_SIZE) return orig_rdmult; + + for (row = mi_row; row < mi_row + mi_high; ++row) { + for (col = mi_col; col < mi_col + mi_wide; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + + if (row >= cpi->common.mi_rows || col >= cpi->common.mi_cols) continue; + + intra_cost += this_stats->intra_cost; + mc_dep_cost += this_stats->mc_dep_cost; + + ++count; + } + } + + vpx_clear_system_state(); + + r0 = cpi->rd.r0; + rk = (double)intra_cost / mc_dep_cost; + beta = r0 / rk; + dr = vp9_get_adaptive_rdmult(cpi, beta); + + dr = VPXMIN(dr, orig_rdmult * 3 / 2); + dr = VPXMAX(dr, orig_rdmult * 1 / 2); + + dr = VPXMAX(1, dr); + + return dr; } // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are @@ -3102,7 +3652,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; TOKENEXTRA *tp_orig = *tp; - PICK_MODE_CONTEXT *ctx = &pc_tree->none; + PICK_MODE_CONTEXT *const ctx = &pc_tree->none; int i; const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); BLOCK_SIZE subsize; @@ -3133,15 +3683,22 @@ static void 
rd_pick_partition(VP9_COMP *cpi, ThreadData *td, int64_t dist_breakout_thr = cpi->sf.partition_search_breakout_thr.dist; int rate_breakout_thr = cpi->sf.partition_search_breakout_thr.rate; + int must_split = 0; + int partition_mul = cpi->sf.enable_tpl_model && cpi->oxcf.aq_mode == NO_AQ + ? x->cb_rdmult + : cpi->rd.RDMULT; + // Ref frames picked in the [i_th] quarter subblock during square partition + // RD search. It may be used to prune ref frame selection of rect partitions. + uint8_t ref_frames_used[4] = { 0, 0, 0, 0 }; (void)*tp_orig; assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); - // Adjust dist breakout threshold according to the partition size. dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + rate_breakout_thr *= num_pels_log2_lookup[bsize]; vp9_rd_cost_init(&this_rdc); @@ -3165,10 +3722,18 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); } + // Get sub block energy range + if (bsize >= BLOCK_16X16) { + int min_energy, max_energy; + vp9_get_sub_block_energy(cpi, x, mi_row, mi_col, bsize, &min_energy, + &max_energy); + must_split = (min_energy < -3) && (max_energy - min_energy > 2); + } + // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. 
if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= max_size && bsize >= min_size); + partition_none_allowed &= (bsize <= max_size); partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) || force_horz_split); partition_vert_allowed &= @@ -3177,7 +3742,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } if (cpi->sf.use_square_partition_only && - bsize > cpi->sf.use_square_only_threshold) { + (bsize > cpi->sf.use_square_only_thresh_high || + bsize < cpi->sf.use_square_only_thresh_low)) { if (cpi->use_svc) { if (!vp9_active_h_edge(cpi, mi_row, mi_step) || x->e_mbd.lossless) partition_horz_allowed &= force_horz_split; @@ -3250,15 +3816,37 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } #endif + pc_tree->partitioning = PARTITION_NONE; + + if (cpi->sf.ml_var_partition_pruning) { + const int do_ml_var_partition_pruning = + !frame_is_intra_only(cm) && partition_none_allowed && do_split && + mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && + mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; + if (do_ml_var_partition_pruning) { + ml_predict_var_rd_paritioning(cpi, x, bsize, mi_row, mi_col, + &partition_none_allowed, &do_split); + } + } + // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx, best_rdc.rdcost); + ctx->rdcost = this_rdc.rdcost; if (this_rdc.rate != INT_MAX) { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = ctx->mic.ref_frame[0]; + const int ref2 = ctx->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } if (bsize >= BLOCK_8X8) { + this_rdc.rdcost += RDCOST(partition_mul, x->rddiv, + cpi->partition_cost[pl][PARTITION_NONE], 0); this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; - this_rdc.rdcost = - RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); } if (this_rdc.rdcost < 
best_rdc.rdcost) { @@ -3267,31 +3855,41 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, best_rdc = this_rdc; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - if (!cpi->sf.ml_partition_search_early_termination) { - // If all y, u, v transform blocks in this partition are skippable, - // and the dist & rate are within the thresholds, the partition search - // is terminated for current branch of the partition search tree. - if (!x->e_mbd.lossless && ctx->skippable && - ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { - do_split = 0; - do_rect = 0; - } - } else { + if (cpi->sf.ml_partition_search_early_termination) { // Currently, the machine-learning based partition search early // termination is only used while bsize is 16x16, 32x32 or 64x64, // VPXMIN(cm->width, cm->height) >= 480, and speed = 0. if (!x->e_mbd.lossless && !segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP) && ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) { - if (compute_score(cm, xd, ctx, mi_row, mi_col, bsize) < 0.0) { + if (ml_pruning_partition(cm, xd, ctx, mi_row, mi_col, bsize)) { do_split = 0; do_rect = 0; } } } + if ((do_split || do_rect) && !x->e_mbd.lossless && ctx->skippable) { + const int use_ml_based_breakout = + cpi->sf.use_ml_partition_search_breakout && + cm->base_qindex >= 100; + if (use_ml_based_breakout) { + if (ml_predict_breakout(cpi, bsize, x, &this_rdc)) { + do_split = 0; + do_rect = 0; + } + } else { + if (!cpi->sf.ml_partition_search_early_termination) { + if ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr)) { + do_split = 0; + do_rect = 0; + } + } + } + } + #if CONFIG_FP_MB_STATS // Check if every 16x16 first pass block statistics has zero // motion and the corresponding first pass residue is small enough. 
@@ -3341,10 +3939,13 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + } else { + vp9_zero(ctx->pred_mv); + ctx->mic.interp_filter = EIGHTTAP; } // store estimated motion vector - if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx); + store_pred_mv(x, ctx); // If the interp_filter is marked as SWITCHABLE_FILTERS, it was for an // intra block and used for context purposes. @@ -3357,35 +3958,65 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_SPLIT // TODO(jingning): use the motion vectors given by the above search as // the starting point of motion search in the following partition type check. - if (do_split) { + pc_tree->split[0]->none.rdcost = 0; + pc_tree->split[1]->none.rdcost = 0; + pc_tree->split[2]->none.rdcost = 0; + pc_tree->split[3]->none.rdcost = 0; + if (do_split || must_split) { subsize = get_subsize(bsize, PARTITION_SPLIT); + load_pred_mv(x, ctx); if (bsize == BLOCK_8X8) { i = 4; if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed) pc_tree->leaf_split[0]->pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, pc_tree->leaf_split[0], best_rdc.rdcost); - - if (sum_rdc.rate == INT_MAX) sum_rdc.rdcost = INT64_MAX; + if (sum_rdc.rate == INT_MAX) { + sum_rdc.rdcost = INT64_MAX; + } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + const int ref1 = pc_tree->leaf_split[0]->mic.ref_frame[0]; + const int ref2 = pc_tree->leaf_split[0]->mic.ref_frame[1]; + for (i = 0; i < 4; ++i) { + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } + } + } } else { - for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) { + for (i = 0; (i < 4) && ((sum_rdc.rdcost < best_rdc.rdcost) || must_split); + ++i) { const int x_idx = (i & 1) * mi_step; const int y_idx = (i >> 1) * mi_step; if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= 
cm->mi_cols) continue; - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); - pc_tree->split[i]->index = i; + if (cpi->sf.prune_ref_frame_for_rect_partitions) + pc_tree->split[i]->none.rate = INT_MAX; rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx, subsize, &this_rdc, + // A must split test here increases the number of sub + // partitions but hurts metrics results quite a bit, + // so this extra test is commented out pending + // further tests on whether it adds much in terms of + // visual quality. + // (must_split) ? best_rdc.rdcost + // : best_rdc.rdcost - sum_rdc.rdcost, best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]); if (this_rdc.rate == INT_MAX) { sum_rdc.rdcost = INT64_MAX; break; } else { + if (cpi->sf.prune_ref_frame_for_rect_partitions && + pc_tree->split[i]->none.rate != INT_MAX) { + const int ref1 = pc_tree->split[i]->none.mic.ref_frame[0]; + const int ref2 = pc_tree->split[i]->none.mic.ref_frame[1]; + ref_frames_used[i] |= (1 << ref1); + if (ref2 > 0) ref_frames_used[i] |= (1 << ref2); + } sum_rdc.rate += this_rdc.rate; sum_rdc.dist += this_rdc.dist; sum_rdc.rdcost += this_rdc.rdcost; @@ -3393,51 +4024,87 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } } - if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) { + if (((sum_rdc.rdcost < best_rdc.rdcost) || must_split) && i == 4) { + sum_rdc.rdcost += RDCOST(partition_mul, x->rddiv, + cpi->partition_cost[pl][PARTITION_SPLIT], 0); sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); - if (sum_rdc.rdcost < best_rdc.rdcost) { + if ((sum_rdc.rdcost < best_rdc.rdcost) || + (must_split && (sum_rdc.dist < best_rdc.dist))) { best_rdc = sum_rdc; pc_tree->partitioning = PARTITION_SPLIT; // Rate and distortion based partition search termination clause. 
if (!cpi->sf.ml_partition_search_early_termination && - !x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) || - (best_rdc.dist < dist_breakout_thr && - best_rdc.rate < rate_breakout_thr))) { + !x->e_mbd.lossless && + ((best_rdc.dist < (dist_breakout_thr >> 2)) || + (best_rdc.dist < dist_breakout_thr && + best_rdc.rate < rate_breakout_thr))) { do_rect = 0; } } } else { // skip rectangular partition test when larger block size // gives better rd cost - if ((cpi->sf.less_rectangular_check) && - ((bsize > cpi->sf.use_square_only_threshold) || - (best_rdc.dist < dist_breakout_thr))) + if (cpi->sf.less_rectangular_check && + (bsize > cpi->sf.use_square_only_thresh_high || + best_rdc.dist < dist_breakout_thr)) do_rect &= !partition_none_allowed; } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } + pc_tree->horizontal[0].skip_ref_frame_mask = 0; + pc_tree->horizontal[1].skip_ref_frame_mask = 0; + pc_tree->vertical[0].skip_ref_frame_mask = 0; + pc_tree->vertical[1].skip_ref_frame_mask = 0; + if (cpi->sf.prune_ref_frame_for_rect_partitions) { + uint8_t used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[1]; + if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[2] | ref_frames_used[3]; + if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[0] | ref_frames_used[2]; + if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames; + used_frames = ref_frames_used[1] | ref_frames_used[3]; + if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames; + } + + { + const int do_ml_rect_partition_pruning = + !frame_is_intra_only(cm) && !force_horz_split && !force_vert_split && + (partition_horz_allowed || partition_vert_allowed) && bsize > BLOCK_8X8; + if (do_ml_rect_partition_pruning) { + ml_prune_rect_partition(cpi, x, bsize, pc_tree, &partition_horz_allowed, + &partition_vert_allowed, best_rdc.rdcost, mi_row, + 
mi_col); + } + } + // PARTITION_HORZ if (partition_horz_allowed && (do_rect || vp9_active_h_edge(cpi, mi_row, mi_step))) { + const int part_mode_rate = cpi->partition_cost[pl][PARTITION_HORZ]; + const int64_t part_mode_rdcost = + RDCOST(partition_mul, x->rddiv, part_mode_rate, 0); subsize = get_subsize(bsize, PARTITION_HORZ); - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[0].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - &pc_tree->horizontal[0], best_rdc.rdcost); + &pc_tree->horizontal[0], + best_rdc.rdcost - part_mode_rdcost); + if (sum_rdc.rdcost < INT64_MAX) { + sum_rdc.rdcost += part_mode_rdcost; + sum_rdc.rate += part_mode_rate; + } if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) { PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter; @@ -3454,16 +4121,12 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); - if (sum_rdc.rdcost < best_rdc.rdcost) { - best_rdc = sum_rdc; - pc_tree->partitioning = PARTITION_HORZ; + best_rdc = sum_rdc; + pc_tree->partitioning = PARTITION_HORZ; - if ((cpi->sf.less_rectangular_check) && - (bsize > cpi->sf.use_square_only_threshold)) - do_rect = 0; - } + if (cpi->sf.less_rectangular_check && + bsize > cpi->sf.use_square_only_thresh_high) + do_rect = 0; } 
restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } @@ -3471,21 +4134,26 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_VERT if (partition_vert_allowed && (do_rect || vp9_active_v_edge(cpi, mi_col, mi_step))) { + const int part_mode_rate = cpi->partition_cost[pl][PARTITION_VERT]; + const int64_t part_mode_rdcost = + RDCOST(partition_mul, x->rddiv, part_mode_rate, 0); subsize = get_subsize(bsize, PARTITION_VERT); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[0].pred_interp_filter = pred_interp_filter; rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, - &pc_tree->vertical[0], best_rdc.rdcost); + &pc_tree->vertical[0], best_rdc.rdcost - part_mode_rdcost); + if (sum_rdc.rdcost < INT64_MAX) { + sum_rdc.rdcost += part_mode_rdcost; + sum_rdc.rate += part_mode_rate; + } + if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols && bsize > BLOCK_8X8) { update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0); encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, &pc_tree->vertical[0]); - - if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->vertical[1].pred_interp_filter = pred_interp_filter; @@ -3502,12 +4170,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } if (sum_rdc.rdcost < best_rdc.rdcost) { - sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT]; - sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist); - if (sum_rdc.rdcost < best_rdc.rdcost) { - best_rdc = sum_rdc; - pc_tree->partitioning = PARTITION_VERT; - } + best_rdc = sum_rdc; + pc_tree->partitioning = PARTITION_VERT; } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } @@ -3582,7 +4246,10 @@ static void 
encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, } } - vp9_zero(x->pred_mv); + for (i = 0; i < MAX_REF_FRAMES; ++i) { + x->pred_mv[i].row = INT16_MAX; + x->pred_mv[i].col = INT16_MAX; + } td->pc_root->index = 0; if (seg->enabled) { @@ -3613,12 +4280,21 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root); } else { + int orig_rdmult = cpi->rd.RDMULT; + x->cb_rdmult = orig_rdmult; + if (cpi->twopass.gf_group.index > 0 && cpi->sf.enable_tpl_model) { + int dr = + get_rdmult_delta(cpi, BLOCK_64X64, mi_row, mi_col, orig_rdmult); + x->cb_rdmult = dr; + } + // If required set upper and lower partition size limits if (sf->auto_min_max_partition_size) { set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col, &x->min_partition_size, &x->max_partition_size); } + td->pc_root->none.rdcost = 0; rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, INT64_MAX, td->pc_root); } @@ -3703,6 +4379,36 @@ static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x, vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); } +static void hybrid_search_svc_baseiskey(VP9_COMP *cpi, MACROBLOCK *const x, + RD_COST *rd_cost, BLOCK_SIZE bsize, + PICK_MODE_CONTEXT *ctx, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) { + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + } else { + if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF) + vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); + else if (bsize >= BLOCK_8X8) + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, + ctx); + else + vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx); + } +} + +static void hybrid_search_scene_change(VP9_COMP *cpi, MACROBLOCK *const x, + RD_COST *rd_cost, BLOCK_SIZE 
bsize, + PICK_MODE_CONTEXT *ctx, + TileDataEnc *tile_data, int mi_row, + int mi_col) { + if (!cpi->sf.nonrd_keyframe && bsize <= BLOCK_8X8) { + vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); + } else { + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx); + } +} + static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, MACROBLOCK *const x, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, @@ -3718,6 +4424,9 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, int plane; set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize); + + set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + mi = xd->mi[0]; mi->sb_type = bsize; @@ -3733,14 +4442,23 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, if (cyclic_refresh_segment_id_boosted(mi->segment_id)) x->rdmult = vp9_cyclic_refresh_get_rdmult(cpi->cyclic_refresh); - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) hybrid_intra_mode_search(cpi, x, rd_cost, bsize, ctx); + else if (cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) + hybrid_search_svc_baseiskey(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, + mi_col); else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize); - else if (bsize >= BLOCK_8X8) - vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, ctx); - else + else if (bsize >= BLOCK_8X8) { + if (cpi->rc.hybrid_intra_scene_change) + hybrid_search_scene_change(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row, + mi_col); + else + vp9_pick_inter_mode(cpi, x, tile_data, mi_row, mi_col, rd_cost, bsize, + ctx); + } else { vp9_pick_inter_mode_sub8x8(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx); + } duplicate_mode_info_in_sb(cm, xd, mi_row, mi_col, bsize); @@ -3830,6 +4548,78 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { } } +#if CONFIG_ML_VAR_PARTITION +#define 
FEATURES 6 +#define LABELS 2 +static int ml_predict_var_paritioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const NN_CONFIG *nn_config = NULL; + + switch (bsize) { + case BLOCK_64X64: nn_config = &vp9_var_part_nnconfig_64; break; + case BLOCK_32X32: nn_config = &vp9_var_part_nnconfig_32; break; + case BLOCK_16X16: nn_config = &vp9_var_part_nnconfig_16; break; + case BLOCK_8X8: break; + default: assert(0 && "Unexpected block size."); return -1; + } + + if (!nn_config) return -1; + + vpx_clear_system_state(); + + { + const float thresh = cpi->oxcf.speed <= 5 ? 1.25f : 0.0f; + float features[FEATURES] = { 0.0f }; + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); + int feature_idx = 0; + float score[LABELS]; + + features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const int sb_offset_row = 8 * (mi_row & 7); + const int sb_offset_col = 8 * (mi_col & 7); + const uint8_t *pred = x->est_pred + sb_offset_row * 64 + sb_offset_col; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + int i; + // Variance of whole block. + const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); + + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. 
+ const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + + assert(feature_idx == FEATURES); + nn_predict(features, nn_config, score); + if (score[0] > thresh) return PARTITION_SPLIT; + if (score[0] < -thresh) return PARTITION_NONE; + return -1; + } +} +#undef FEATURES +#undef LABELS +#endif // CONFIG_ML_VAR_PARTITION + static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, @@ -3859,6 +4649,11 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; int partition_vert_allowed = !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; +#if CONFIG_ML_VAR_PARTITION + const int use_ml_based_partitioning = + sf->partition_search_type == ML_BASED_PARTITION; +#endif // CONFIG_ML_VAR_PARTITION + (void)*tp_orig; // Avoid checking for rectangular partitions for speed >= 6. 
@@ -3889,6 +4684,20 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, partition_vert_allowed &= force_vert_split; } +#if CONFIG_ML_VAR_PARTITION + if (use_ml_based_partitioning) { + if (partition_none_allowed || do_split) do_rect = 0; + if (partition_none_allowed && do_split) { + const int ml_predicted_partition = + ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col); + if (ml_predicted_partition == PARTITION_NONE) do_split = 0; + if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0; + } + } +#endif // CONFIG_ML_VAR_PARTITION + + if (!partition_none_allowed && !do_split) do_rect = 1; + ctx->pred_pixel_ready = !(partition_vert_allowed || partition_horz_allowed || do_split); @@ -3902,26 +4711,28 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, ctx->skip = x->skip; if (this_rdc.rate != INT_MAX) { - int pl = partition_plane_context(xd, mi_row, mi_col, bsize); + const int pl = partition_plane_context(xd, mi_row, mi_col, bsize); this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE]; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); if (this_rdc.rdcost < best_rdc.rdcost) { - int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; - int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; - - dist_breakout_thr >>= - 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - - rate_breakout_thr *= num_pels_log2_lookup[bsize]; - best_rdc = this_rdc; if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE; - if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr && - this_rdc.dist < dist_breakout_thr) { - do_split = 0; - do_rect = 0; +#if CONFIG_ML_VAR_PARTITION + if (!use_ml_based_partitioning) +#endif // CONFIG_ML_VAR_PARTITION + { + int64_t dist_breakout_thr = sf->partition_search_breakout_thr.dist; + int64_t rate_breakout_thr = sf->partition_search_breakout_thr.rate; + dist_breakout_thr >>= + 8 - (b_width_log2_lookup[bsize] + 
b_height_log2_lookup[bsize]); + rate_breakout_thr *= num_pels_log2_lookup[bsize]; + if (!x->e_mbd.lossless && this_rdc.rate < rate_breakout_thr && + this_rdc.dist < dist_breakout_thr) { + do_split = 0; + do_rect = 0; + } } } } @@ -3969,7 +4780,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_HORZ if (partition_horz_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_HORZ); - if (sf->adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); pc_tree->horizontal[0].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->horizontal[0]); @@ -4013,7 +4824,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, // PARTITION_VERT if (partition_vert_allowed && do_rect) { subsize = get_subsize(bsize, PARTITION_VERT); - if (sf->adaptive_motion_search) load_pred_mv(x, ctx); + load_pred_mv(x, ctx); pc_tree->vertical[0].pred_pixel_ready = 1; nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize, &pc_tree->vertical[0]); @@ -4173,7 +4984,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, output_enabled, rd_cost, @@ -4203,7 +5015,6 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, rd_cost->dist += this_rdc.dist; } break; - default: assert(0 && "Invalid partition type."); break; } } @@ -4292,7 +5103,8 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, output_enabled, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); if (bsize == BLOCK_8X8) { nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, @@ -4313,13 +5125,117 @@ static void nonrd_use_partition(VP9_COMP *cpi, 
ThreadData *td, dummy_cost, pc_tree->split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) update_partition_context(xd, mi_row, mi_col, subsize, bsize); } +#if CONFIG_ML_VAR_PARTITION +// Get a prediction(stored in x->est_pred) for the whole 64x64 superblock. +static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, + MACROBLOCK *x, int mi_row, int mi_col) { + VP9_COMMON *const cm = &cpi->common; + const int is_key_frame = frame_is_intra_only(cm); + + set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); + + if (!is_key_frame) { + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + const YV12_BUFFER_CONFIG *yv12_g = NULL; + const BLOCK_SIZE bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + + (mi_row + 4 < cm->mi_rows); + int pixels_wide = 64, pixels_high = 64; + unsigned int y_sad_g, y_sad_thr; + unsigned int y_sad = UINT_MAX; + + assert(yv12 != NULL); + + if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); + if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); + + if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) || + cpi->svc.use_gf_temporal_ref_current_layer) { + // For now, GOLDEN will not be used for non-zero spatial layers, since + // it may not be a temporal reference. + yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + + // Only compute y_sad_g (sad for golden reference) for speed < 8. 
+ if (cpi->oxcf.speed < 8 && yv12_g && yv12_g != yv12 && + (cpi->ref_frame_flags & VP9_GOLD_FLAG)) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + y_sad_g = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, + xd->plane[0].pre[0].stride); + } else { + y_sad_g = UINT_MAX; + } + + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) { + yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ALTREF_FRAME - 1].sf); + mi->ref_frame[0] = ALTREF_FRAME; + y_sad_g = UINT_MAX; + } else { + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[LAST_FRAME - 1].sf); + mi->ref_frame[0] = LAST_FRAME; + } + mi->ref_frame[1] = NONE; + mi->sb_type = BLOCK_64X64; + mi->mv[0].as_int = 0; + mi->interp_filter = BILINEAR; + + { + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + } + + // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad + // are close if short_circuit_low_temp_var is on. + y_sad_thr = cpi->sf.short_circuit_low_temp_var ? 
(y_sad * 7) >> 3 : y_sad; + if (y_sad_g < y_sad_thr) { + vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, + &cm->frame_refs[GOLDEN_FRAME - 1].sf); + mi->ref_frame[0] = GOLDEN_FRAME; + mi->mv[0].as_int = 0; + y_sad = y_sad_g; + } else { + x->pred_mv[LAST_FRAME] = mi->mv[0].as_mv; + } + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = x->est_pred; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); + } else { +#if CONFIG_VP9_HIGHBITDEPTH + switch (xd->bd) { + case 8: memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); break; + case 10: + memset(x->est_pred, 128 * 4, 64 * 64 * sizeof(x->est_pred[0])); + break; + case 12: + memset(x->est_pred, 128 * 16, 64 * 64 * sizeof(x->est_pred[0])); + break; + } +#else + memset(x->est_pred, 128, 64 * 64 * sizeof(x->est_pred[0])); +#endif // CONFIG_VP9_HIGHBITDEPTH + } +} +#endif // CONFIG_ML_VAR_PARTITION + static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, TileDataEnc *tile_data, int mi_row, TOKENEXTRA **tp) { @@ -4350,6 +5266,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, PARTITION_SEARCH_TYPE partition_search_type = sf->partition_search_type; BLOCK_SIZE bsize = BLOCK_64X64; int seg_skip = 0; + int i; (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile); @@ -4359,7 +5276,10 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } x->source_variance = UINT_MAX; - vp9_zero(x->pred_mv); + for (i = 0; i < MAX_REF_FRAMES; ++i) { + x->pred_mv[i].row = INT16_MAX; + x->pred_mv[i].col = INT16_MAX; + } vp9_rd_cost_init(&dummy_rdc); x->color_sensitivity[0] = 0; x->color_sensitivity[1] = 0; @@ -4367,6 +5287,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, x->skip_low_source_sad = 0; x->lowvar_highsumdiff = 0; x->content_state_sb = 0; + x->zero_temp_sad_source = 0; x->sb_use_mv_part = 0; x->sb_mvcol_part = 0; x->sb_mvrow_part = 0; @@ -4406,6 +5327,17 @@ static 
void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; +#if CONFIG_ML_VAR_PARTITION + case ML_BASED_PARTITION: + get_estimated_pred(cpi, tile_info, x, mi_row, mi_col); + x->max_partition_size = BLOCK_64X64; + x->min_partition_size = BLOCK_8X8; + x->sb_pickmode_part = 1; + nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, + BLOCK_64X64, &dummy_rdc, 1, INT64_MAX, + td->pc_root); + break; +#endif // CONFIG_ML_VAR_PARTITION case SOURCE_VAR_BASED_PARTITION: set_source_var_based_partition(cpi, tile_info, x, mi, mi_row, mi_col); nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, @@ -4417,14 +5349,15 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; - case REFERENCE_PARTITION: + default: + assert(partition_search_type == REFERENCE_PARTITION); x->sb_pickmode_part = 1; set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); // Use nonrd_pick_partition on scene-cut for VBR mode. // nonrd_pick_partition does not support 4x4 partition, so avoid it // on key frame for now. if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && - cpi->oxcf.speed < 6 && cm->frame_type != KEY_FRAME && + cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { // Use lower max_partition_size for low resoultions. if (cm->width <= 352 && cm->height <= 288) @@ -4440,7 +5373,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, // TODO(marpan): Seems like nonrd_select_partition does not support // 4x4 partition. Since 4x4 is used on key frame, use this switch // for now. 
- if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); else @@ -4449,7 +5382,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } break; - default: assert(0); break; } // Update ref_frame usage for inter frame if this group is ARF group. @@ -4516,16 +5448,12 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { &var16->sse, &var16->sum); var16->var = variance_highbd(var16); break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); var16->var = variance_highbd(var16); break; - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10" - " or VPX_BITS_12"); - return -1; } } else { vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, @@ -4620,8 +5548,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); - CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * - sizeof(*cpi->tile_data))); + CHECK_MEM_ERROR( + cm, cpi->tile_data, + vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; for (tile_row = 0; tile_row < tile_rows; ++tile_row) @@ -4632,6 +5561,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; +#if CONFIG_CONSISTENT_RECODE + tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; +#endif tile_data->mode_map[i][j] = j; } } @@ -4645,6 +5577,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; TileInfo *tile_info = &this_tile->tile_info; + if 
(cpi->sf.adaptive_rd_thresh_row_mt && + this_tile->row_base_thresh_freq_fact == NULL) + vp9_row_mt_alloc_rd_thresh(cpi, this_tile); vp9_tile_init(tile_info, cm, tile_row, tile_col); cpi->tile_tok[tile_row][tile_col] = pre_tok + tile_tok; @@ -4735,10 +5670,10 @@ static void encode_frame_internal(VP9_COMP *cpi) { MACROBLOCK *const x = &td->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; + const int gf_group_index = cpi->twopass.gf_group.index; xd->mi = cm->mi_grid_visible; xd->mi[0] = cm->mi; - vp9_zero(*td->counts); vp9_zero(cpi->td.rd_counts); @@ -4756,8 +5691,12 @@ static void encode_frame_internal(VP9_COMP *cpi) { x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; - +#if CONFIG_CONSISTENT_RECODE + x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; +#endif if (xd->lossless) x->optimize = 0; + x->sharpness = cpi->oxcf.sharpness; + x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ); cm->tx_mode = select_tx_mode(cpi, xd); @@ -4799,6 +5738,28 @@ static void encode_frame_internal(VP9_COMP *cpi) { if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION) source_var_based_partition_search_method(cpi); + } else if (gf_group_index && gf_group_index < MAX_ARF_GOP_SIZE && + cpi->sf.enable_tpl_model) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + + int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int row, col; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + } + } + + vpx_clear_system_state(); + + if (tpl_frame->is_valid) + cpi->rd.r0 = 
(double)intra_cost_base / mc_dep_cost_base; } { @@ -4881,9 +5842,48 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { return sum_delta / (cm->mi_rows * cm->mi_cols); } +#if CONFIG_CONSISTENT_RECODE +static void restore_encode_params(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + int i, j; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes[i][j] = + rd_opt->prediction_type_threshes_prev[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; + } + + if (cpi->tile_data != NULL) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact[i][j] = + tile_data->thresh_freq_fact_prev[i][j]; + } + } + } + } + + cm->interp_filter = cpi->sf.default_interp_filter; +} +#endif + void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; +#if CONFIG_CONSISTENT_RECODE + restore_encode_params(cpi); +#endif + // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a // different sign bias and that buffer is then the fixed ref. However, this @@ -4891,16 +5891,11 @@ void vp9_encode_frame(VP9_COMP *cpi) { // side behavior is where the ALT ref buffer has opposite sign bias to // the other two. 
if (!frame_is_intra_only(cm)) { - if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) || - (cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[LAST_FRAME])) { - cpi->allow_comp_inter_inter = 0; - } else { + if (vp9_compound_reference_allowed(cm)) { cpi->allow_comp_inter_inter = 1; - cm->comp_fixed_ref = ALTREF_FRAME; - cm->comp_var_ref[0] = LAST_FRAME; - cm->comp_var_ref[1] = GOLDEN_FRAME; + vp9_setup_compound_reference_mode(cm); + } else { + cpi->allow_comp_inter_inter = 0; } } @@ -5064,7 +6059,8 @@ static void update_zeromv_cnt(VP9_COMP *const cpi, const MODE_INFO *const mi, for (y = 0; y < ymis; y++) for (x = 0; x < xmis; x++) { int map_offset = block_index + y * cm->mi_cols + x; - if (is_inter_block(mi) && mi->segment_id <= CR_SEGMENT_ID_BOOST2) { + if (mi->ref_frame[0] == LAST_FRAME && is_inter_block(mi) && + mi->segment_id <= CR_SEGMENT_ID_BOOST2) { if (abs(mv.row) < 8 && abs(mv.col) < 8) { if (cpi->consec_zero_mv[map_offset] < 255) cpi->consec_zero_mv[map_offset]++; @@ -5159,7 +6155,11 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t, ++td->counts->tx.tx_totals[get_uv_tx_size(mi, &xd->plane[1])]; if (cm->seg.enabled && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_update_sb_postencode(cpi, mi, mi_row, mi_col, bsize); - if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0) + if (cpi->oxcf.pass == 0 && cpi->svc.temporal_layer_id == 0 && + (!cpi->use_svc || + (cpi->use_svc && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1))) update_zeromv_cnt(cpi, mi, mi_row, mi_col, bsize); } } diff --git a/libvpx/vp9/encoder/vp9_encodeframe.h b/libvpx/vp9/encoder/vp9_encodeframe.h index cf5ae3d8a..1798c0048 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.h +++ b/libvpx/vp9/encoder/vp9_encodeframe.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_ -#define VP9_ENCODER_VP9_ENCODEFRAME_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ #include "vpx/vpx_integer.h" @@ -49,4 +49,4 @@ void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEFRAME_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEFRAME_H_ diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index f3c17f255..a68a0926a 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -50,7 +50,8 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { } static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { - { 10, 6 }, { 8, 5 }, + { 10, 6 }, + { 8, 5 }, }; // 'num' can be negative, but 'shift' must be non-negative. @@ -76,13 +77,19 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const scan_order *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; + const MODE_INFO *mbmi = xd->mi[0]; + const int sharpness = mb->sharpness; + const int64_t rdadj = (int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]; const int64_t rdmult = - ((int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; + (sharpness == 0 ? rdadj >> 1 + : (rdadj * (8 - sharpness + mbmi->segment_id)) >> 4); + const int64_t rddiv = mb->rddiv; int64_t rd_cost0, rd_cost1; int64_t rate0, rate1; int16_t t0, t1; int i, final_eob; + int count_high_values_after_eob = 0; #if CONFIG_VP9_HIGHBITDEPTH const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); #else @@ -200,9 +207,9 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const int band_next = band_translate[i + 1]; const int token_next = (i + 1 != eob) ? 
vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; - unsigned int( - *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - token_costs + band_next; + unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS] + [ENTROPY_TOKENS] = + token_costs + band_next; token_cache[rc] = vp9_pt_energy_class[t0]; ctx_next = get_coef_context(nb, token_cache, i + 1); token_tree_sel_next = (x == 0); @@ -262,6 +269,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, assert(distortion0 <= distortion_for_zero); token_cache[rc] = vp9_pt_energy_class[t0]; } + if (sharpness > 0 && abs(qcoeff[rc]) > 1) count_high_values_after_eob++; assert(accu_error >= 0); x_prev = qcoeff[rc]; // Update based on selected quantized value. @@ -272,6 +280,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, if (best_eob_cost_cur < best_block_rd_cost) { best_block_rd_cost = best_eob_cost_cur; final_eob = i + 1; + count_high_values_after_eob = 0; if (use_x1) { before_best_eob_qc = x1; before_best_eob_dqc = dqc1; @@ -283,19 +292,31 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, } } } - assert(final_eob <= eob); - if (final_eob > 0) { - int rc; - assert(before_best_eob_qc != 0); - i = final_eob - 1; - rc = scan[i]; - qcoeff[rc] = before_best_eob_qc; - dqcoeff[rc] = before_best_eob_dqc; - } - for (i = final_eob; i < eob; i++) { - int rc = scan[i]; - qcoeff[rc] = 0; - dqcoeff[rc] = 0; + if (count_high_values_after_eob > 0) { + final_eob = eob - 1; + for (; final_eob >= 0; final_eob--) { + const int rc = scan[final_eob]; + const int x = qcoeff[rc]; + if (x) { + break; + } + } + final_eob++; + } else { + assert(final_eob <= eob); + if (final_eob > 0) { + int rc; + assert(before_best_eob_qc != 0); + i = final_eob - 1; + rc = scan[i]; + qcoeff[rc] = before_best_eob_qc; + dqcoeff[rc] = before_best_eob_dqc; + } + for (i = final_eob; i < eob; i++) { + int rc = scan[i]; + qcoeff[rc] = 0; + dqcoeff[rc] = 0; + } } 
mb->plane[plane].eobs[block] = final_eob; return final_eob; @@ -357,13 +378,13 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); } return; } @@ -387,13 +408,13 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } } @@ -433,13 +454,13 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - default: assert(0); } return; } @@ -461,12 +482,12 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - default: assert(0); break; } } @@ -510,14 +531,14 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int 
row, int col, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); } return; } @@ -543,13 +564,13 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } } @@ -633,14 +654,14 @@ static void encode_block(int plane, int block, int row, int col, vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; - default: assert(0 && "Invalid transform size"); } return; } @@ -656,13 +677,13 @@ static void encode_block(int plane, int block, int row, int col, case TX_8X8: vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - default: assert(0 && "Invalid transform size"); break; } } @@ -847,7 +868,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, xd->bd); } break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); @@ -875,7 +897,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } } break; - default: assert(0); return; } if (*eob) *(args->skip) = 0; return; @@ -929,7 +950,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (!x->skip_encode && *eob) vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride); @@ -954,7 +976,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } break; - default: assert(0); break; } if (*eob) *(args->skip) = 0; } diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h index cf943bedf..fa41f70ef 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.h +++ b/libvpx/vp9/encoder/vp9_encodemb.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_ENCODEMB_H_ -#define VP9_ENCODER_VP9_ENCODEMB_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEMB_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEMB_H_ #include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" @@ -48,4 +48,4 @@ void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEMB_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEMB_H_ diff --git a/libvpx/vp9/encoder/vp9_encodemv.h b/libvpx/vp9/encoder/vp9_encodemv.h index 9fc7ab8dc..2f1be4b23 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.h +++ b/libvpx/vp9/encoder/vp9_encodemv.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ENCODEMV_H_ -#define VP9_ENCODER_VP9_ENCODEMV_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODEMV_H_ +#define VPX_VP9_ENCODER_VP9_ENCODEMV_H_ #include "vp9/encoder/vp9_encoder.h" @@ -27,7 +27,7 @@ void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, unsigned int *const max_mv_magnitude); void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], - const nmv_context *mvctx, int usehp); + const nmv_context *ctx, int usehp); void vp9_update_mv_count(ThreadData *td); @@ -35,4 +35,4 @@ void vp9_update_mv_count(ThreadData *td); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODEMV_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODEMV_H_ diff --git a/libvpx/vp9/encoder/vp9_encoder.c b/libvpx/vp9/encoder/vp9_encoder.c index 2ae59dd98..bf35b3570 100644 --- a/libvpx/vp9/encoder/vp9_encoder.c +++ b/libvpx/vp9/encoder/vp9_encoder.c @@ -35,6 +35,7 @@ #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_tile_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_alt_ref_aq.h" #include "vp9/encoder/vp9_aq_360.h" @@ -42,14 +43,21 @@ #include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_aq_variance.h" #include "vp9/encoder/vp9_bitstream.h" +#if CONFIG_INTERNAL_STATS +#include 
"vp9/encoder/vp9_blockiness.h" +#endif #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_encoder.h" -#include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mbgraph.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/encoder/vp9_mcomp.h" +#endif #include "vp9/encoder/vp9_multi_thread.h" #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_picklpf.h" @@ -65,12 +73,12 @@ #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 -#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv - // for altref computation. -#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision - // mv. Choose a very high value for - // now so that HIGH_PRECISION is always - // chosen. +// Whether to use high precision mv for altref computation. +#define ALTREF_HIGH_PRECISION_MV 1 + +// Q threshold for high precision mv. Choose a very high value for now so that +// HIGH_PRECISION is always chosen. 
+#define HIGH_PRECISION_MV_QTHRESH 200 #define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold #define FRAME_RATE_FACTOR 8 @@ -84,6 +92,9 @@ static FILE *yuv_skinmap_file = NULL; #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; #endif +#ifdef OUTPUT_YUV_SVC_SRC +FILE *yuv_svc_src[3] = { NULL, NULL, NULL }; +#endif #if 0 FILE *framepsnr; @@ -483,14 +494,10 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { *hr = 3; *hs = 5; break; - case ONETWO: - *hr = 1; - *hs = 2; - break; default: + assert(mode == ONETWO); *hr = 1; - *hs = 1; - assert(0); + *hs = 2; break; } } @@ -547,6 +554,74 @@ static void apply_active_map(VP9_COMP *cpi) { } } +static void apply_roi_map(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + struct segmentation *const seg = &cm->seg; + vpx_roi_map_t *roi = &cpi->roi; + const int *delta_q = roi->delta_q; + const int *delta_lf = roi->delta_lf; + const int *skip = roi->skip; + int ref_frame[8]; + int internal_delta_q[MAX_SEGMENTS]; + int i; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + + // TODO(jianj): Investigate why ROI not working in speed < 5 or in non + // realtime mode. + if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return; + if (!roi->enabled) return; + + memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame)); + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + // Select delta coding method; + seg->abs_delta = SEGMENT_DELTADATA; + + memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols)); + + for (i = 0; i < MAX_SEGMENTS; ++i) { + // Translate the external delta q values to internal values. 
+ internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i])); + if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i]; + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); + if (internal_delta_q[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]); + } + if (delta_lf[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF); + vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]); + } + if (skip[i] != 0) { + vp9_enable_segfeature(seg, i, SEG_LVL_SKIP); + vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]); + } + if (ref_frame[i] >= 0) { + int valid_ref = 1; + // ALTREF is not used as reference for nonrd_pickmode with 0 lag. + if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode) + valid_ref = 0; + // If GOLDEN is selected, make sure it's set as reference. + if (ref_frame[i] == GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) { + valid_ref = 0; + } + // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are + // same reference. 
+ if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0) + ref_frame[i] = LAST_FRAME; + if (valid_ref) { + vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]); + } + } + } + roi->enabled = 1; +} + static void init_level_info(Vp9LevelInfo *level_info) { Vp9LevelStats *const level_stats = &level_info->level_stats; Vp9LevelSpec *const level_spec = &level_info->level_spec; @@ -557,6 +632,13 @@ static void init_level_info(Vp9LevelInfo *level_info) { level_spec->min_altref_distance = INT_MAX; } +static int check_seg_range(int seg_data[8], int range) { + return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range || + abs(seg_data[2]) > range || abs(seg_data[3]) > range || + abs(seg_data[4]) > range || abs(seg_data[5]) > range || + abs(seg_data[6]) > range || abs(seg_data[7]) > range); +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { int i; const Vp9LevelSpec *this_level; @@ -583,6 +665,61 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level; } +int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[8], int delta_lf[8], + int skip[8], int ref_frame[8]) { + VP9_COMMON *cm = &cpi->common; + vpx_roi_map_t *roi = &cpi->roi; + const int range = 63; + const int ref_frame_range = 3; // Alt-ref + const int skip_range = 1; + const int frame_rows = cpi->common.mi_rows; + const int frame_cols = cpi->common.mi_cols; + + // Check number of rows and columns match + if (frame_rows != (int)rows || frame_cols != (int)cols) { + return -1; + } + + if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) || + !check_seg_range(ref_frame, ref_frame_range) || + !check_seg_range(skip, skip_range)) + return -1; + + // Also disable segmentation if no deltas are specified. 
+ if (!map || + (!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] | + delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] | + delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] | + delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] | + skip[5] | skip[6] | skip[7]) && + (ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 && + ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 && + ref_frame[6] == -1 && ref_frame[7] == -1))) { + vp9_disable_segmentation(&cm->seg); + cpi->roi.enabled = 0; + return 0; + } + + if (roi->roi_map) { + vpx_free(roi->roi_map); + roi->roi_map = NULL; + } + CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols)); + + // Copy to ROI sturcture in the compressor. + memcpy(roi->roi_map, map, rows * cols); + memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0])); + memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0])); + memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0])); + memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0])); + roi->enabled = 1; + roi->rows = rows; + roi->cols = cols; + + return 0; +} + int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols) { if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) { @@ -660,8 +797,17 @@ static void setup_frame(VP9_COMP *cpi) { if (!cpi->use_svc) cm->frame_context_idx = cpi->refresh_alt_ref_frame; } + // TODO(jingning): Overwrite the frame_context_idx index in multi-layer ARF + // case. Need some further investigation on if we could apply this to single + // layer ARF case as well. 
+ if (cpi->multi_layer_arf && !cpi->use_svc) { + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + cm->frame_context_idx = clamp(gf_group->layer_depth[gf_group->index] - 1, 0, + FRAME_CONTEXTS - 1); + } + if (cm->frame_type == KEY_FRAME) { - if (!is_two_pass_svc(cpi)) cpi->refresh_golden_frame = 1; + cpi->refresh_golden_frame = 1; cpi->refresh_alt_ref_frame = 1; vp9_zero(cpi->interp_filter_selected); } else { @@ -713,12 +859,17 @@ static void vp9_enc_free_mi(VP9_COMMON *cm) { cm->mi_grid_base = NULL; vpx_free(cm->prev_mi_grid_base); cm->prev_mi_grid_base = NULL; + cm->mi_alloc_size = 0; } static void vp9_swap_mi_and_prev_mi(VP9_COMMON *cm) { // Current mip will be the prev_mip for the next frame. MODE_INFO **temp_base = cm->prev_mi_grid_base; MODE_INFO *temp = cm->prev_mip; + + // Skip update prev_mi frame in show_existing_frame mode. + if (cm->show_existing_frame) return; + cm->prev_mip = cm->mip; cm->mip = temp; @@ -817,6 +968,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->active_map.map); cpi->active_map.map = NULL; + vpx_free(cpi->roi.roi_map); + cpi->roi.roi_map = NULL; + vpx_free(cpi->consec_zero_mv); cpi->consec_zero_mv = NULL; @@ -1121,8 +1275,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) { // For 1 pass cbr: allocate scaled_frame that may be used as an intermediate // buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a - // target of 1/4x1/4. - if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc) { + // target of 1/4x1/4. number_spatial_layers must be greater than 2. 
+ if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc && + cpi->svc.number_spatial_layers > 2) { cpi->svc.scaled_temp_is_alloc = 1; if (vpx_realloc_frame_buffer( &cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1, @@ -1213,15 +1368,9 @@ static void set_tile_limits(VP9_COMP *cpi) { int min_log2_tile_cols, max_log2_tile_cols; vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); - if (is_two_pass_svc(cpi) && (cpi->svc.encode_empty_frame_state == ENCODING || - cpi->svc.number_spatial_layers > 1)) { - cm->log2_tile_cols = 0; - cm->log2_tile_rows = 0; - } else { - cm->log2_tile_cols = - clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); - cm->log2_tile_rows = cpi->oxcf.tile_rows; - } + cm->log2_tile_cols = + clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + cm->log2_tile_rows = cpi->oxcf.tile_rows; if (cpi->oxcf.target_level == LEVEL_AUTO) { const int level_tile_cols = @@ -1244,24 +1393,17 @@ static void update_frame_size(VP9_COMP *cpi) { cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base)); set_tile_limits(cpi); - - if (is_two_pass_svc(cpi)) { - if (vpx_realloc_frame_buffer(&cpi->alt_ref_buffer, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to reallocate alt_ref_buffer"); - } } static void init_buffer_indices(VP9_COMP *cpi) { - cpi->lst_fb_idx = 0; - cpi->gld_fb_idx = 1; - cpi->alt_fb_idx = 2; + int ref_frame; + + for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) + cpi->ref_fb_idx[ref_frame] = ref_frame; + + cpi->lst_fb_idx = cpi->ref_fb_idx[LAST_FRAME - 1]; + cpi->gld_fb_idx = cpi->ref_fb_idx[GOLDEN_FRAME - 1]; + cpi->alt_fb_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1]; } static void init_level_constraint(LevelConstraint *lc) { @@ -1610,7 +1752,8 @@ static void 
highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad4x4x4d_bits10) break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, @@ -1689,11 +1832,6 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits12) break; - - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); } } } @@ -1757,6 +1895,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { int last_w = cpi->oxcf.width; int last_h = cpi->oxcf.height; + vp9_init_quantizer(cpi); if (cm->profile != oxcf->profile) cm->profile = oxcf->profile; cm->bit_depth = oxcf->bit_depth; cm->color_space = oxcf->color_space; @@ -2017,8 +2156,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); - CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR( + cm, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); @@ -2062,8 +2202,6 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, #endif cpi->refresh_alt_ref_frame = 0; - cpi->multi_arf_last_grp_enabled = 0; - cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; init_level_info(&cpi->level_info); @@ -2104,9 +2242,11 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, if (cpi->b_calculate_consistency) { CHECK_MEM_ERROR(cm, cpi->ssim_vars, - vpx_malloc(sizeof(*cpi->ssim_vars) * 4 * - cpi->common.mi_rows * cpi->common.mi_cols)); + vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, + sizeof(*cpi->ssim_vars) * 4)); cpi->worst_consistency = 100.0; + } else { + cpi->ssim_vars = NULL; } #endif @@ -2141,6 +2281,11 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, 
#ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); #endif +#ifdef OUTPUT_YUV_SVC_SRC + yuv_svc_src[0] = fopen("svc_src_0.yuv", "wb"); + yuv_svc_src[1] = fopen("svc_src_1.yuv", "wb"); + yuv_svc_src[2] = fopen("svc_src_2.yuv", "wb"); +#endif #if 0 framepsnr = fopen("framepsnr.stt", "a"); @@ -2219,6 +2364,11 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, vp9_set_speed_features_framesize_independent(cpi); vp9_set_speed_features_framesize_dependent(cpi); +#if CONFIG_NON_GREEDY_MV + cpi->feature_score_loc_alloc = 0; +#endif // CONFIG_NON_GREEDY_MV + for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL; + // Allocate memory to store variances for a frame. CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff))); cpi->source_var_thresh = 0; @@ -2293,6 +2443,17 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, vp9_loop_filter_init(cm); + // Set up the unit scaling factor used during motion search. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height, + cm->use_highbitdepth); +#else + vp9_setup_scale_factors_for_frame(&cpi->me_sf, cm->width, cm->height, + cm->width, cm->height); +#endif // CONFIG_VP9_HIGHBITDEPTH + cpi->td.mb.me_sf = &cpi->me_sf; + cm->error.setjmp = 0; return cpi; @@ -2307,11 +2468,15 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *cm; - unsigned int i; + unsigned int i, frame; int t; if (!cpi) return; +#if CONFIG_INTERNAL_STATS + vpx_free(cpi->ssim_vars); +#endif + cm = &cpi->common; if (cm->current_video_frame > 0) { #if CONFIG_INTERNAL_STATS @@ -2383,7 +2548,6 @@ void vp9_remove_compressor(VP9_COMP *cpi) { fclose(f); } - #endif #if 0 @@ -2402,6 +2566,16 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_denoiser_free(&(cpi->denoiser)); #endif +#if CONFIG_NON_GREEDY_MV + vpx_free(cpi->feature_score_loc_arr); + 
vpx_free(cpi->feature_score_loc_sort); + vpx_free(cpi->feature_score_loc_heap); +#endif + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + cpi->tpl_stats[frame].is_valid = 0; + } + for (t = 0; t < cpi->num_workers; ++t) { VPxWorker *const worker = &cpi->workers[t]; EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; @@ -2459,6 +2633,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #ifdef OUTPUT_YUV_REC fclose(yuv_rec_file); #endif +#ifdef OUTPUT_YUV_SVC_SRC + fclose(yuv_svc_src[0]); + fclose(yuv_svc_src[1]); + fclose(yuv_svc_src[2]); +#endif #if 0 @@ -2754,11 +2933,14 @@ static int big_rate_miss(VP9_COMP *cpi) { // test in two pass for the first static int two_pass_first_group_inter(VP9_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; - GF_GROUP *const gf_group = &twopass->gf_group; - if ((cpi->oxcf.pass == 2) && - (gf_group->index == gf_group->first_inter_index)) { - return 1; + if (cpi->oxcf.pass == 2) { + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + const int gfg_index = gf_group->index; + + if (gfg_index == 0) return gf_group->update_type[gfg_index] == LF_UPDATE; + return gf_group->update_type[gfg_index - 1] != LF_UPDATE && + gf_group->update_type[gfg_index] == LF_UPDATE; } else { return 0; } @@ -2808,9 +2990,18 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, return force_recode; } -void vp9_update_reference_frames(VP9_COMP *cpi) { +static void update_ref_frames(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; BufferPool *const pool = cm->buffer_pool; + GF_GROUP *const gf_group = &cpi->twopass.gf_group; + + // Pop ARF. + if (cm->show_existing_frame) { + cpi->lst_fb_idx = cpi->alt_fb_idx; + cpi->alt_fb_idx = + stack_pop(gf_group->arf_index_stack, gf_group->stack_size); + --gf_group->stack_size; + } // At this point the new frame has been encoded. 
// If any buffer copy / swapping is signaled it should be done here. @@ -2836,23 +3027,23 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { tmp = cpi->alt_fb_idx; cpi->alt_fb_idx = cpi->gld_fb_idx; cpi->gld_fb_idx = tmp; - - if (is_two_pass_svc(cpi)) { - cpi->svc.layer_context[0].gold_ref_idx = cpi->gld_fb_idx; - cpi->svc.layer_context[0].alt_ref_idx = cpi->alt_fb_idx; - } } else { /* For non key/golden frames */ if (cpi->refresh_alt_ref_frame) { - int arf_idx = cpi->alt_fb_idx; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_idx = gf_group->arf_update_idx[gf_group->index]; - } + int arf_idx = gf_group->top_arf_idx; + + // Push new ARF into stack. + stack_push(gf_group->arf_index_stack, cpi->alt_fb_idx, + gf_group->stack_size); + ++gf_group->stack_size; + + assert(arf_idx < REF_FRAMES); ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx); memcpy(cpi->interp_filter_selected[ALTREF_FRAME], cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); + + cpi->alt_fb_idx = arf_idx; } if (cpi->refresh_golden_frame) { @@ -2877,69 +3068,39 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { cpi->interp_filter_selected[0], sizeof(cpi->interp_filter_selected[0])); } -#if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && - cpi->denoiser.denoising_level > kDenLowLow) { - int svc_base_is_key = 0; - int denoise_svc_second_layer = 0; - if (cpi->use_svc) { - int realloc_fail = 0; - const int svc_buf_shift = - cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 - ? cpi->denoiser.num_ref_frames - : 0; - int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - svc_base_is_key = lc->is_key_frame; - denoise_svc_second_layer = - cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 
1 - : 0; - // Check if we need to allocate extra buffers in the denoiser - // for - // refreshed frames. - realloc_fail = vp9_denoiser_realloc_svc( - cm, &cpi->denoiser, svc_buf_shift, cpi->refresh_alt_ref_frame, - cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx, - cpi->gld_fb_idx, cpi->lst_fb_idx); - if (realloc_fail) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to re-allocate denoiser for SVC"); - } - vp9_denoiser_update_frame_info( - &cpi->denoiser, *cpi->Source, cpi->common.frame_type, - cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, - cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, - cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key, - denoise_svc_second_layer); + + if (gf_group->update_type[gf_group->index] == MID_OVERLAY_UPDATE) { + cpi->alt_fb_idx = + stack_pop(gf_group->arf_index_stack, gf_group->stack_size); + --gf_group->stack_size; } +} + +void vp9_update_reference_frames(VP9_COMP *cpi) { + update_ref_frames(cpi); + +#if CONFIG_VP9_TEMPORAL_DENOISING + vp9_denoiser_update_ref_frame(cpi); #endif - if (is_one_pass_cbr_svc(cpi)) { - // Keep track of frame index for each reference frame. 
- SVC *const svc = &cpi->svc; - if (cm->frame_type == KEY_FRAME) { - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; - } else { - if (cpi->refresh_last_frame) - svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; - if (cpi->refresh_golden_frame) - svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; - if (cpi->refresh_alt_ref_frame) - svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; - } - } + + if (is_one_pass_cbr_svc(cpi)) vp9_svc_update_ref_frame(cpi); } static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { MACROBLOCKD *xd = &cpi->td.mb.e_mbd; struct loopfilter *lf = &cm->lf; - - const int is_reference_frame = + int is_reference_frame = (cm->frame_type == KEY_FRAME || cpi->refresh_last_frame || cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + if (cpi->use_svc && + cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) + is_reference_frame = !cpi->svc.non_reference_frame; + + // Skip loop filter in show_existing_frame mode. + if (cm->show_existing_frame) { + lf->filter_level = 0; + return; + } if (xd->lossless) { lf->filter_level = 0; @@ -3066,8 +3227,8 @@ void vp9_scale_references(VP9_COMP *cpi) { if (cpi->oxcf.pass == 0 && !cpi->use_svc) { // Check for release of scaled reference. buf_idx = cpi->scaled_ref_idx[ref_frame - 1]; - buf = (buf_idx != INVALID_IDX) ? &pool->frame_bufs[buf_idx] : NULL; - if (buf != NULL) { + if (buf_idx != INVALID_IDX) { + buf = &pool->frame_bufs[buf_idx]; --buf->ref_count; cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX; } @@ -3098,22 +3259,21 @@ static void release_scaled_references(VP9_COMP *cpi) { refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0; for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) { const int idx = cpi->scaled_ref_idx[i - 1]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? 
&cm->buffer_pool->frame_bufs[idx] : NULL; - const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); - if (buf != NULL && - (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && - buf->buf.y_crop_height == ref->y_crop_height))) { - --buf->ref_count; - cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; + const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi, i); + if (refresh[i - 1] || (buf->buf.y_crop_width == ref->y_crop_width && + buf->buf.y_crop_height == ref->y_crop_height)) { + --buf->ref_count; + cpi->scaled_ref_idx[i - 1] = INVALID_IDX; + } } } } else { - for (i = 0; i < MAX_REF_FRAMES; ++i) { + for (i = 0; i < REFS_PER_FRAME; ++i) { const int idx = cpi->scaled_ref_idx[i]; - RefCntBuffer *const buf = - idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[idx] : NULL; - if (buf != NULL) { + if (idx != INVALID_IDX) { + RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[idx]; --buf->ref_count; cpi->scaled_ref_idx[i] = INVALID_IDX; } @@ -3172,11 +3332,9 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { case VPX_BITS_10: dc_quant_devisor = 16.0; break; - case VPX_BITS_12: - dc_quant_devisor = 64.0; - break; default: - assert(0 && "bit_depth must be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + dc_quant_devisor = 64.0; break; } #else @@ -3308,6 +3466,11 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, // Decide q and q bounds. 
*q = vp9_rc_pick_q_and_bounds(cpi, bottom_index, top_index); + if (cpi->oxcf.rc_mode == VPX_CBR && cpi->rc.force_max_q) { + *q = cpi->rc.worst_quality; + cpi->rc.force_max_q = 0; + } + if (!frame_is_intra_only(cm)) { vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); } @@ -3415,9 +3578,7 @@ static void set_frame_size(VP9_COMP *cpi) { #endif } - if ((oxcf->pass == 2) && - (!cpi->use_svc || (is_two_pass_svc(cpi) && - cpi->svc.encode_empty_frame_state != ENCODING))) { + if ((oxcf->pass == 2) && !cpi->use_svc) { vp9_set_target_rate(cpi); } @@ -3464,19 +3625,75 @@ static void set_frame_size(VP9_COMP *cpi) { set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, - uint8_t *dest) { +#if CONFIG_CONSISTENT_RECODE +static void save_encode_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int tile_col, tile_row; + int i, j; + RD_OPT *rd_opt = &cpi->rd; + for (i = 0; i < MAX_REF_FRAMES; i++) { + for (j = 0; j < REFERENCE_MODES; j++) + rd_opt->prediction_type_threshes_prev[i][j] = + rd_opt->prediction_type_threshes[i][j]; + + for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; j++) + rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; + } + + if (cpi->tile_data != NULL) { + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) { + TileDataEnc *tile_data = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + for (i = 0; i < BLOCK_SIZES; ++i) { + for (j = 0; j < MAX_MODES; ++j) { + tile_data->thresh_freq_fact_prev[i][j] = + tile_data->thresh_freq_fact[i][j]; + } + } + } + } +} +#endif + +static INLINE void set_raw_source_frame(VP9_COMP *cpi) { +#ifdef ENABLE_KF_DENOISE + if (is_spatial_denoise_enabled(cpi)) { + cpi->raw_source_frame = vp9_scale_if_required( + cm, 
&cpi->raw_unscaled_source, &cpi->raw_scaled_source, + (oxcf->pass == 0), EIGHTTAP, 0); + } else { + cpi->raw_source_frame = cpi->Source; + } +#else + cpi->raw_source_frame = cpi->Source; +#endif +} + +static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, + uint8_t *dest) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + int q = 0, bottom_index = 0, top_index = 0; + int no_drop_scene_change = 0; const INTERP_FILTER filter_scaler = (is_one_pass_cbr_svc(cpi)) - ? cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] + ? svc->downsample_filter_type[svc->spatial_layer_id] : EIGHTTAP; const int phase_scaler = (is_one_pass_cbr_svc(cpi)) - ? cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] + ? svc->downsample_filter_phase[svc->spatial_layer_id] : 0; + if (cm->show_existing_frame) { + if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi); + return 1; + } + + svc->time_stamp_prev[svc->spatial_layer_id] = svc->time_stamp_superframe; + // Flag to check if its valid to compute the source sad (used for // scene detection and for superblock content state in CBR mode). // The flag may get reset below based on SVC or resizing state. @@ -3489,30 +3706,36 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (is_one_pass_cbr_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 2 && cpi->un_scaled_source->y_height == cm->height << 2 && - cpi->svc.scaled_temp.y_width == cm->width << 1 && - cpi->svc.scaled_temp.y_height == cm->height << 1) { + svc->scaled_temp.y_width == cm->width << 1 && + svc->scaled_temp.y_height == cm->height << 1) { // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2 // result will be saved in scaled_temp and might be used later. 
- const INTERP_FILTER filter_scaler2 = cpi->svc.downsample_filter_type[1]; - const int phase_scaler2 = cpi->svc.downsample_filter_phase[1]; + const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1]; + const int phase_scaler2 = svc->downsample_filter_phase[1]; cpi->Source = vp9_svc_twostage_scale( - cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp, + cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp, filter_scaler, phase_scaler, filter_scaler2, phase_scaler2); - cpi->svc.scaled_one_half = 1; + svc->scaled_one_half = 1; } else if (is_one_pass_cbr_svc(cpi) && cpi->un_scaled_source->y_width == cm->width << 1 && cpi->un_scaled_source->y_height == cm->height << 1 && - cpi->svc.scaled_one_half) { + svc->scaled_one_half) { // If the spatial layer is 1/2x1/2 and the scaling is already done in the // two-stage scaling, use the result directly. - cpi->Source = &cpi->svc.scaled_temp; - cpi->svc.scaled_one_half = 0; + cpi->Source = &svc->scaled_temp; + svc->scaled_one_half = 0; } else { cpi->Source = vp9_scale_if_required( cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0), filter_scaler, phase_scaler); } +#ifdef OUTPUT_YUV_SVC_SRC + // Write out at most 3 spatial layers. + if (is_one_pass_cbr_svc(cpi) && svc->spatial_layer_id < 3) { + vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source); + } +#endif // Unfiltered raw source used in metrics calculation if the source // has been filtered. 
if (is_psnr_calc_enabled(cpi)) { @@ -3530,9 +3753,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, } if ((cpi->use_svc && - (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 || - cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 || - cpi->svc.current_superframe < 1)) || + (svc->spatial_layer_id < svc->number_spatial_layers - 1 || + svc->temporal_layer_id < svc->number_temporal_layers - 1 || + svc->current_superframe < 1)) || cpi->resize_pending || cpi->resize_state || cpi->external_resize || cpi->resize_state != ORIG) { cpi->compute_source_sad_onepass = 0; @@ -3562,53 +3785,101 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->Last_Source->y_height != cpi->Source->y_height) cpi->compute_source_sad_onepass = 0; - if (cm->frame_type == KEY_FRAME || cpi->resize_pending != 0) { + if (frame_is_intra_only(cm) || cpi->resize_pending != 0) { memset(cpi->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); } +#if CONFIG_VP9_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity > 0 && cpi->use_svc) + vp9_denoiser_reset_on_first_frame(cpi); +#endif vp9_update_noise_estimate(cpi); // Scene detection is always used for VBR mode or screen-content case. // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now // (need to check encoding time cost for doing this for speed 8). 
cpi->rc.high_source_sad = 0; - if (cpi->compute_source_sad_onepass && cm->show_frame && + cpi->rc.hybrid_intra_scene_change = 0; + cpi->rc.re_encode_maxq_scene_change = 0; + if (cm->show_frame && cpi->oxcf.mode == REALTIME && (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.content == VP9E_CONTENT_SCREEN || - (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc))) + (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8))) vp9_scene_detection_onepass(cpi); + if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) { + svc->high_source_sad_superframe = cpi->rc.high_source_sad; + svc->high_num_blocks_with_motion = cpi->rc.high_num_blocks_with_motion; + // On scene change reset temporal layer pattern to TL0. + // Note that if the base/lower spatial layers are skipped: instead of + // inserting base layer here, we force max-q for the next superframe + // with lower spatial layers: this is done in vp9_encodedframe_overshoot() + // when max-q is decided for the current layer. + // Only do this reset for bypass/flexible mode. + if (svc->high_source_sad_superframe && svc->temporal_layer_id > 0 && + svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + // rc->high_source_sad will get reset so copy it to restore it. + int tmp_high_source_sad = cpi->rc.high_source_sad; + vp9_svc_reset_temporal_layers(cpi, cm->frame_type == KEY_FRAME); + cpi->rc.high_source_sad = tmp_high_source_sad; + } + } + + // For 1 pass CBR, check if we are dropping this frame. + // Never drop on key frame, if base layer is key for svc, + // on scene change, or if superframe has layer sync. 
+ if ((cpi->rc.high_source_sad || svc->high_source_sad_superframe) && + !(cpi->rc.use_post_encode_drop && svc->last_layer_dropped[0])) + no_drop_scene_change = 1; + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && + !frame_is_intra_only(cm) && !no_drop_scene_change && + !svc->superframe_has_layer_sync && + (!cpi->use_svc || + !svc->layer_context[svc->temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi)) return 0; + } + // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can // avoid this frame-level upsampling (for non intra_only frames). if (frame_is_intra_only(cm) == 0 && - !(is_one_pass_cbr_svc(cpi) && cpi->svc.force_zero_mode_spatial_ref)) { + !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref)) { vp9_scale_references(cpi); } set_size_independent_vars(cpi); set_size_dependent_vars(cpi, &q, &bottom_index, &top_index); + // search method and step parameter might be changed in speed settings. + init_motion_estimation(cpi); + if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi); if (cpi->sf.svc_use_lowres_part && - cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) { - if (cpi->svc.prev_partition_svc == NULL) { + svc->spatial_layer_id == svc->number_spatial_layers - 2) { + if (svc->prev_partition_svc == NULL) { CHECK_MEM_ERROR( - cm, cpi->svc.prev_partition_svc, + cm, svc->prev_partition_svc, (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, - sizeof(*cpi->svc.prev_partition_svc))); + sizeof(*svc->prev_partition_svc))); } } - if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && + // TODO(jianj): Look into issue of skin detection with high bitdepth. 
+ if (cm->bit_depth == 8 && cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { cpi->use_skin_detection = 1; } + // Enable post encode frame dropping for CBR on non key frame, when + // ext_use_post_encode_drop is specified by user. + cpi->rc.use_post_encode_drop = cpi->rc.ext_use_post_encode_drop && + cpi->oxcf.rc_mode == VPX_CBR && + cm->frame_type != KEY_FRAME; + vp9_set_quantizer(cm, q); vp9_set_variance_partition_thresholds(cpi, q, 0); @@ -3616,6 +3887,33 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, suppress_active_map(cpi); + if (cpi->use_svc) { + // On non-zero spatial layer, check for disabling inter-layer + // prediction. + if (svc->spatial_layer_id > 0) vp9_svc_constrain_inter_layer_pred(cpi); + vp9_svc_assert_constraints_pattern(cpi); + } + + if (cpi->rc.last_post_encode_dropped_scene_change) { + cpi->rc.high_source_sad = 1; + svc->high_source_sad_superframe = 1; + // For now disable use_source_sad since Last_Source will not be the previous + // encoded but the dropped one. + cpi->sf.use_source_sad = 0; + cpi->rc.last_post_encode_dropped_scene_change = 0; + } + // Check if this high_source_sad (scene/slide change) frame should be + // encoded at high/max QP, and if so, set the q and adjust some rate + // control parameters. + if (cpi->sf.overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ && + (cpi->rc.high_source_sad || + (cpi->use_svc && svc->high_source_sad_superframe))) { + if (vp9_encodedframe_overshoot(cpi, -1, &q)) { + vp9_set_quantizer(cm, q); + vp9_set_variance_partition_thresholds(cpi, q, 0); + } + } + // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. 
if (cpi->oxcf.aq_mode == VARIANCE_AQ) { @@ -3630,18 +3928,21 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // it may be pretty bad for rate-control, // and I should handle it somehow vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else if (cpi->roi.enabled && !frame_is_intra_only(cm)) { + apply_roi_map(cpi); } apply_active_map(cpi); vp9_encode_frame(cpi); - // Check if we should drop this frame because of high overshoot. - // Only for frames where high temporal-source SAD is detected. - if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && - cpi->resize_state == ORIG && cm->frame_type != KEY_FRAME && - cpi->oxcf.content == VP9E_CONTENT_SCREEN && - cpi->rc.high_source_sad == 1) { + // Check if we should re-encode this frame at high Q because of high + // overshoot based on the encoded frame size. Only for frames where + // high temporal-source SAD is detected. + // For SVC: all spatial layers are checked for re-encoding. + if (cpi->sf.overshoot_detection_cbr_rt == RE_ENCODE_MAXQ && + (cpi->rc.high_source_sad || + (cpi->use_svc && svc->high_source_sad_superframe))) { int frame_size = 0; // Get an estimate of the encoded frame size. save_coding_context(cpi); @@ -3657,8 +3958,12 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, suppress_active_map(cpi); // Turn-off cyclic refresh for re-encoded frame. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; unsigned char *const seg_map = cpi->segmentation_map; memset(seg_map, 0, cm->mi_rows * cm->mi_cols); + memset(cr->last_coded_q_map, MAXQ, + cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map)); + cr->sb_index = 0; vp9_disable_segmentation(&cm->seg); } apply_active_map(cpi); @@ -3668,13 +3973,14 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // Update some stats from cyclic refresh, and check for golden frame update. 
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled && - cm->frame_type != KEY_FRAME) + !frame_is_intra_only(cm)) vp9_cyclic_refresh_postencode(cpi); // Update the skip mb flag probabilities based on the distribution // seen in the last encoder iteration. // update_base_skip_probs(cpi); vpx_clear_system_state(); + return 1; } #define MAX_QSTEP_ADJ 4 @@ -3703,11 +4009,16 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, int qrange_adj = 1; #endif + if (cm->show_existing_frame) { + if (is_psnr_calc_enabled(cpi)) set_raw_source_frame(cpi); + return; + } + set_size_independent_vars(cpi); - enable_acl = cpi->sf.allow_acl - ? (cm->frame_type == KEY_FRAME) || (cm->show_frame == 0) - : 0; + enable_acl = cpi->sf.allow_acl ? (cm->frame_type == KEY_FRAME) || + (cpi->twopass.gf_group.index == 1) + : 0; do { vpx_clear_system_state(); @@ -3796,6 +4107,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, vp9_setup_in_frame_q_adj(cpi); } else if (oxcf->aq_mode == LOOKAHEAD_AQ) { vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); + } else if (oxcf->aq_mode == PSNR_AQ) { + vp9_psnr_aq_mode_setup(&cm->seg); } vp9_encode_frame(cpi); @@ -3900,8 +4213,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Special case if the projected size is > the max allowed. if ((q == q_high) && ((rc->projected_frame_size >= rc->max_frame_bandwidth) || - (rc->projected_frame_size >= - big_rate_miss_high_threshold(cpi)))) { + (!rc->is_src_frame_alt_ref && + (rc->projected_frame_size >= + big_rate_miss_high_threshold(cpi))))) { int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth, big_rate_miss_high_threshold(cpi))); double q_val_high; @@ -4006,7 +4320,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, #endif // Have we been forced to adapt Q outside the expected range by an extreme // rate miss. If so adjust the active maxQ for the subsequent frames. 
- if (q > cpi->twopass.active_worst_quality) { + if (!rc->is_src_frame_alt_ref && (q > cpi->twopass.active_worst_quality)) { cpi->twopass.active_worst_quality = q; } else if (oxcf->vbr_corpus_complexity && q == q_low && rc->projected_frame_size < rc->this_frame_target) { @@ -4028,12 +4342,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, vp9_encode_frame(cpi); vpx_clear_system_state(); restore_coding_context(cpi); - vp9_pack_bitstream(cpi, dest, size); - - vp9_encode_frame(cpi); - vpx_clear_system_state(); - - restore_coding_context(cpi); } } @@ -4131,20 +4439,21 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( } } -static void set_arf_sign_bias(VP9_COMP *cpi) { +static void set_ref_sign_bias(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - int arf_sign_bias; + RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx); + const int cur_frame_index = ref_buffer->frame_index; + MV_REFERENCE_FRAME ref_frame; - if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - arf_sign_bias = cpi->rc.source_alt_ref_active && - (!cpi->refresh_alt_ref_frame || - (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)); - } else { - arf_sign_bias = - (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame); + for (ref_frame = LAST_FRAME; ref_frame < MAX_REF_FRAMES; ++ref_frame) { + const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); + const RefCntBuffer *const ref_cnt_buf = + get_ref_cnt_buffer(&cpi->common, buf_idx); + if (ref_cnt_buf) { + cm->ref_frame_sign_bias[ref_frame] = + cur_frame_index < ref_cnt_buf->frame_index; + } } - cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias; } static int setup_interp_filter_search_mask(VP9_COMP *cpi) { @@ -4352,6 +4661,16 @@ static void vp9_try_disable_lookahead_aq(VP9_COMP *cpi, size_t *size, } } +static void set_frame_index(VP9_COMP *cpi, VP9_COMMON *cm) { + RefCntBuffer *const ref_buffer = get_ref_cnt_buffer(cm, cm->new_fb_idx); + + 
if (ref_buffer) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + ref_buffer->frame_index = + cm->current_video_frame + gf_group->arf_src_offset[gf_group->index]; + } +} + static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { @@ -4360,6 +4679,34 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, struct segmentation *const seg = &cm->seg; TX_SIZE t; + // SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. + // If in constrained layer drop mode (svc.framedrop_mode != LAYER_DROP) and + // base spatial layer was dropped, no need to set svc.skip_enhancement_layer, + // as whole superframe will be dropped. + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + cpi->oxcf.target_bandwidth == 0 && + !(cpi->svc.framedrop_mode != LAYER_DROP && + cpi->svc.drop_spatial_layer[0])) { + cpi->svc.skip_enhancement_layer = 1; + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; + if (cpi->svc.framedrop_mode == LAYER_DROP || + cpi->svc.drop_spatial_layer[0] == 0) { + // For the case of constrained drop mode where the base is dropped + // (drop_spatial_layer[0] == 1), which means full superframe dropped, + // we don't increment the svc frame counters. In particular temporal + // layer counter (which is incremented in vp9_inc_frame_in_layer()) + // won't be incremented, so on a dropped frame we try the same + // temporal_layer_id on next incoming frame. This is to avoid an + // issue with temporal alignement with full superframe dropping. 
+ vp9_inc_frame_in_layer(cpi); + } + return; + } + set_ext_overrides(cpi); vpx_clear_system_state(); @@ -4368,8 +4715,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, if (is_spatial_denoise_enabled(cpi)) spatial_denoise_frame(cpi); #endif - // Set the arf sign bias for this frame. - set_arf_sign_bias(cpi); + if (cm->show_existing_frame == 0) { + // Update frame index + set_frame_index(cpi, cm); + + // Set the arf sign bias for this frame. + set_ref_sign_bias(cpi); + } // Set default state for segment based loop filter update flags. cm->lf.mode_ref_delta_update = 0; @@ -4404,67 +4756,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->reset_frame_context = 2; } } - if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) { - // Use context 0 for intra only empty frame, but the last frame context - // for other empty frames. - if (cpi->svc.encode_empty_frame_state == ENCODING) { - if (cpi->svc.encode_intra_empty_frame != 0) - cm->frame_context_idx = 0; - else - cm->frame_context_idx = FRAME_CONTEXTS - 1; - } else { - cm->frame_context_idx = - cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id; - } - - cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode; - - // The probs will be updated based on the frame type of its previous - // frame if frame_parallel_decoding_mode is 0. The type may vary for - // the frame after a key frame in base layer since we may drop enhancement - // layers. So set frame_parallel_decoding_mode to 1 in this case. - if (cm->frame_parallel_decoding_mode == 0) { - if (cpi->svc.number_temporal_layers == 1) { - if (cpi->svc.spatial_layer_id == 0 && - cpi->svc.layer_context[0].last_frame_type == KEY_FRAME) - cm->frame_parallel_decoding_mode = 1; - } else if (cpi->svc.spatial_layer_id == 0) { - // Find the 2nd frame in temporal base layer and 1st frame in temporal - // enhancement layers from the key frame. 
- int i; - for (i = 0; i < cpi->svc.number_temporal_layers; ++i) { - if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) { - cm->frame_parallel_decoding_mode = 1; - break; - } - } - } - } - } - - // For 1 pass CBR, check if we are dropping this frame. - // For spatial layers, for now only check for frame-dropping on first spatial - // layer, and if decision is to drop, we drop whole super-frame. - if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && - cm->frame_type != KEY_FRAME) { - if (vp9_rc_drop_frame(cpi) || - (is_one_pass_cbr_svc(cpi) && cpi->svc.rc_drop_superframe == 1)) { - vp9_rc_postencode_update_drop_frame(cpi); - ++cm->current_video_frame; - cpi->ext_refresh_frame_flags_pending = 0; - cpi->svc.rc_drop_superframe = 1; - cpi->last_frame_dropped = 1; - // TODO(marpan): Advancing the svc counters on dropped frames can break - // the referencing scheme for the fixed svc patterns defined in - // vp9_one_pass_cbr_svc_start_layer(). Look into fixing this issue, but - // for now, don't advance the svc frame counters on dropped frame. - // if (cpi->use_svc) - // vp9_inc_frame_in_layer(cpi); - - return; - } - } vpx_clear_system_state(); @@ -4472,14 +4763,25 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif +#if CONFIG_CONSISTENT_RECODE + // Backup to ensure consistency between recodes + save_encode_params(cpi); +#endif if (cpi->sf.recode_loop == DISALLOW_RECODE) { - encode_without_recode_loop(cpi, size, dest); + if (!encode_without_recode_loop(cpi, size, dest)) return; } else { encode_with_recode_loop(cpi, size, dest); } - cpi->last_frame_dropped = 0; + // TODO(jingning): When using show existing frame mode, we assume that the + // current ARF will be directly used as the final reconstructed frame. This is + // an encoder control scheme. 
One could in principle explore other + // possibilities to arrange the reference frame buffer and their coding order. + if (cm->show_existing_frame) { + ref_cnt_fb(cm->buffer_pool->frame_bufs, &cm->new_fb_idx, + cm->ref_frame_map[cpi->alt_fb_idx]); + } // Disable segmentation if it decrease rate/distortion ratio if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) @@ -4527,9 +4829,33 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, // Pick the loop filter level for the frame. loopfilter_frame(cpi, cm); + if (cpi->rc.use_post_encode_drop) save_coding_context(cpi); + // build the bitstream vp9_pack_bitstream(cpi, dest, size); + if (cpi->rc.use_post_encode_drop && cm->base_qindex < cpi->rc.worst_quality && + cpi->svc.spatial_layer_id == 0 && post_encode_drop_cbr(cpi, size)) { + restore_coding_context(cpi); + return; + } + + cpi->last_frame_dropped = 0; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 0; + if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1) + cpi->svc.num_encoded_top_layer++; + + // Keep track of the frame buffer index updated/refreshed for the + // current encoded TL0 superframe. 
+ if (cpi->svc.temporal_layer_id == 0) { + if (cpi->refresh_last_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->lst_fb_idx; + else if (cpi->refresh_golden_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->gld_fb_idx; + else if (cpi->refresh_alt_ref_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->alt_fb_idx; + } + if (cm->seg.update_map) update_reference_segmentation_map(cpi); if (frame_is_intra_only(cm) == 0) { @@ -4537,17 +4863,18 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, } vp9_update_reference_frames(cpi); - for (t = TX_4X4; t <= TX_32X32; t++) - full_to_model_counts(cpi->td.counts->coef[t], - cpi->td.rd_counts.coef_counts[t]); - - if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) - vp9_adapt_coef_probs(cm); + if (!cm->show_existing_frame) { + for (t = TX_4X4; t <= TX_32X32; ++t) { + full_to_model_counts(cpi->td.counts->coef[t], + cpi->td.rd_counts.coef_counts[t]); + } - if (!frame_is_intra_only(cm)) { if (!cm->error_resilient_mode && !cm->frame_parallel_decoding_mode) { - vp9_adapt_mode_probs(cm); - vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + if (!frame_is_intra_only(cm)) { + vp9_adapt_mode_probs(cm); + vp9_adapt_mv_probs(cm, cm->allow_high_precision_mv); + } + vp9_adapt_coef_probs(cm); } } @@ -4567,8 +4894,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->last_frame_type = cm->frame_type; - if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) - vp9_rc_postencode_update(cpi, *size); + vp9_rc_postencode_update(cpi, *size); + + *size = VPXMAX(1, *size); #if 0 output_frame_level_debug_stats(cpi); @@ -4592,7 +4920,10 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cm->last_height = cm->height; // reset to normal state now that we are done. 
- if (!cm->show_existing_frame) cm->last_show_frame = cm->show_frame; + if (!cm->show_existing_frame) { + cm->last_show_frame = cm->show_frame; + cm->prev_frame = cm->cur_frame; + } if (cm->show_frame) { vp9_swap_mi_and_prev_mi(cm); @@ -4601,19 +4932,24 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, ++cm->current_video_frame; if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); } - cm->prev_frame = cm->cur_frame; - if (cpi->use_svc) + if (cpi->use_svc) { cpi->svc .layer_context[cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers + cpi->svc.temporal_layer_id] .last_frame_type = cm->frame_type; + // Reset layer_sync back to 0 for next frame. + cpi->svc.spatial_layer_sync[cpi->svc.spatial_layer_id] = 0; + } cpi->force_update_segmentation = 0; if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) vp9_alt_ref_aq_unset_all(cpi->alt_ref_aq, cpi); + + cpi->svc.previous_frame_is_intra_only = cm->intra_only; + cpi->svc.set_intra_only_frame = 0; } static void SvcEncode(VP9_COMP *cpi, size_t *size, uint8_t *dest, @@ -4638,8 +4974,7 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; encode_frame_to_data_rate(cpi, size, dest, frame_flags); - if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) - vp9_twopass_postencode_update(cpi); + vp9_twopass_postencode_update(cpi); } #endif // !CONFIG_REALTIME_ONLY @@ -4649,6 +4984,8 @@ static void init_ref_frame_bufs(VP9_COMMON *cm) { cm->new_fb_idx = INVALID_IDX; for (i = 0; i < REF_FRAMES; ++i) { cm->ref_frame_map[i] = INVALID_IDX; + } + for (i = 0; i < FRAME_BUFFERS; ++i) { pool->frame_bufs[i].ref_count = 0; } } @@ -4702,6 +5039,12 @@ int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags, check_initial_width(cpi, subsampling_x, subsampling_y); #endif // CONFIG_VP9_HIGHBITDEPTH +#if CONFIG_VP9_HIGHBITDEPTH + // Disable denoiser for high bitdepth since vp9_denoiser_filter only works for + // 8 bits. 
+ if (cm->bit_depth > 8) cpi->oxcf.noise_sensitivity = 0; +#endif + #if CONFIG_VP9_TEMPORAL_DENOISING setup_denoiser_buffer(cpi); #endif @@ -4822,10 +5165,6 @@ static void check_src_altref(VP9_COMP *cpi, } #if CONFIG_INTERNAL_STATS -extern double vp9_get_blockiness(const uint8_t *img1, int img1_pitch, - const uint8_t *img2, int img2_pitch, int width, - int height); - static void adjust_image_stat(double y, double u, double v, double all, ImageStat *s) { s->stat[Y] += y; @@ -5065,6 +5404,1114 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { } } +typedef struct GF_PICTURE { + YV12_BUFFER_CONFIG *frame; + int ref_frame[3]; + FRAME_UPDATE_TYPE update_type; +} GF_PICTURE; + +static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { + VP9_COMMON *cm = &cpi->common; + int frame_idx = 0; + int i; + int gld_index = -1; + int alt_index = -1; + int lst_index = -1; + int arf_index_stack[MAX_ARF_LAYERS]; + int arf_stack_size = 0; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + int frame_gop_offset = 0; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; + + memset(recon_frame_index, -1, sizeof(recon_frame_index)); + stack_init(arf_index_stack, MAX_ARF_LAYERS); + + // TODO(jingning): To be used later for gf frame type parsing. 
+ (void)gf_group; + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, i); + if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + + if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; + } + } + + for (i = 0; i < REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + // Initialize Golden reference frame. + gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; + gf_picture[0].update_type = gf_group->update_type[0]; + gld_index = 0; + ++*tpl_group_frames; + + // Initialize base layer ARF frame + gf_picture[1].frame = cpi->Source; + gf_picture[1].ref_frame[0] = gld_index; + gf_picture[1].ref_frame[1] = lst_index; + gf_picture[1].ref_frame[2] = alt_index; + gf_picture[1].update_type = gf_group->update_type[1]; + alt_index = 1; + ++*tpl_group_frames; + + // Initialize P frames + for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + struct lookahead_entry *buf; + frame_gop_offset = gf_group->frame_gop_index[frame_idx]; + buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; + + switch (gf_group->update_type[frame_idx]) { + case ARF_UPDATE: + stack_push(arf_index_stack, alt_index, 
arf_stack_size); + ++arf_stack_size; + alt_index = frame_idx; + break; + case LF_UPDATE: lst_index = frame_idx; break; + case OVERLAY_UPDATE: + gld_index = frame_idx; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + case USE_BUF_FRAME: + lst_index = alt_index; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + default: break; + } + + ++*tpl_group_frames; + + // The length of group of pictures is baseline_gf_interval, plus the + // beginning golden frame from last GOP, plus the last overlay frame in + // the same GOP. + if (frame_idx == gf_group->gf_group_size) break; + } + + alt_index = -1; + ++frame_idx; + ++frame_gop_offset; + + // Extend two frames outside the current gf group. + for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = LF_UPDATE; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_gop_offset; + } +} + +static void init_tpl_stats(VP9_COMP *cpi) { + int frame_idx; + for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; +#if CONFIG_NON_GREEDY_MV + int rf_idx; + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + tpl_frame->mv_dist_sum[rf_idx] = 0; + tpl_frame->mv_cost_sum[rf_idx] = 0; + } +#endif + memset(tpl_frame->tpl_stats_ptr, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +#if CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction( + VP9_COMP *cpi, ThreadData *td, int 
frame_idx, uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, BLOCK_SIZE bsize, int mi_row, + int mi_col, MV *mv, int rf_idx, double *mv_dist, double *mv_cost) { +#else // CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, + int frame_idx, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, + int stride, BLOCK_SIZE bsize, + int mi_row, int mi_col, MV *mv) { +#endif // CONFIG_NON_GREEDY_MV + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = NSTEP; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; +#if CONFIG_NON_GREEDY_MV + // lambda is used to adjust the importance of motion vector consitency. + // TODO(angiebird): Figure out lambda's proper value. + double lambda = cpi->tpl_stats[frame_idx].lambda; + int_mv nb_full_mvs[NB_MVS_NUM]; +#endif + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + +#if CONFIG_NON_GREEDY_MV + (void)search_method; + (void)sadpb; + vp9_prepare_nb_full_mvs(&cpi->tpl_stats[frame_idx], mi_row, mi_col, rf_idx, + bsize, nb_full_mvs); + vp9_full_pixel_diamond_new(cpi, x, &best_ref_mv1_full, step_param, lambda, 1, + &cpi->fn_ptr[bsize], nb_full_mvs, NB_MVS_NUM, mv, + mv_dist, mv_cost); +#else + (void)frame_idx; + (void)mi_row; + (void)mi_col; + 
vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, mv, 0, 0); +#endif + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + 
TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; + const int64_t mc_flow = tpl_ptr->mc_flow; + const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; + *tpl_ptr = *src_stats; + tpl_ptr->mc_flow = mc_flow; + tpl_ptr->mc_ref_cost = mc_ref_cost; + tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; + } + } +} + +static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; + TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; + MV mv = tpl_stats->mv.as_mv; + int mv_row = mv.row >> 3; + int mv_col = mv.col >> 3; + + int ref_pos_row = mi_row * MI_SIZE + mv_row; + int ref_pos_col = mi_col * MI_SIZE + mv_col; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + + int64_t mc_flow = tpl_stats->mc_dep_cost - + (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / + tpl_stats->intra_cost; + + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + 
TplDepStats *des_stats = + &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + + (ref_mi_col + idx)]; + + des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; + des_stats->mc_ref_cost += + ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / + pix_num; + assert(overlap_area >= 0); + } + } + } + } +} + +static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + int idx, idy; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = + &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; + tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, + BLOCK_8X8); + } + } +} + +static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, int64_t *recon_error, + int64_t *sse) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + uint16_t eob; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 
0 : 2; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, + &eob, scan_order->scan, scan_order->iscan); + } else { + vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, + p->quant_fp, qcoeff, dqcoeff, pd->dequant, &eob, + scan_order->scan, scan_order->iscan); + } +#else + vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, &eob, scan_order->scan, + scan_order->iscan); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + *recon_error = VPXMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = VPXMAX(*sse, 1); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. 
+ switch (tx_size) { + case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +static void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + switch (tx_size) { + case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} + +#if CONFIG_NON_GREEDY_MV +double get_feature_score(uint8_t *buf, ptrdiff_t stride, int rows, int cols) { + double IxIx = 0; + double IxIy = 0; + double IyIy = 0; + double score; + int r, c; + vpx_clear_system_state(); + for (r = 0; r + 1 < rows; ++r) { + for (c = 0; c + 1 < cols; ++c) { + int diff_x = buf[r * stride + c] - buf[r * stride + c + 1]; + int diff_y = buf[r * stride + c] - buf[(r + 1) * stride + c]; + IxIx += diff_x * diff_x; + IxIy += diff_x * diff_y; + IyIy += diff_y * diff_y; + } + } + IxIx /= (rows - 1) * (cols - 1); + IxIy /= (rows - 1) * (cols - 1); + IyIy /= (rows - 1) * (cols - 1); + score = (IxIx * IyIy - IxIy * IxIy + 0.0001) / (IxIx + IyIy + 0.0001); + return score; +} +#endif + +static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, + int mi_col) { + x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.row_max = + (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); + x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.col_max = + ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); +} + +static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + struct scale_factors *sf, GF_PICTURE *gf_picture, + int frame_idx, TplDepFrame *tpl_frame, + int16_t *src_diff, tran_low_t 
*coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, + int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, + YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, + int64_t *recon_error, int64_t *sse) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int pix_num = bw * bh; + int best_rf_idx = -1; + int_mv best_mv; + int64_t best_inter_cost = INT64_MAX; + int64_t inter_cost; + int rf_idx; + const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + + int64_t best_intra_cost = INT64_MAX; + int64_t intra_cost; + PREDICTION_MODE mode; + int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + MODE_INFO mi_above, mi_left; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; + xd->above_mi = (mi_row > 0) ? &mi_above : NULL; + xd->left_mi = (mi_col > 0) ? 
&mi_left : NULL; + + // Intra prediction search + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + uint8_t *src, *dst; + int src_stride, dst_stride; + + src = xd->cur_buf->y_buffer + mb_y_offset; + src_stride = xd->cur_buf->y_stride; + + dst = &predictor[0]; + dst_stride = bw; + + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, + src_stride, dst, dst_stride, 0, 0, 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride, xd->bd); + highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); + } +#else + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; + } + + // Motion compensated prediction + best_mv.as_int = 0; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + int_mv mv; + if (ref_frame[rf_idx] == NULL) continue; + +#if CONFIG_NON_GREEDY_MV + (void)td; + mv.as_int = + get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col)->as_int; +#else + motion_compensated_prediction( + cpi, td, frame_idx, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bsize, + mi_row, mi_col, &mv.as_mv); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), + ref_frame[rf_idx]->y_stride, 
CONVERT_TO_SHORTPTR(&predictor[0]), bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd->bd); + vpx_highbd_subtract_block( + bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); + highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vp9_build_inter_predictor( + ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, + 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); + } +#else + vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); +#endif + +#if CONFIG_NON_GREEDY_MV + tpl_stats->inter_cost_arr[rf_idx] = inter_cost; + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, + &tpl_stats->recon_error_arr[rf_idx], + &tpl_stats->sse_arr[rf_idx]); +#endif + + if (inter_cost < best_inter_cost) { + best_rf_idx = rf_idx; + best_inter_cost = inter_cost; + best_mv.as_int = mv.as_int; +#if CONFIG_NON_GREEDY_MV + *recon_error = tpl_stats->recon_error_arr[rf_idx]; + *sse = tpl_stats->sse_arr[rf_idx]; +#else + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse); +#endif + } + } + best_intra_cost = VPXMAX(best_intra_cost, 1); + best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); + tpl_stats->inter_cost = VPXMAX( + 1, (best_inter_cost 
<< TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->intra_cost = VPXMAX( + 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + tpl_stats->mv.as_int = best_mv.as_int; +} + +#if CONFIG_NON_GREEDY_MV +static int compare_feature_score(const void *a, const void *b) { + const FEATURE_SCORE_LOC *aa = *(FEATURE_SCORE_LOC *const *)a; + const FEATURE_SCORE_LOC *bb = *(FEATURE_SCORE_LOC *const *)b; + if (aa->feature_score < bb->feature_score) { + return 1; + } else if (aa->feature_score > bb->feature_score) { + return -1; + } else { + return 0; + } +} + +static void do_motion_search(VP9_COMP *cpi, ThreadData *td, int frame_idx, + YV12_BUFFER_CONFIG **ref_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + int rf_idx; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + int_mv *mv = get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col); + if (ref_frame[rf_idx] == NULL) { + tpl_stats->ready[rf_idx] = 0; + continue; + } else { + tpl_stats->ready[rf_idx] = 1; + } + motion_compensated_prediction( + cpi, td, frame_idx, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bsize, + mi_row, mi_col, &mv->as_mv, rf_idx, &tpl_stats->mv_dist[rf_idx], + &tpl_stats->mv_cost[rf_idx]); + } +} + +#define CHANGE_MV_SEARCH_ORDER 1 +#define USE_PQSORT 1 +#define RE_COMPUTE_MV_INCONSISTENCY 1 + +#if CHANGE_MV_SEARCH_ORDER +#if USE_PQSORT +static void max_heap_pop(FEATURE_SCORE_LOC **heap, int *size, + FEATURE_SCORE_LOC **output) { + if (*size > 0) { + *output = 
heap[0]; + --*size; + if (*size > 0) { + int p, l, r; + heap[0] = heap[*size]; + p = 0; + l = 2 * p + 1; + r = 2 * p + 2; + while (l < *size) { + FEATURE_SCORE_LOC *tmp; + int c = l; + if (r < *size && heap[r]->feature_score > heap[l]->feature_score) { + c = r; + } + if (heap[p]->feature_score >= heap[c]->feature_score) { + break; + } + tmp = heap[p]; + heap[p] = heap[c]; + heap[c] = tmp; + p = c; + l = 2 * p + 1; + r = 2 * p + 2; + } + } + } else { + assert(0); + } +} + +static void max_heap_push(FEATURE_SCORE_LOC **heap, int *size, + FEATURE_SCORE_LOC *input) { + int c, p; + FEATURE_SCORE_LOC *tmp; + heap[*size] = input; + ++*size; + c = *size - 1; + p = c >> 1; + while (c > 0 && heap[c]->feature_score > heap[p]->feature_score) { + tmp = heap[p]; + heap[p] = heap[c]; + heap[c] = tmp; + c = p; + p >>= 1; + } +} + +static void add_nb_blocks_to_heap(VP9_COMP *cpi, const TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int *heap_size) { + const int mi_unit = num_8x8_blocks_wide_lookup[bsize]; + const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } }; + int i; + for (i = 0; i < NB_MVS_NUM; ++i) { + int r = dirs[i][0] * mi_unit; + int c = dirs[i][1] * mi_unit; + if (mi_row + r >= 0 && mi_row + r < tpl_frame->mi_rows && mi_col + c >= 0 && + mi_col + c < tpl_frame->mi_cols) { + FEATURE_SCORE_LOC *fs_loc = + &cpi->feature_score_loc_arr[(mi_row + r) * tpl_frame->stride + + (mi_col + c)]; + if (fs_loc->visited == 0) { + max_heap_push(cpi->feature_score_loc_heap, heap_size, fs_loc); + } + } + } +} +#endif // USE_PQSORT +#endif // CHANGE_MV_SEARCH_ORDER + +static void build_motion_field(VP9_COMP *cpi, MACROBLOCKD *xd, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame[3], + BLOCK_SIZE bsize) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int 
fs_loc_sort_size; + int fs_loc_heap_size; + int mi_row, mi_col; + + tpl_frame->lambda = 250; + + fs_loc_sort_size = 0; + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + FEATURE_SCORE_LOC *fs_loc = + &cpi->feature_score_loc_arr[mi_row * tpl_frame->stride + mi_col]; + tpl_stats->feature_score = get_feature_score( + xd->cur_buf->y_buffer + mb_y_offset, xd->cur_buf->y_stride, bw, bh); + fs_loc->visited = 0; + fs_loc->feature_score = tpl_stats->feature_score; + fs_loc->mi_row = mi_row; + fs_loc->mi_col = mi_col; + cpi->feature_score_loc_sort[fs_loc_sort_size] = fs_loc; + ++fs_loc_sort_size; + } + } + + qsort(cpi->feature_score_loc_sort, fs_loc_sort_size, + sizeof(*cpi->feature_score_loc_sort), compare_feature_score); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + int rf_idx; + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + tpl_stats->ready[rf_idx] = 0; + } + } + } + +#if CHANGE_MV_SEARCH_ORDER +#if !USE_PQSORT + for (i = 0; i < fs_loc_sort_size; ++i) { + FEATURE_SCORE_LOC *fs_loc = cpi->feature_score_loc_sort[i]; + do_motion_search(cpi, td, frame_idx, ref_frame, bsize, fs_loc->mi_row, + fs_loc->mi_col); + } +#else // !USE_PQSORT + fs_loc_heap_size = 0; + max_heap_push(cpi->feature_score_loc_heap, &fs_loc_heap_size, + cpi->feature_score_loc_sort[0]); + + while (fs_loc_heap_size > 0) { + FEATURE_SCORE_LOC *fs_loc; + max_heap_pop(cpi->feature_score_loc_heap, &fs_loc_heap_size, &fs_loc); + + fs_loc->visited = 1; + + do_motion_search(cpi, td, 
frame_idx, ref_frame, bsize, fs_loc->mi_row, + fs_loc->mi_col); + + add_nb_blocks_to_heap(cpi, tpl_frame, bsize, fs_loc->mi_row, fs_loc->mi_col, + &fs_loc_heap_size); + } +#endif // !USE_PQSORT +#else // CHANGE_MV_SEARCH_ORDER + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + do_motion_search(cpi, td, frame_idx, ref_frame, bsize, mi_row, mi_col); + } + } +#endif // CHANGE_MV_SEARCH_ORDER +} +#endif // CONFIG_NON_GREEDY_MV + +static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, + int frame_idx, BLOCK_SIZE bsize) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; + YV12_BUFFER_CONFIG *ref_frame[3] = { NULL, NULL, NULL }; + + VP9_COMMON *cm = &cpi->common; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); +#endif + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int64_t recon_error, sse; +#if CONFIG_NON_GREEDY_MV + int square_block_idx; +#endif + + // Setup scaling factor +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height, + cpi->common.use_highbitdepth); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + predictor = 
CONVERT_TO_BYTEPTR(predictor16); + else + predictor = predictor8; +#else + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Prepare reference frame pointers. If any reference frame slot is + // unavailable, the pointer will be set to Null. + for (idx = 0; idx < 3; ++idx) { + int rf_idx = gf_picture[frame_idx].ref_frame[idx]; + if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + xd->cur_buf = this_frame; + + // Get rd multiplier set up. + rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); + + tpl_frame->is_valid = 1; + + cm->base_qindex = tpl_frame->base_qindex; + vp9_frame_init_quantizer(cpi); + +#if CONFIG_NON_GREEDY_MV + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, xd, frame_idx, ref_frame, square_bsize); + } +#endif + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, + src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, + tx_size, ref_frame, predictor, &recon_error, &sse); + // Motion flow dependency dispenser. 
+ tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, + tpl_frame->stride); + + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, + bsize); +#if CONFIG_NON_GREEDY_MV + { + int rf_idx; + TplDepStats *this_tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { +#if RE_COMPUTE_MV_INCONSISTENCY + MV this_mv = + get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row, mi_col)->as_mv; + MV full_mv; + int_mv nb_full_mvs[NB_MVS_NUM]; + vp9_prepare_nb_full_mvs(tpl_frame, mi_row, mi_col, rf_idx, bsize, + nb_full_mvs); + full_mv.row = this_mv.row >> 3; + full_mv.col = this_mv.col >> 3; + this_tpl_stats->mv_cost[rf_idx] = + vp9_nb_mvs_inconsistency(&full_mv, nb_full_mvs, NB_MVS_NUM); +#endif // RE_COMPUTE_MV_INCONSISTENCY + tpl_frame->mv_dist_sum[rf_idx] += this_tpl_stats->mv_dist[rf_idx]; + tpl_frame->mv_cost_sum[rf_idx] += this_tpl_stats->mv_cost[rf_idx]; + } + } +#endif // CONFIG_NON_GREEDY_MV + } + } +} + +#if CONFIG_NON_GREEDY_MV +#define DUMP_TPL_STATS 0 +#if DUMP_TPL_STATS +static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { + printf("%d %d\n", h, w); + for (int i = 0; i < h; ++i) { + for (int j = 0; j < w; ++j) { + printf("%d ", buf[(row + i) * stride + col + j]); + } + } + printf("\n"); +} + +static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { + dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, + frame_buf->y_width); + dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); + dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); +} + +static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, + const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { + int frame_idx; + const VP9_COMMON *cm = &cpi->common; + for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { + const TplDepFrame *tpl_frame = 
&cpi->tpl_stats[frame_idx]; + int idx = 0; + int mi_row, mi_col; + int rf_idx; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + printf("=\n"); + printf("frame_idx %d mi_rows %d mi_cols %d bsize %d\n", frame_idx, + cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE); + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + const TplDepStats *tpl_ptr = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + int_mv mv = *get_pyramid_mv(tpl_frame, idx, bsize, mi_row, mi_col); + printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, mv.as_mv.col); + } + } + } + + dump_frame_buf(gf_picture[frame_idx].frame); + + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + const TplDepStats *tpl_ptr = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + printf("%f ", tpl_ptr->feature_score); + } + } + } + printf("\n"); + + rf_idx = gf_picture[frame_idx].ref_frame[idx]; + printf("has_ref %d\n", rf_idx != -1); + if (rf_idx != -1) { + YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[rf_idx].frame; + dump_frame_buf(ref_frame_buf); + } + } +} +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +static void init_tpl_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int frame; + + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_NON_GREEDY_MV + int sqr_bsize; + int rf_idx; + + // TODO(angiebird): This probably needs further modifications to support + // frame scaling later on. + if (cpi->feature_score_loc_alloc == 0) { + // The smallest block size of motion field is 4x4, but the mi_unit is 8x8, + // therefore the number of units is "mi_rows * mi_cols * 4" here. 
+ CHECK_MEM_ERROR( + cm, cpi->feature_score_loc_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->feature_score_loc_arr))); + CHECK_MEM_ERROR(cm, cpi->feature_score_loc_sort, + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->feature_score_loc_sort))); + CHECK_MEM_ERROR(cm, cpi->feature_score_loc_heap, + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->feature_score_loc_heap))); + + cpi->feature_score_loc_alloc = 1; + } +#endif + + // TODO(jingning): Reduce the actual memory use for tpl model build up. + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { + if (cpi->tpl_stats[frame].width >= mi_cols && + cpi->tpl_stats[frame].height >= mi_rows && + cpi->tpl_stats[frame].tpl_stats_ptr) + continue; + +#if CONFIG_NON_GREEDY_MV + vpx_free(cpi->tpl_stats[frame].pyramid_mv_arr); + for (rf_idx = 0; rf_idx < 3; ++rf_idx) { + for (sqr_bsize = 0; sqr_bsize < SQUARE_BLOCK_SIZES; ++sqr_bsize) { + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize], + vpx_calloc( + mi_rows * mi_cols, + sizeof( + *cpi->tpl_stats[frame].pyramid_mv_arr[rf_idx][sqr_bsize]))); + } + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, + vpx_calloc(mi_rows * mi_cols, + sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); + cpi->tpl_stats[frame].is_valid = 0; + cpi->tpl_stats[frame].width = mi_cols; + cpi->tpl_stats[frame].height = mi_rows; + cpi->tpl_stats[frame].stride = mi_cols; + cpi->tpl_stats[frame].mi_rows = cm->mi_rows; + cpi->tpl_stats[frame].mi_cols = cm->mi_cols; + } + + for (frame = 0; frame < REF_FRAMES; ++frame) { + cpi->enc_frame_buf[frame].mem_valid = 0; + cpi->enc_frame_buf[frame].released = 1; + } +} + +static void setup_tpl_stats(VP9_COMP *cpi) { + GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int tpl_group_frames = 0; + int frame_idx; + const BLOCK_SIZE bsize = BLOCK_32X32; + + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + 
+ init_tpl_stats(cpi); + + // Backward propagation from tpl_group_frames to 1. + for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { + if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; + mc_flow_dispenser(cpi, gf_picture, frame_idx, bsize); + } +#if CONFIG_NON_GREEDY_MV +#if DUMP_TPL_STATS + dump_tpl_stats(cpi, tpl_group_frames, gf_picture, bsize); +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV +} + int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, int64_t *time_end, int flush) { @@ -5077,17 +6524,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, struct lookahead_entry *last_source = NULL; struct lookahead_entry *source = NULL; int arf_src_index; + const int gf_group_index = cpi->twopass.gf_group.index; int i; - if (is_two_pass_svc(cpi)) { -#if CONFIG_SPATIAL_SVC - vp9_svc_start_frame(cpi); - // Use a small empty frame instead of a real frame - if (cpi->svc.encode_empty_frame_state == ENCODING) - source = &cpi->svc.empty_frame; -#endif - if (oxcf->pass == 2) vp9_restore_layer_context(cpi); - } else if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_cbr_svc(cpi)) { vp9_one_pass_cbr_svc_start_layer(cpi); } @@ -5098,10 +6538,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Is multi-arf enabled. // Note that at the moment multi_arf is only configured for 2 pass VBR and // will not work properly with svc. - if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf > 1)) - cpi->multi_arf_allowed = 1; + // Enable the Jingning's new "multi_layer_arf" code if "enable_auto_arf" + // is greater than or equal to 2. 
+ if ((oxcf->pass == 2) && !cpi->use_svc && (cpi->oxcf.enable_auto_arf >= 2)) + cpi->multi_layer_arf = 1; else - cpi->multi_arf_allowed = 0; + cpi->multi_layer_arf = 0; // Normal defaults cm->reset_frame_context = 0; @@ -5115,9 +6557,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // Should we encode an arf frame. arf_src_index = get_arf_src_index(cpi); - // Skip alt frame if we encode the empty frame - if (is_two_pass_svc(cpi) && source != NULL) arf_src_index = 0; - if (arf_src_index) { for (i = 0; i <= arf_src_index; ++i) { struct lookahead_entry *e = vp9_lookahead_peek(cpi->lookahead, i); @@ -5132,25 +6571,17 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } } + // Clear arf index stack before group of pictures processing starts. + if (gf_group_index == 1) { + stack_init(cpi->twopass.gf_group.arf_index_stack, MAX_LAG_BUFFERS * 2); + cpi->twopass.gf_group.stack_size = 0; + } + if (arf_src_index) { assert(arf_src_index <= rc->frames_to_key); - if ((source = vp9_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) { cpi->alt_ref_source = source; -#if CONFIG_SPATIAL_SVC - if (is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0) { - int i; - // Reference a hidden frame from a lower layer - for (i = cpi->svc.spatial_layer_id - 1; i >= 0; --i) { - if (oxcf->ss_enable_auto_arf[i]) { - cpi->gld_fb_idx = cpi->svc.layer_context[i].alt_ref_idx; - break; - } - } - } - cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1; -#endif #if !CONFIG_REALTIME_ONLY if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) { @@ -5192,7 +6623,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } // Read in the source frame. 
- if (cpi->use_svc) + if (cpi->use_svc || cpi->svc.set_intra_only_frame) source = vp9_svc_lookahead_pop(cpi, cpi->lookahead, flush); else source = vp9_lookahead_pop(cpi->lookahead, flush); @@ -5202,8 +6633,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->intra_only = 0; // if the flags indicate intra frame, but if the current picture is for // non-zero spatial layer, it should not be an intra picture. - if ((source->flags & VPX_EFLAG_FORCE_KF) && - cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) { + if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->use_svc && + cpi->svc.spatial_layer_id > 0) { source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF); } @@ -5227,7 +6658,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, *time_stamp = source->ts_start; *time_end = source->ts_end; *frame_flags = (source->flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; - } else { *size = 0; #if !CONFIG_REALTIME_ONLY @@ -5249,7 +6679,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // adjust frame rates based on timestamps given if (cm->show_frame) { - adjust_frame_rate(cpi, source); + if (cpi->use_svc && cpi->svc.use_set_ref_frame_config && + cpi->svc.duration[cpi->svc.spatial_layer_id] > 0) + vp9_svc_adjust_frame_rate(cpi); + else + adjust_frame_rate(cpi, source); } if (is_one_pass_cbr_svc(cpi)) { @@ -5268,24 +6702,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; - if (!cpi->use_svc && cpi->multi_arf_allowed) { - if (cm->frame_type == KEY_FRAME) { - init_buffer_indices(cpi); - } else if (oxcf->pass == 2) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - cpi->alt_fb_idx = gf_group->arf_ref_idx[gf_group->index]; - } - } - // Start with a 0 size frame. 
*size = 0; cpi->frame_flags = *frame_flags; #if !CONFIG_REALTIME_ONLY - if ((oxcf->pass == 2) && - (!cpi->use_svc || (is_two_pass_svc(cpi) && - cpi->svc.encode_empty_frame_state != ENCODING))) { + if ((oxcf->pass == 2) && !cpi->use_svc) { vp9_rc_get_second_pass_params(cpi); } else if (oxcf->pass == 1) { set_frame_size(cpi); @@ -5297,7 +6720,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, level_rc_framerate(cpi, arf_src_index); if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) { - for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; + for (i = 0; i < REFS_PER_FRAME; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; + } + + if (gf_group_index == 1 && + cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && + cpi->sf.enable_tpl_model) { + init_tpl_buffer(cpi); + vp9_estimate_qp_gop(cpi); + setup_tpl_stats(cpi); } cpi->td.mb.fp_src_pred = 0; @@ -5309,7 +6740,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, Pass0Encode(cpi, size, dest, frame_flags); } #else // !CONFIG_REALTIME_ONLY - if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) { + if (oxcf->pass == 1 && !cpi->use_svc) { const int lossless = is_lossless_requested(oxcf); #if CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.use_highbitdepth) @@ -5324,7 +6755,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif // CONFIG_VP9_HIGHBITDEPTH cpi->td.mb.inv_txfm_add = lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); - } else if (oxcf->pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) { + } else if (oxcf->pass == 2 && !cpi->use_svc) { Pass2Encode(cpi, size, dest, frame_flags); } else if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); @@ -5334,6 +6765,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } #endif // CONFIG_REALTIME_ONLY + if (cm->show_frame) cm->cur_show_frame_fb_idx = cm->new_fb_idx; + if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = *cm->fc; @@ -5416,7 +6849,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, ppflags.post_proc_flag = VP9D_DEBLOCK; ppflags.deblocking_level = 0; // not used in vp9_post_proc_frame() ppflags.noise_level = 0; // not used in vp9_post_proc_frame() - vp9_post_proc_frame(cm, pp, &ppflags); + vp9_post_proc_frame(cm, pp, &ppflags, + cpi->un_scaled_source->y_width); } #endif vpx_clear_system_state(); @@ -5462,11 +6896,11 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->summedp_quality += frame_ssim2 * weight; cpi->summedp_weights += weight; #if 0 - { + if (cm->show_frame) { FILE *f = fopen("q_used.stt", "a"); fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n", - cpi->common.current_video_frame, y2, u2, v2, - frame_psnr2, frame_ssim2); + cpi->common.current_video_frame, psnr2.psnr[1], + psnr2.psnr[2], psnr2.psnr[3], psnr2.psnr[0], frame_ssim2); fclose(f); } #endif @@ -5525,21 +6959,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif - if (is_two_pass_svc(cpi)) { - if (cpi->svc.encode_empty_frame_state == ENCODING) { - cpi->svc.encode_empty_frame_state = ENCODED; - cpi->svc.encode_intra_empty_frame = 0; - } - - if (cm->show_frame) { - ++cpi->svc.spatial_layer_to_encode; - if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) - cpi->svc.spatial_layer_to_encode = 0; - - // May need the empty frame after an visible 
frame. - cpi->svc.encode_empty_frame_state = NEED_TO_ENCODE; - } - } else if (is_one_pass_cbr_svc(cpi)) { + if (is_one_pass_cbr_svc(cpi)) { if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers) @@ -5563,7 +6983,7 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, } else { int ret; #if CONFIG_VP9_POSTPROC - ret = vp9_post_proc_frame(cm, dest, flags); + ret = vp9_post_proc_frame(cm, dest, flags, cpi->un_scaled_source->y_width); #else if (cm->frame_to_show) { *dest = *cm->frame_to_show; diff --git a/libvpx/vp9/encoder/vp9_encoder.h b/libvpx/vp9/encoder/vp9_encoder.h index d723d93cb..18adfebfe 100644 --- a/libvpx/vp9/encoder/vp9_encoder.h +++ b/libvpx/vp9/encoder/vp9_encoder.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ENCODER_H_ -#define VP9_ENCODER_VP9_ENCODER_H_ +#ifndef VPX_VP9_ENCODER_VP9_ENCODER_H_ +#define VPX_VP9_ENCODER_VP9_ENCODER_H_ #include <stdio.h> @@ -119,9 +119,10 @@ typedef enum { COMPLEXITY_AQ = 2, CYCLIC_REFRESH_AQ = 3, EQUATOR360_AQ = 4, + PSNR_AQ = 5, // AQ based on lookahead temporal // variance (only valid for altref frames) - LOOKAHEAD_AQ = 5, + LOOKAHEAD_AQ = 6, AQ_MODE_COUNT // This should always be the last member of the enum } AQ_MODE; @@ -248,6 +249,8 @@ typedef struct VP9EncoderConfig { int tile_columns; int tile_rows; + int enable_tpl_model; + int max_threads; unsigned int target_level; @@ -278,11 +281,102 @@ static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { return cfg->best_allowed_q == 0 && cfg->worst_allowed_q == 0; } +typedef struct TplDepStats { + int64_t intra_cost; + int64_t inter_cost; + int64_t mc_flow; + int64_t mc_dep_cost; + int64_t mc_ref_cost; + + int ref_frame_index; + int_mv mv; + +#if CONFIG_NON_GREEDY_MV + int ready[3]; + double mv_dist[3]; + double mv_cost[3]; + int64_t inter_cost_arr[3]; + int64_t recon_error_arr[3]; + int64_t sse_arr[3]; 
+ double feature_score; +#endif +} TplDepStats; + +#if CONFIG_NON_GREEDY_MV +#define SQUARE_BLOCK_SIZES 4 +#endif + +typedef struct TplDepFrame { + uint8_t is_valid; + TplDepStats *tpl_stats_ptr; + int stride; + int width; + int height; + int mi_rows; + int mi_cols; + int base_qindex; +#if CONFIG_NON_GREEDY_MV + double lambda; + double mv_dist_sum[3]; + double mv_cost_sum[3]; + int_mv *pyramid_mv_arr[3][SQUARE_BLOCK_SIZES]; +#endif +} TplDepFrame; + +#if CONFIG_NON_GREEDY_MV +static INLINE int get_square_block_idx(BLOCK_SIZE bsize) { + if (bsize == BLOCK_4X4) { + return 0; + } + if (bsize == BLOCK_8X8) { + return 1; + } + if (bsize == BLOCK_16X16) { + return 2; + } + if (bsize == BLOCK_32X32) { + return 3; + } + printf("ERROR: non-square block size\n"); + assert(0); + return -1; +} + +static INLINE BLOCK_SIZE square_block_idx_to_bsize(int square_block_idx) { + if (square_block_idx == 0) { + return BLOCK_4X4; + } + if (square_block_idx == 1) { + return BLOCK_8X8; + } + if (square_block_idx == 2) { + return BLOCK_16X16; + } + if (square_block_idx == 3) { + return BLOCK_32X32; + } + printf("ERROR: invalid square_block_idx\n"); + assert(0); + return BLOCK_INVALID; +} + +static INLINE int_mv *get_pyramid_mv(const TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + return &tpl_frame->pyramid_mv_arr[rf_idx][get_square_block_idx(bsize)] + [mi_row * tpl_frame->stride + mi_col]; +} +#endif + +#define TPL_DEP_COST_SCALE_LOG2 4 + // TODO(jingning) All spatially adaptive variables should go to TileDataEnc. 
typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; - int mode_map[BLOCK_SIZES][MAX_MODES]; +#if CONFIG_CONSISTENT_RECODE + int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES]; +#endif + int8_t mode_map[BLOCK_SIZES][MAX_MODES]; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; @@ -450,6 +544,23 @@ typedef struct ARNRFilterData { struct scale_factors sf; } ARNRFilterData; +typedef struct EncFrameBuf { + int mem_valid; + int released; + YV12_BUFFER_CONFIG frame; +} EncFrameBuf; + +// Maximum operating frame buffer size needed for a GOP using ARF reference. +#define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS) +#if CONFIG_NON_GREEDY_MV +typedef struct FEATURE_SCORE_LOC { + int visited; + double feature_score; + int mi_row; + int mi_col; +} FEATURE_SCORE_LOC; +#endif + typedef struct VP9_COMP { QUANTS quants; ThreadData td; @@ -473,17 +584,29 @@ typedef struct VP9_COMP { #endif YV12_BUFFER_CONFIG *raw_source_frame; + TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; + YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; + EncFrameBuf enc_frame_buf[REF_FRAMES]; +#if CONFIG_NON_GREEDY_MV + int feature_score_loc_alloc; + FEATURE_SCORE_LOC *feature_score_loc_arr; + FEATURE_SCORE_LOC **feature_score_loc_sort; + FEATURE_SCORE_LOC **feature_score_loc_heap; +#endif + TileDataEnc *tile_data; int allocated_tiles; // Keep track of memory allocated for tiles. // For a still frame, this flag is set to 1 to skip partition search. 
int partition_search_skippable_frame; - int scaled_ref_idx[MAX_REF_FRAMES]; + int scaled_ref_idx[REFS_PER_FRAME]; int lst_fb_idx; int gld_fb_idx; int alt_fb_idx; + int ref_fb_idx[REF_FRAMES]; + int refresh_last_frame; int refresh_golden_frame; int refresh_alt_ref_frame; @@ -499,7 +622,6 @@ typedef struct VP9_COMP { YV12_BUFFER_CONFIG last_frame_uf; TOKENEXTRA *tile_tok[4][1 << 6]; - uint32_t tok_count[4][1 << 6]; TOKENLIST *tplist[4][1 << 6]; // Ambient reconstruction err target for force key frames @@ -521,7 +643,7 @@ typedef struct VP9_COMP { RATE_CONTROL rc; double framerate; - int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]; + int interp_filter_selected[REF_FRAMES][SWITCHABLE]; struct vpx_codec_pkt_list *output_pkt_list; @@ -555,6 +677,7 @@ typedef struct VP9_COMP { ActiveMap active_map; fractional_mv_step_fp *find_fractional_mv_step; + struct scale_factors me_sf; vp9_diamond_search_fn_t diamond_search_sad; vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; uint64_t time_receive_data; @@ -645,10 +768,8 @@ typedef struct VP9_COMP { int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES]; - - int multi_arf_allowed; - int multi_arf_enabled; - int multi_arf_last_grp_enabled; + // Indices are: max_tx_size-1, tx_size_ctx, tx_size + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_DENOISER denoiser; @@ -723,6 +844,9 @@ typedef struct VP9_COMP { uint8_t *count_arf_frame_usage; uint8_t *count_lastgolden_frame_usage; + + int multi_layer_arf; + vpx_roi_map_t roi; } VP9_COMP; void vp9_initialize_enc(void); @@ -737,7 +861,7 @@ void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf); // frame is made and not just a copy of the pointer.. 
int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); + int64_t end_time); int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, @@ -758,9 +882,11 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, int vp9_update_entropy(VP9_COMP *cpi, int update); -int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); +int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); -int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); +int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); @@ -770,6 +896,27 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, void vp9_set_svc(VP9_COMP *cpi, int use_svc); +static INLINE int stack_pop(int *stack, int stack_size) { + int idx; + const int r = stack[0]; + for (idx = 1; idx < stack_size; ++idx) stack[idx - 1] = stack[idx]; + + return r; +} + +static INLINE int stack_top(const int *stack) { return stack[0]; } + +static INLINE void stack_push(int *stack, int new_item, int stack_size) { + int idx; + for (idx = stack_size; idx > 0; --idx) stack[idx] = stack[idx - 1]; + stack[0] = new_item; +} + +static INLINE void stack_init(int *stack, int length) { + int idx; + for (idx = 0; idx < length; ++idx) stack[idx] = -1; +} + int vp9_get_quantizer(struct VP9_COMP *cpi); static INLINE int frame_is_kf_gf_arf(const VP9_COMP *cpi) { @@ -795,6 +942,10 @@ static INLINE int get_ref_frame_buf_idx(const VP9_COMP *const cpi, return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX; } +static INLINE RefCntBuffer *get_ref_cnt_buffer(VP9_COMMON *cm, int fb_idx) { + return fb_idx != INVALID_IDX ? 
&cm->buffer_pool->frame_bufs[fb_idx] : NULL; +} + static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer( VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) { VP9_COMMON *const cm = &cpi->common; @@ -858,19 +1009,14 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags); -static INLINE int is_two_pass_svc(const struct VP9_COMP *const cpi) { - return cpi->use_svc && cpi->oxcf.pass != 0; -} - static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { return (cpi->use_svc && cpi->oxcf.pass == 0); } #if CONFIG_VP9_TEMPORAL_DENOISING static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { - return (!cpi->use_svc || - (cpi->use_svc && - cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); + return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >= + cpi->svc.first_layer_denoise)); } #endif @@ -878,9 +1024,7 @@ static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) && cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS && - (cpi->oxcf.enable_auto_arf && - (!is_two_pass_svc(cpi) || - cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id])); + cpi->oxcf.enable_auto_arf; } static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, @@ -938,6 +1082,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width, VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); +int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, + unsigned int cols, int delta_q[8], int delta_lf[8], + int skip[8], int ref_frame[8]); + void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_set_row_mt(VP9_COMP *cpi); @@ -948,4 +1096,4 @@ void vp9_set_row_mt(VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ENCODER_H_ +#endif // VPX_VP9_ENCODER_VP9_ENCODER_H_ diff --git 
a/libvpx/vp9/encoder/vp9_ethread.c b/libvpx/vp9/encoder/vp9_ethread.c index 0bd2e2145..e7f8a537d 100644 --- a/libvpx/vp9/encoder/vp9_ethread.c +++ b/libvpx/vp9/encoder/vp9_ethread.c @@ -270,19 +270,19 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, { int i; - CHECK_MEM_ERROR(cm, row_mt_sync->mutex_, - vpx_malloc(sizeof(*row_mt_sync->mutex_) * rows)); - if (row_mt_sync->mutex_) { + CHECK_MEM_ERROR(cm, row_mt_sync->mutex, + vpx_malloc(sizeof(*row_mt_sync->mutex) * rows)); + if (row_mt_sync->mutex) { for (i = 0; i < rows; ++i) { - pthread_mutex_init(&row_mt_sync->mutex_[i], NULL); + pthread_mutex_init(&row_mt_sync->mutex[i], NULL); } } - CHECK_MEM_ERROR(cm, row_mt_sync->cond_, - vpx_malloc(sizeof(*row_mt_sync->cond_) * rows)); - if (row_mt_sync->cond_) { + CHECK_MEM_ERROR(cm, row_mt_sync->cond, + vpx_malloc(sizeof(*row_mt_sync->cond) * rows)); + if (row_mt_sync->cond) { for (i = 0; i < rows; ++i) { - pthread_cond_init(&row_mt_sync->cond_[i], NULL); + pthread_cond_init(&row_mt_sync->cond[i], NULL); } } } @@ -301,17 +301,17 @@ void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) { #if CONFIG_MULTITHREAD int i; - if (row_mt_sync->mutex_ != NULL) { + if (row_mt_sync->mutex != NULL) { for (i = 0; i < row_mt_sync->rows; ++i) { - pthread_mutex_destroy(&row_mt_sync->mutex_[i]); + pthread_mutex_destroy(&row_mt_sync->mutex[i]); } - vpx_free(row_mt_sync->mutex_); + vpx_free(row_mt_sync->mutex); } - if (row_mt_sync->cond_ != NULL) { + if (row_mt_sync->cond != NULL) { for (i = 0; i < row_mt_sync->rows; ++i) { - pthread_cond_destroy(&row_mt_sync->cond_[i]); + pthread_cond_destroy(&row_mt_sync->cond[i]); } - vpx_free(row_mt_sync->cond_); + vpx_free(row_mt_sync->cond); } #endif // CONFIG_MULTITHREAD vpx_free(row_mt_sync->cur_col); @@ -327,11 +327,11 @@ void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) { const int nsync = row_mt_sync->sync_range; if (r && !(c & (nsync - 1))) { - pthread_mutex_t *const mutex = 
&row_mt_sync->mutex_[r - 1]; + pthread_mutex_t *const mutex = &row_mt_sync->mutex[r - 1]; pthread_mutex_lock(mutex); while (c > row_mt_sync->cur_col[r - 1] - nsync + 1) { - pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex); + pthread_cond_wait(&row_mt_sync->cond[r - 1], mutex); } pthread_mutex_unlock(mutex); } @@ -365,12 +365,12 @@ void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, } if (sig) { - pthread_mutex_lock(&row_mt_sync->mutex_[r]); + pthread_mutex_lock(&row_mt_sync->mutex[r]); row_mt_sync->cur_col[r] = cur; - pthread_cond_signal(&row_mt_sync->cond_[r]); - pthread_mutex_unlock(&row_mt_sync->mutex_[r]); + pthread_cond_signal(&row_mt_sync->cond[r]); + pthread_mutex_unlock(&row_mt_sync->mutex[r]); } #else (void)row_mt_sync; @@ -390,8 +390,9 @@ void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, } #if !CONFIG_REALTIME_ONLY -static int first_pass_worker_hook(EncWorkerData *const thread_data, - MultiThreadHandle *multi_thread_ctxt) { +static int first_pass_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -470,8 +471,8 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) { } } - launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook, - multi_thread_ctxt, num_workers); + launch_enc_workers(cpi, first_pass_worker_hook, multi_thread_ctxt, + num_workers); first_tile_col = &cpi->tile_data[0]; for (i = 1; i < tile_cols; i++) { @@ -480,8 +481,9 @@ void vp9_encode_fp_row_mt(VP9_COMP *cpi) { } } -static int temporal_filter_worker_hook(EncWorkerData *const thread_data, - MultiThreadHandle *multi_thread_ctxt) { +static int temporal_filter_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = 
(MultiThreadHandle *)arg2; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -508,8 +510,8 @@ static int temporal_filter_worker_hook(EncWorkerData *const thread_data, tile_col = proc_job->tile_col_id; tile_row = proc_job->tile_row_id; this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; - mb_col_start = (this_tile->tile_info.mi_col_start) >> 1; - mb_col_end = (this_tile->tile_info.mi_col_end + 1) >> 1; + mb_col_start = (this_tile->tile_info.mi_col_start) >> TF_SHIFT; + mb_col_end = (this_tile->tile_info.mi_col_end + TF_ROUND) >> TF_SHIFT; mb_row = proc_job->vert_unit_row_num; vp9_temporal_filter_iterate_row_c(cpi, thread_data->td, mb_row, @@ -553,13 +555,14 @@ void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { } } - launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook, - multi_thread_ctxt, num_workers); + launch_enc_workers(cpi, temporal_filter_worker_hook, multi_thread_ctxt, + num_workers); } #endif // !CONFIG_REALTIME_ONLY -static int enc_row_mt_worker_hook(EncWorkerData *const thread_data, - MultiThreadHandle *multi_thread_ctxt) { +static int enc_row_mt_worker_hook(void *arg1, void *arg2) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; + MultiThreadHandle *multi_thread_ctxt = (MultiThreadHandle *)arg2; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -648,8 +651,8 @@ void vp9_encode_tiles_row_mt(VP9_COMP *cpi) { } } - launch_enc_workers(cpi, (VPxWorkerHook)enc_row_mt_worker_hook, - multi_thread_ctxt, num_workers); + launch_enc_workers(cpi, enc_row_mt_worker_hook, multi_thread_ctxt, + num_workers); for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; diff --git a/libvpx/vp9/encoder/vp9_ethread.h b/libvpx/vp9/encoder/vp9_ethread.h index a396e621d..cda0293bc 100644 --- a/libvpx/vp9/encoder/vp9_ethread.h +++ 
b/libvpx/vp9/encoder/vp9_ethread.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_ETHREAD_H_ -#define VP9_ENCODER_VP9_ETHREAD_H_ +#ifndef VPX_VP9_ENCODER_VP9_ETHREAD_H_ +#define VPX_VP9_ENCODER_VP9_ETHREAD_H_ #ifdef __cplusplus extern "C" { @@ -33,8 +33,8 @@ typedef struct EncWorkerData { // Encoder row synchronization typedef struct VP9RowMTSyncData { #if CONFIG_MULTITHREAD - pthread_mutex_t *mutex_; - pthread_cond_t *cond_; + pthread_mutex_t *mutex; + pthread_cond_t *cond; #endif // Allocate memory to store the sb/mb block index in each row. int *cur_col; @@ -69,4 +69,4 @@ void vp9_temporal_filter_row_mt(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_ETHREAD_H_ +#endif // VPX_VP9_ENCODER_VP9_ETHREAD_H_ diff --git a/libvpx/vp9/encoder/vp9_extend.h b/libvpx/vp9/encoder/vp9_extend.h index c0dd75715..4ba7fc95e 100644 --- a/libvpx/vp9/encoder/vp9_extend.h +++ b/libvpx/vp9/encoder/vp9_extend.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_EXTEND_H_ -#define VP9_ENCODER_VP9_EXTEND_H_ +#ifndef VPX_VP9_ENCODER_VP9_EXTEND_H_ +#define VPX_VP9_ENCODER_VP9_EXTEND_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_integer.h" @@ -28,4 +28,4 @@ void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_EXTEND_H_ +#endif // VPX_VP9_ENCODER_VP9_EXTEND_H_ diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index fb6b132a5..8f0da48a2 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -44,14 +44,11 @@ #define COMPLEXITY_STATS_OUTPUT 0 #define FIRST_PASS_Q 10.0 -#define INTRA_MODE_PENALTY 1024 +#define NORMAL_BOOST 100 #define MIN_ARF_GF_BOOST 240 #define MIN_DECAY_FACTOR 0.01 #define NEW_MV_MODE_PENALTY 32 #define DARK_THRESH 64 -#define DEFAULT_GRP_WEIGHT 1.0 -#define RC_FACTOR_MIN 0.75 -#define RC_FACTOR_MAX 1.75 #define SECTION_NOISE_DEF 250.0 #define LOW_I_THRESH 24000 @@ -105,7 +102,7 @@ static void output_stats(FIRSTPASS_STATS *stats, fprintf(fpfile, "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" - "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf" + "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.4lf %12.0lf" "%12.4lf" "\n", stats->frame, stats->weight, stats->intra_error, stats->coded_error, @@ -316,16 +313,7 @@ void vp9_init_first_pass(VP9_COMP *cpi) { } void vp9_end_first_pass(VP9_COMP *cpi) { - if (is_two_pass_svc(cpi)) { - int i; - for (i = 0; i < cpi->svc.number_spatial_layers; ++i) { - output_stats(&cpi->svc.layer_context[i].twopass.total_stats, - cpi->output_pkt_list); - } - } else { - output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list); - } - + output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list); vpx_free(cpi->twopass.fp_mb_float_stats); cpi->twopass.fp_mb_float_stats = NULL; } @@ -503,11 +491,10 @@ static int 
scale_sse_threshold(VP9_COMMON *cm, int thresh) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = thresh; break; case VPX_BITS_10: ret_val = thresh << 4; break; - case VPX_BITS_12: ret_val = thresh << 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = thresh << 8; + break; } } #else @@ -529,11 +516,10 @@ static int get_ul_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = UL_INTRA_THRESH; break; case VPX_BITS_10: ret_val = UL_INTRA_THRESH << 2; break; - case VPX_BITS_12: ret_val = UL_INTRA_THRESH << 4; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = UL_INTRA_THRESH << 4; + break; } } #else @@ -550,11 +536,10 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = SMOOTH_INTRA_THRESH; break; case VPX_BITS_10: ret_val = SMOOTH_INTRA_THRESH << 4; break; - case VPX_BITS_12: ret_val = SMOOTH_INTRA_THRESH << 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = SMOOTH_INTRA_THRESH << 8; + break; } } #else @@ -731,9 +716,8 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, // Exclude any image dead zone if (fp_acc_data->image_data_start_row > 0) { fp_acc_data->intra_skip_count = - VPXMAX(0, - fp_acc_data->intra_skip_count - - (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + VPXMAX(0, fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); } fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; @@ -825,6 +809,8 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, fp_acc_data->image_data_start_row); } +#define NZ_MOTION_PENALTY 128 +#define INTRA_MODE_PENALTY 1024 void 
vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, FIRSTPASS_DATA *fp_acc_data, TileDataEnc *tile_data, MV *best_ref_mv, @@ -834,6 +820,8 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; TileInfo tile = tile_data->tile_info; + const int mb_col_start = ROUND_POWER_OF_TWO(tile.mi_col_start, 1); + const int mb_col_end = ROUND_POWER_OF_TWO(tile.mi_col_end, 1); struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; const PICK_MODE_CONTEXT *ctx = &td->pc_root->none; @@ -850,9 +838,6 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; - LAYER_CONTEXT *const lc = - is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] - : NULL; MODE_INFO mi_above, mi_left; double mb_intra_factor; @@ -861,29 +846,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); - assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); - - if (lc != NULL) { - // Use either last frame or alt frame for motion search. 
- if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); - if (first_ref_buf == NULL) - first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); - } - - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); - if (gld_yv12 == NULL) { - gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - } - } else { - gld_yv12 = NULL; - } - } + assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); - xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + - (tile.mi_col_start >> 1); - xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + (tile.mi_col_start >> 1); + xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + mb_col_start; + xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + mb_col_start; for (i = 0; i < MAX_MB_PLANE; ++i) { p[i].coeff = ctx->coeff_pbuf[i][1]; @@ -897,10 +863,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); // Reset above block coeffs. - recon_yoffset = - (mb_row * recon_y_stride * 16) + (tile.mi_col_start >> 1) * 16; - recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height) + - (tile.mi_col_start >> 1) * uv_mb_height; + recon_yoffset = (mb_row * recon_y_stride * 16) + mb_col_start * 16; + recon_uvoffset = + (mb_row * recon_uv_stride * uv_mb_height) + mb_col_start * uv_mb_height; // Set up limit values for motion vectors to prevent them extending // outside the UMV borders. 
@@ -908,8 +873,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, x->mv_limits.row_max = ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; - for (mb_col = tile.mi_col_start >> 1, c = 0; mb_col < (tile.mi_col_end >> 1); - ++mb_col, c++) { + for (mb_col = mb_col_start, c = 0; mb_col < mb_col_end; ++mb_col, c++) { int this_error; int this_intra_error; const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); @@ -955,7 +919,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, x->skip_encode = 0; x->fp_src_pred = 0; // Do intra prediction based on source pixels for tile boundaries - if ((mb_col == (tile.mi_col_start >> 1)) && mb_col != 0) { + if (mb_col == mb_col_start && mb_col != 0) { xd->left_mi = &mi_left; x->fp_src_pred = 1; } @@ -1002,12 +966,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, switch (cm->bit_depth) { case VPX_BITS_8: break; case VPX_BITS_10: this_error >>= 4; break; - case VPX_BITS_12: this_error >>= 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); - return; + assert(cm->bit_depth == VPX_BITS_12); + this_error >>= 8; + break; } } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -1073,30 +1035,34 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; // Other than for the first frame do a motion search. - if ((lc == NULL && cm->current_video_frame > 0) || - (lc != NULL && lc->current_video_frame_in_layer > 0)) { - int tmp_err, motion_error, raw_motion_error; + if (cm->current_video_frame > 0) { + int tmp_err, motion_error, this_motion_error, raw_motion_error; // Assume 0,0 motion with no mv overhead. 
MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; struct buf_2d unscaled_last_source_buf_2d; + vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { motion_error = highbd_get_prediction_error( bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + this_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], 8); } else { motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + this_motion_error = motion_error; } #else motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + this_motion_error = motion_error; #endif // CONFIG_VP9_HIGHBITDEPTH // Compute the motion error of the 0,0 motion using the last source // frame as the reference. Skip the further motion search on - // reconstructed frame if this error is small. + // reconstructed frame if this error is very small. unscaled_last_source_buf_2d.buf = cpi->unscaled_last_source->y_buffer + recon_yoffset; unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; @@ -1113,12 +1079,20 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, &unscaled_last_source_buf_2d); #endif // CONFIG_VP9_HIGHBITDEPTH - // TODO(pengchong): Replace the hard-coded threshold - if (raw_motion_error > 25 || lc != NULL) { + if (raw_motion_error > NZ_MOTION_PENALTY) { // Test last reference frame using the previous best mv as the // starting point (best reference) for the search. 
first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + v_fn_ptr.vf = get_block_variance_fn(bsize); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, 8); + } +#endif // CONFIG_VP9_HIGHBITDEPTH + this_motion_error = + vp9_get_mvpred_var(x, &mv, best_ref_mv, &v_fn_ptr, 0); + // If the current best reference mv is not centered on 0,0 then do a // 0,0 based search as well. if (!is_zero_mv(best_ref_mv)) { @@ -1128,13 +1102,13 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, if (tmp_err < motion_error) { motion_error = tmp_err; mv = tmp_mv; + this_motion_error = + vp9_get_mvpred_var(x, &tmp_mv, &zero_mv, &v_fn_ptr, 0); } } // Search in an older reference frame. - if (((lc == NULL && cm->current_video_frame > 1) || - (lc != NULL && lc->current_video_frame_in_layer > 1)) && - gld_yv12 != NULL) { + if ((cm->current_video_frame > 1) && gld_yv12 != NULL) { // Assume 0,0 motion with no mv overhead. 
int gf_motion_error; @@ -1316,7 +1290,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); if (this_intra_error < scaled_low_intra_thresh) { fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); - if (motion_error < scaled_low_intra_thresh) { + if (this_motion_error < scaled_low_intra_thresh) { fp_acc_data->intra_count_low += 1.0; } else { fp_acc_data->intra_count_high += 1.0; @@ -1335,7 +1309,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, recon_uvoffset += uv_mb_height; // Accumulate row level stats to the corresponding tile stats - if (cpi->row_mt && mb_col == (tile.mi_col_end >> 1) - 1) + if (cpi->row_mt && mb_col == mb_col_end - 1) accumulate_fp_mb_row_stat(tile_data, fp_acc_data); (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c, @@ -1372,9 +1346,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; - LAYER_CONTEXT *const lc = - is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] - : NULL; BufferPool *const pool = cm->buffer_pool; FIRSTPASS_DATA fp_temp_data; @@ -1386,7 +1357,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { // First pass code requires valid last and new frame buffers. 
assert(new_yv12 != NULL); - assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); + assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { @@ -1397,50 +1368,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { set_first_pass_params(cpi); vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth)); - if (lc != NULL) { - twopass = &lc->twopass; - - cpi->lst_fb_idx = cpi->svc.spatial_layer_id; - cpi->ref_frame_flags = VP9_LAST_FLAG; - - if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id < - REF_FRAMES) { - cpi->gld_fb_idx = - cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id; - cpi->ref_frame_flags |= VP9_GOLD_FLAG; - cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0); - } else { - cpi->refresh_golden_frame = 0; - } - - if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0; - - vp9_scale_references(cpi); - - // Use either last frame or alt frame for motion search. - if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); - if (first_ref_buf == NULL) - first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); - } - - if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); - if (gld_yv12 == NULL) { - gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - } - } else { - gld_yv12 = NULL; - } - - set_ref_ptrs(cm, xd, - (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE, - (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? 
GOLDEN_FRAME : NONE); - - cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, - &cpi->scaled_source, 0, EIGHTTAP, 0); - } - vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); vp9_setup_src_planes(x, cpi->Source, 0, 0); @@ -1524,18 +1451,13 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { vpx_extend_frame_borders(new_yv12); - if (lc != NULL) { - vp9_update_reference_frames(cpi); - } else { - // The frame we just compressed now becomes the last frame. - ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], - cm->new_fb_idx); - } + // The frame we just compressed now becomes the last frame. + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx], + cm->new_fb_idx); // Special case for the first frame. Copy into the GF buffer as a second // reference. - if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX && - lc == NULL) { + if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) { ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx], cm->ref_frame_map[cpi->lst_fb_idx]); } @@ -1583,7 +1505,26 @@ static double calc_correction_factor(double err_per_mb, double err_divisor, return fclamp(pow(error_term, power_term), 0.05, 5.0); } -#define ERR_DIVISOR 115.0 +static double wq_err_divisor(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + unsigned int screen_area = (cm->width * cm->height); + + // Use a different error per mb factor for calculating boost for + // different formats. + if (screen_area <= 640 * 360) { + return 115.0; + } else if (screen_area < 1280 * 720) { + return 125.0; + } else if (screen_area <= 1920 * 1080) { + return 130.0; + } else if (screen_area < 3840 * 2160) { + return 150.0; + } + + // Fall through to here only for 4K and above. 
+ return 200.0; +} + #define NOISE_FACTOR_MIN 0.9 #define NOISE_FACTOR_MAX 1.1 static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, @@ -1643,7 +1584,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, // content at the given rate. for (q = rc->best_quality; q < rc->worst_quality; ++q) { const double factor = - calc_correction_factor(av_err_per_mb, ERR_DIVISOR, q); + calc_correction_factor(av_err_per_mb, wq_err_divisor(cpi), q); const int bits_per_mb = vp9_rc_bits_per_mb( INTER_FRAME, q, factor * speed_term * cpi->twopass.bpm_factor * noise_factor, @@ -1690,14 +1631,9 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width, } void vp9_init_second_pass(VP9_COMP *cpi) { - SVC *const svc = &cpi->svc; VP9EncoderConfig *const oxcf = &cpi->oxcf; - const int is_two_pass_svc = - (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); RATE_CONTROL *const rc = &cpi->rc; - TWO_PASS *const twopass = - is_two_pass_svc ? &svc->layer_context[svc->spatial_layer_id].twopass - : &cpi->twopass; + TWO_PASS *const twopass = &cpi->twopass; double frame_rate; FIRSTPASS_STATS *stats; @@ -1774,18 +1710,9 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // encoded in the second pass is a guess. However, the sum duration is not. // It is calculated based on the actual durations of all frames from the // first pass. - - if (is_two_pass_svc) { - vp9_update_spatial_layer_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * - svc->layer_context[svc->spatial_layer_id].target_bandwidth / - 10000000.0); - } else { - vp9_new_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); - } + vp9_new_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); // This variable monitors how far behind the second ref update is lagging. 
twopass->sr_update_lag = 1; @@ -1913,10 +1840,12 @@ static int detect_flash(const TWO_PASS *twopass, int offset) { // brief break in prediction (such as a flash) but subsequent frames // are reasonably well predicted by an earlier (pre flash) frame. // The recovery after a flash is indicated by a high pcnt_second_ref - // compared to pcnt_inter. + // useage or a second ref coded error notabley lower than the last + // frame coded error. return next_frame != NULL && - next_frame->pcnt_second_ref > next_frame->pcnt_inter && - next_frame->pcnt_second_ref >= 0.5; + ((next_frame->sr_coded_error < next_frame->coded_error) || + ((next_frame->pcnt_second_ref > next_frame->pcnt_inter) && + (next_frame->pcnt_second_ref >= 0.5))); } // Update the motion related elements to the GF arf boost calculation. @@ -1971,7 +1900,20 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction); } -#define KF_BASELINE_ERR_PER_MB 12500.0 +static double kf_err_per_mb(VP9_COMP *cpi) { + const VP9_COMMON *const cm = &cpi->common; + unsigned int screen_area = (cm->width * cm->height); + + // Use a different error per mb factor for calculating boost for + // different formats. + if (screen_area < 1280 * 720) { + return 2000.0; + } else if (screen_area < 1920 * 1080) { + return 500.0; + } + return 250.0; +} + static double calc_kf_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, double *sr_accumulator, @@ -1984,7 +1926,7 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, const double active_area = calculate_active_area(cpi, this_frame); // Underlying boost factor is based on inter error ratio. - frame_boost = (KF_BASELINE_ERR_PER_MB * active_area) / + frame_boost = (kf_err_per_mb(cpi) * active_area) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); // Update the accumulator for second ref error difference. 
@@ -1997,8 +1939,11 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // Q correction and scalling - frame_boost = frame_boost * boost_q_correction; + // Q correction and scaling + // The 40.0 value here is an experimentally derived baseline minimum. + // This value is in line with the minimum per frame boost in the alt_ref + // boost calculation. + frame_boost = ((frame_boost + 40.0) * boost_q_correction); return VPXMIN(frame_boost, max_boost * boost_q_correction); } @@ -2140,7 +2085,7 @@ static int calculate_boost_bits(int frame_count, int boost, // return 0 for invalid inputs (could arise e.g. through rounding errors) if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0; - allocation_chunks = (frame_count * 100) + boost; + allocation_chunks = (frame_count * NORMAL_BOOST) + boost; // Prevent overflow. if (boost > 1023) { @@ -2154,18 +2099,6 @@ static int calculate_boost_bits(int frame_count, int boost, 0); } -// Current limit on maximum number of active arfs in a GF/ARF group. -#define MAX_ACTIVE_ARFS 2 -#define ARF_SLOT1 2 -#define ARF_SLOT2 3 -// This function indirects the choice of buffers for arfs. -// At the moment the values are fixed but this may change as part of -// the integration process with other codec features that swap buffers around. -static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { - arf_buffer_indices[0] = ARF_SLOT1; - arf_buffer_indices[1] = ARF_SLOT2; -} - // Used in corpus vbr: Calculates the total normalized group complexity score // for a given number of frames starting at the current position in the stats // file. 
@@ -2185,11 +2118,129 @@ static double calculate_group_score(VP9_COMP *cpi, double av_score, ++s; ++i; } - assert(i == frame_count); return score_total; } +static void find_arf_order(VP9_COMP *cpi, GF_GROUP *gf_group, + int *index_counter, int depth, int start, int end) { + TWO_PASS *twopass = &cpi->twopass; + const FIRSTPASS_STATS *const start_pos = twopass->stats_in; + FIRSTPASS_STATS fpf_frame; + const int mid = (start + end + 1) >> 1; + const int min_frame_interval = 2; + int idx; + + // Process regular P frames + if ((end - start < min_frame_interval) || + (depth > gf_group->allowed_max_layer_depth)) { + for (idx = start; idx <= end; ++idx) { + gf_group->update_type[*index_counter] = LF_UPDATE; + gf_group->arf_src_offset[*index_counter] = 0; + gf_group->frame_gop_index[*index_counter] = idx; + gf_group->rf_level[*index_counter] = INTER_NORMAL; + gf_group->layer_depth[*index_counter] = depth; + gf_group->gfu_boost[*index_counter] = NORMAL_BOOST; + ++(*index_counter); + } + gf_group->max_layer_depth = VPXMAX(gf_group->max_layer_depth, depth); + return; + } + + assert(abs(mid - start) >= 1 && abs(mid - end) >= 1); + + // Process ARF frame + gf_group->layer_depth[*index_counter] = depth; + gf_group->update_type[*index_counter] = ARF_UPDATE; + gf_group->arf_src_offset[*index_counter] = mid - start; + gf_group->frame_gop_index[*index_counter] = mid; + gf_group->rf_level[*index_counter] = GF_ARF_LOW; + + for (idx = 0; idx <= mid; ++idx) + if (EOF == input_stats(twopass, &fpf_frame)) break; + + gf_group->gfu_boost[*index_counter] = + VPXMAX(MIN_ARF_GF_BOOST, + calc_arf_boost(cpi, end - mid + 1, mid - start) >> depth); + + reset_fpf_position(twopass, start_pos); + + ++(*index_counter); + + find_arf_order(cpi, gf_group, index_counter, depth + 1, start, mid - 1); + + gf_group->update_type[*index_counter] = USE_BUF_FRAME; + gf_group->arf_src_offset[*index_counter] = 0; + gf_group->frame_gop_index[*index_counter] = mid; + gf_group->rf_level[*index_counter] = INTER_NORMAL; 
+ gf_group->layer_depth[*index_counter] = depth; + ++(*index_counter); + + find_arf_order(cpi, gf_group, index_counter, depth + 1, mid + 1, end); +} + +static INLINE void set_gf_overlay_frame_type(GF_GROUP *gf_group, + int frame_index, + int source_alt_ref_active) { + if (source_alt_ref_active) { + gf_group->update_type[frame_index] = OVERLAY_UPDATE; + gf_group->rf_level[frame_index] = INTER_NORMAL; + gf_group->layer_depth[frame_index] = MAX_ARF_LAYERS - 1; + gf_group->gfu_boost[frame_index] = NORMAL_BOOST; + } else { + gf_group->update_type[frame_index] = GF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = 0; + } +} + +static void define_gf_group_structure(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + TWO_PASS *const twopass = &cpi->twopass; + GF_GROUP *const gf_group = &twopass->gf_group; + int frame_index = 0; + int key_frame = cpi->common.frame_type == KEY_FRAME; + int layer_depth = 1; + int gop_frames = + rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending); + + gf_group->frame_start = cpi->common.current_video_frame; + gf_group->frame_end = gf_group->frame_start + rc->baseline_gf_interval; + gf_group->max_layer_depth = 0; + gf_group->allowed_max_layer_depth = 0; + + // For key frames the frame target rate is already set and it + // is also the golden frame. 
+ // === [frame_index == 0] === + if (!key_frame) + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_active); + + ++frame_index; + + // === [frame_index == 1] === + if (rc->source_alt_ref_pending) { + gf_group->update_type[frame_index] = ARF_UPDATE; + gf_group->rf_level[frame_index] = GF_ARF_STD; + gf_group->layer_depth[frame_index] = layer_depth; + gf_group->arf_src_offset[frame_index] = + (unsigned char)(rc->baseline_gf_interval - 1); + gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval; + gf_group->max_layer_depth = 1; + ++frame_index; + ++layer_depth; + gf_group->allowed_max_layer_depth = cpi->oxcf.enable_auto_arf; + } + + find_arf_order(cpi, gf_group, &frame_index, layer_depth, 1, gop_frames); + + set_gf_overlay_frame_type(gf_group, frame_index, rc->source_alt_ref_pending); + gf_group->arf_src_offset[frame_index] = 0; + gf_group->frame_gop_index[frame_index] = rc->baseline_gf_interval; + + // Set the frame ops number. + gf_group->gf_group_size = frame_index; +} + static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int gf_arf_bits) { VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -2198,17 +2249,12 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, GF_GROUP *const gf_group = &twopass->gf_group; FIRSTPASS_STATS frame_stats; int i; - int frame_index = 1; + int frame_index = 0; int target_frame_size; int key_frame; const int max_bits = frame_max_bits(&cpi->rc, oxcf); int64_t total_group_bits = gf_group_bits; - int mid_boost_bits = 0; int mid_frame_idx; - unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS]; - int alt_frame_index = frame_index; - int has_temporal_layers = - is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1; int normal_frames; int normal_frame_bits; int last_frame_reduction = 0; @@ -2216,71 +2262,32 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, double tot_norm_frame_score = 1.0; double this_frame_score = 1.0; - // Only encode alt 
reference frame in temporal base layer. - if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers; + // Define the GF structure and specify + int gop_frames = gf_group->gf_group_size; - key_frame = - cpi->common.frame_type == KEY_FRAME || vp9_is_upper_layer_key_frame(cpi); - - get_arf_buffer_indices(arf_buffer_indices); + key_frame = cpi->common.frame_type == KEY_FRAME; // For key frames the frame target rate is already set and it // is also the golden frame. + // === [frame_index == 0] === if (!key_frame) { - if (rc->source_alt_ref_active) { - gf_group->update_type[0] = OVERLAY_UPDATE; - gf_group->rf_level[0] = INTER_NORMAL; - gf_group->bit_allocation[0] = 0; - } else { - gf_group->update_type[0] = GF_UPDATE; - gf_group->rf_level[0] = GF_ARF_STD; - gf_group->bit_allocation[0] = gf_arf_bits; - } - gf_group->arf_update_idx[0] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[0] = arf_buffer_indices[0]; - - // Step over the golden frame / overlay frame - if (EOF == input_stats(twopass, &frame_stats)) return; + gf_group->bit_allocation[frame_index] = + rc->source_alt_ref_active ? 0 : gf_arf_bits; } // Deduct the boost bits for arf (or gf if it is not a key frame) // from the group total. if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits; + ++frame_index; + + // === [frame_index == 1] === // Store the bits to spend on the ARF if there is one. 
if (rc->source_alt_ref_pending) { - gf_group->update_type[alt_frame_index] = ARF_UPDATE; - gf_group->rf_level[alt_frame_index] = GF_ARF_STD; - gf_group->bit_allocation[alt_frame_index] = gf_arf_bits; - - if (has_temporal_layers) - gf_group->arf_src_offset[alt_frame_index] = - (unsigned char)(rc->baseline_gf_interval - - cpi->svc.number_temporal_layers); - else - gf_group->arf_src_offset[alt_frame_index] = - (unsigned char)(rc->baseline_gf_interval - 1); - - gf_group->arf_update_idx[alt_frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[alt_frame_index] = - arf_buffer_indices[cpi->multi_arf_last_grp_enabled && - rc->source_alt_ref_active]; - if (!has_temporal_layers) ++frame_index; - - if (cpi->multi_arf_enabled) { - // Set aside a slot for a level 1 arf. - gf_group->update_type[frame_index] = ARF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_LOW; - gf_group->arf_src_offset[frame_index] = - (unsigned char)((rc->baseline_gf_interval >> 1) - 1); - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[1]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; - ++frame_index; - } - } + gf_group->bit_allocation[frame_index] = gf_arf_bits; - // Note index of the first normal inter frame int eh group (not gf kf arf) - gf_group->first_inter_index = frame_index; + ++frame_index; + } // Define middle frame mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; @@ -2291,6 +2298,61 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, else normal_frame_bits = (int)total_group_bits; + gf_group->gfu_boost[1] = rc->gfu_boost; + + if (cpi->multi_layer_arf) { + int idx; + int arf_depth_bits[MAX_ARF_LAYERS] = { 0 }; + int arf_depth_count[MAX_ARF_LAYERS] = { 0 }; + int arf_depth_boost[MAX_ARF_LAYERS] = { 0 }; + int total_arfs = 1; // Account for the base layer ARF. 
+ + for (idx = 0; idx < gop_frames; ++idx) { + if (gf_group->update_type[idx] == ARF_UPDATE) { + arf_depth_boost[gf_group->layer_depth[idx]] += gf_group->gfu_boost[idx]; + ++arf_depth_count[gf_group->layer_depth[idx]]; + } + } + + for (idx = 2; idx < MAX_ARF_LAYERS; ++idx) { + if (arf_depth_boost[idx] == 0) break; + arf_depth_bits[idx] = calculate_boost_bits( + rc->baseline_gf_interval - total_arfs - arf_depth_count[idx], + arf_depth_boost[idx], total_group_bits); + + total_group_bits -= arf_depth_bits[idx]; + total_arfs += arf_depth_count[idx]; + } + + // offset the base layer arf + normal_frames -= (total_arfs - 1); + if (normal_frames > 1) + normal_frame_bits = (int)(total_group_bits / normal_frames); + else + normal_frame_bits = (int)total_group_bits; + + target_frame_size = normal_frame_bits; + target_frame_size = + clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits)); + + // The first layer ARF has its bit allocation assigned. + for (idx = frame_index; idx < gop_frames; ++idx) { + switch (gf_group->update_type[idx]) { + case ARF_UPDATE: + gf_group->bit_allocation[idx] = + (int)((arf_depth_bits[gf_group->layer_depth[idx]] * + gf_group->gfu_boost[idx]) / + arf_depth_boost[gf_group->layer_depth[idx]]); + break; + case USE_BUF_FRAME: gf_group->bit_allocation[idx] = 0; break; + default: gf_group->bit_allocation[idx] = target_frame_size; break; + } + } + gf_group->bit_allocation[idx] = 0; + + return; + } + if (oxcf->vbr_corpus_complexity) { av_score = get_distribution_av_err(cpi, twopass); tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames); @@ -2298,13 +2360,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, // Allocate bits to the other frames in the group. 
for (i = 0; i < normal_frames; ++i) { - int arf_idx = 0; if (EOF == input_stats(twopass, &frame_stats)) break; - - if (has_temporal_layers && frame_index == alt_frame_index) { - ++frame_index; - } - if (oxcf->vbr_corpus_complexity) { this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf, &frame_stats, av_score); @@ -2318,21 +2374,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, target_frame_size -= last_frame_reduction; } - if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { - mid_boost_bits += (target_frame_size >> 4); - target_frame_size -= (target_frame_size >> 4); - - if (frame_index <= mid_frame_idx) arf_idx = 1; - } - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[arf_idx]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[arf_idx]; - target_frame_size = clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits)); - gf_group->update_type[frame_index] = LF_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - gf_group->bit_allocation[frame_index] = target_frame_size; ++frame_index; } @@ -2344,27 +2388,6 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, // We need to configure the frame at the end of the sequence + 1 that will be // the start frame for the next group. Otherwise prior to the call to // vp9_rc_get_second_pass_params() the data will be undefined. - gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0]; - gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0]; - - if (rc->source_alt_ref_pending) { - gf_group->update_type[frame_index] = OVERLAY_UPDATE; - gf_group->rf_level[frame_index] = INTER_NORMAL; - - // Final setup for second arf and its overlay. 
- if (cpi->multi_arf_enabled) { - gf_group->bit_allocation[2] = - gf_group->bit_allocation[mid_frame_idx] + mid_boost_bits; - gf_group->update_type[mid_frame_idx] = OVERLAY_UPDATE; - gf_group->bit_allocation[mid_frame_idx] = 0; - } - } else { - gf_group->update_type[frame_index] = GF_UPDATE; - gf_group->rf_level[frame_index] = GF_ARF_STD; - } - - // Note whether multi-arf was enabled this group for next time. - cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled; } // Adjusts the ARNF filter for a GF group. @@ -2382,9 +2405,9 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise, } // Analyse and define a gf/arf group. -#define ARF_DECAY_BREAKOUT 0.10 #define ARF_ABS_ZOOM_THRESH 4.0 +#define MAX_GF_BOOST 5400 static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2426,6 +2449,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { const int is_key_frame = frame_is_intra_only(cm); const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active; + double gop_intra_factor = 1.0; + // Reset the GF group data structures unless this is a key // frame in which case it will already have been done. if (is_key_frame == 0) { @@ -2465,36 +2490,49 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { { int int_max_q = (int)(vp9_convert_qindex_to_q(twopass->active_worst_quality, cpi->common.bit_depth)); - int int_lbq = (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex, - cpi->common.bit_depth)); + int q_term = (cm->current_video_frame == 0) + ? 
int_max_q / 32 + : (int)(vp9_convert_qindex_to_q(rc->last_boosted_qindex, + cpi->common.bit_depth) / + 6); active_min_gf_interval = rc->min_gf_interval + arf_active_or_kf + VPXMIN(2, int_max_q / 200); active_min_gf_interval = VPXMIN(active_min_gf_interval, rc->max_gf_interval + arf_active_or_kf); - if (cpi->multi_arf_allowed) { - active_max_gf_interval = rc->max_gf_interval; - } else { - // The value chosen depends on the active Q range. At low Q we have - // bits to spare and are better with a smaller interval and smaller boost. - // At high Q when there are few bits to spare we are better with a longer - // interval to spread the cost of the GF. - active_max_gf_interval = 12 + arf_active_or_kf + VPXMIN(4, (int_lbq / 6)); - - // We have: active_min_gf_interval <= - // rc->max_gf_interval + arf_active_or_kf. - if (active_max_gf_interval < active_min_gf_interval) { - active_max_gf_interval = active_min_gf_interval; - } else { - active_max_gf_interval = VPXMIN(active_max_gf_interval, - rc->max_gf_interval + arf_active_or_kf); - } + // The value chosen depends on the active Q range. At low Q we have + // bits to spare and are better with a smaller interval and smaller boost. + // At high Q when there are few bits to spare we are better with a longer + // interval to spread the cost of the GF. + active_max_gf_interval = 11 + arf_active_or_kf + VPXMIN(5, q_term); - // Would the active max drop us out just before the near the next kf? - if ((active_max_gf_interval <= rc->frames_to_key) && - (active_max_gf_interval >= (rc->frames_to_key - rc->min_gf_interval))) - active_max_gf_interval = rc->frames_to_key / 2; + // Force max GF interval to be odd. + active_max_gf_interval = active_max_gf_interval | 0x01; + + // We have: active_min_gf_interval <= + // rc->max_gf_interval + arf_active_or_kf. 
+ if (active_max_gf_interval < active_min_gf_interval) { + active_max_gf_interval = active_min_gf_interval; + } else { + active_max_gf_interval = VPXMIN(active_max_gf_interval, + rc->max_gf_interval + arf_active_or_kf); } + + // Would the active max drop us out just before the near the next kf? + if ((active_max_gf_interval <= rc->frames_to_key) && + (active_max_gf_interval >= (rc->frames_to_key - rc->min_gf_interval))) + active_max_gf_interval = rc->frames_to_key / 2; + } + + if (cpi->multi_layer_arf) { + int layers = 0; + int max_layers = VPXMIN(MAX_ARF_LAYERS, cpi->oxcf.enable_auto_arf); + + // Adapt the intra_error factor to active_max_gf_interval limit. + for (i = active_max_gf_interval; i > 0; i >>= 1) ++layers; + + layers = VPXMIN(max_layers, layers); + gop_intra_factor += (layers * 0.25); } i = 0; @@ -2523,15 +2561,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); + // Monitor for static sections. + if ((rc->frames_since_key + i - 1) > 1) { + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + } + // Accumulate the effect of prediction quality decay. if (!flash_detected) { last_loop_decay_rate = loop_decay_rate; loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); - // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); - // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. if (detect_transition_to_still(cpi, i, 5, loop_decay_rate, @@ -2551,18 +2591,27 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Break out conditions. - if ( - // Break at active_max_gf_interval unless almost totally static. 
- ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || + // Break at maximum of active_max_gf_interval unless almost totally static. + // + // Note that the addition of a test of rc->source_alt_ref_active is + // deliberate. The effect of this is that after a normal altref group even + // if the material is static there will be one normal length GF group + // before allowing longer GF groups. The reason for this is that in cases + // such as slide shows where slides are separated by a complex transition + // such as a fade, the arf group spanning the transition may not be coded + // at a very high quality and hence this frame (with its overlay) is a + // poor golden frame to use for an extended group. + if (((i >= active_max_gf_interval) && + ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) || ( // Don't break out with a very short interval. (i >= active_min_gf_interval) && // If possible dont break very close to a kf - ((rc->frames_to_key - i) >= rc->min_gf_interval) && + ((rc->frames_to_key - i) >= rc->min_gf_interval) && (i & 0x01) && (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) || - (sr_accumulator > next_frame.intra_error)))) { + (sr_accumulator > gop_intra_factor * next_frame.intra_error)))) { break; } @@ -2573,8 +2622,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; // Should we use the alternate reference frame. - if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && - (i >= rc->min_gf_interval)) { + if ((zero_motion_accumulator < 0.995) && allow_alt_ref && + (twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && + (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { const int forward_frames = (rc->frames_to_key - i >= i - 1) ? 
i - 1 : VPXMAX(0, rc->frames_to_key - i); @@ -2582,15 +2632,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Calculate the boost for alt ref. rc->gfu_boost = calc_arf_boost(cpi, forward_frames, (i - 1)); rc->source_alt_ref_pending = 1; - - // Test to see if multi arf is appropriate. - cpi->multi_arf_enabled = - (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) && - (zero_motion_accumulator < 0.995)) - ? 1 - : 0; } else { - rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1)); + rc->gfu_boost = VPXMIN(MAX_GF_BOOST, calc_arf_boost(cpi, 0, (i - 1))); rc->source_alt_ref_pending = 0; } @@ -2601,31 +2644,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200); #endif - // Set the interval until the next gf. - rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending); - - // Only encode alt reference frame in temporal base layer. So - // baseline_gf_interval should be multiple of a temporal layer group - // (typically the frame distance between two base layer frames) - if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { - int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; - int new_gf_interval = (rc->baseline_gf_interval + count) & (~count); - int j; - for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) { - if (EOF == input_stats(twopass, this_frame)) break; - gf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); - gf_group_raw_error += this_frame->coded_error; - gf_group_noise += this_frame->frame_noise_energy; - gf_group_skip_pct += this_frame->intra_skip_pct; - gf_group_inactive_zone_rows += this_frame->inactive_zone_rows; - gf_group_inter += this_frame->pcnt_inter; - gf_group_motion += this_frame->pcnt_motion; - } - rc->baseline_gf_interval = new_gf_interval; - } - - rc->frames_till_gf_update_due = rc->baseline_gf_interval; + rc->baseline_gf_interval = i - 
rc->source_alt_ref_pending; // Reset the file position. reset_fpf_position(twopass, start_pos); @@ -2671,12 +2690,15 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Calculate the extra bits to be used for boosted frame(s) - gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost, - gf_group_bits); + gf_arf_bits = calculate_boost_bits((rc->baseline_gf_interval - 1), + rc->gfu_boost, gf_group_bits); // Adjust KF group bits and error remaining. twopass->kf_group_error_left -= gf_group_err; + // Decide GOP structure. + define_gf_group_structure(cpi); + // Allocate bits to each of the frames in the GF group. allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits); @@ -2700,17 +2722,31 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { #endif } -// Threshold for use of the lagging second reference frame. High second ref -// usage may point to a transient event like a flash or occlusion rather than -// a real scene cut. -#define SECOND_REF_USEAGE_THRESH 0.1 +// Intra / Inter threshold very low +#define VERY_LOW_II 1.5 +// Clean slide transitions we expect a sharp single frame spike in error. +#define ERROR_SPIKE 5.0 + +// Slide show transition detection. +// Tests for case where there is very low error either side of the current frame +// but much higher just for this frame. This can help detect key frames in +// slide shows even where the slides are pictures of different sizes. +// Also requires that intra and inter errors are very similar to help eliminate +// harmful false positives. +// It will not help if the transition is a fade or other multi-frame effect. 
+static int slide_transition(const FIRSTPASS_STATS *this_frame, + const FIRSTPASS_STATS *last_frame, + const FIRSTPASS_STATS *next_frame) { + return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) && + (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) && + (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE)); +} + // Minimum % intra coding observed in first pass (1.0 = 100%) #define MIN_INTRA_LEVEL 0.25 -// Minimum ratio between the % of intra coding and inter coding in the first -// pass after discounting neutral blocks (discounting neutral blocks in this -// way helps catch scene cuts in clips with very flat areas or letter box -// format clips with image padding. -#define INTRA_VS_INTER_THRESH 2.0 +// Threshold for use of the lagging second reference frame. Scene cuts do not +// usually have a high second ref useage. +#define SECOND_REF_USEAGE_THRESH 0.125 // Hard threshold where the first pass chooses intra for almost all blocks. // In such a case even if the frame is not a scene cut coding a key frame // may be a good option. @@ -2718,12 +2754,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Maximum threshold for the relative ratio of intra error score vs best // inter error score. #define KF_II_ERR_THRESHOLD 2.5 -// In real scene cuts there is almost always a sharp change in the intra -// or inter error score. -#define ERR_CHANGE_THRESHOLD 0.4 -// For real scene cuts we expect an improvment in the intra inter error -// ratio in the next frame. 
-#define II_IMPROVEMENT_THRESHOLD 3.5 #define KF_II_MAX 128.0 #define II_FACTOR 12.5 // Test for very low intra complexity which could cause false key frames @@ -2735,29 +2765,21 @@ static int test_candidate_kf(TWO_PASS *twopass, const FIRSTPASS_STATS *next_frame) { int is_viable_kf = 0; double pcnt_intra = 1.0 - this_frame->pcnt_inter; - double modified_pcnt_inter = - this_frame->pcnt_inter - this_frame->pcnt_neutral; // Does the frame satisfy the primary criteria of a key frame? // See above for an explanation of the test criteria. // If so, then examine how well it predicts subsequent frames. - if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && - (next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && + if (!detect_flash(twopass, -1) && !detect_flash(twopass, 0) && + (this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || - ((pcnt_intra > MIN_INTRA_LEVEL) && - (pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) && + (slide_transition(this_frame, last_frame, next_frame)) || + (((this_frame->coded_error > (next_frame->coded_error * 1.1)) && + (this_frame->coded_error > (last_frame->coded_error * 1.1))) && + (pcnt_intra > MIN_INTRA_LEVEL) && + ((pcnt_intra + this_frame->pcnt_neutral) > 0.5) && ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < - KF_II_ERR_THRESHOLD) && - ((fabs(last_frame->coded_error - this_frame->coded_error) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > - ERR_CHANGE_THRESHOLD) || - (fabs(last_frame->intra_error - this_frame->intra_error) / - DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > - ERR_CHANGE_THRESHOLD) || - ((next_frame->intra_error / - DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > - II_IMPROVEMENT_THRESHOLD))))) { + KF_II_ERR_THRESHOLD)))) { int i; const FIRSTPASS_STATS *start_pos = twopass->stats_in; FIRSTPASS_STATS local_next_frame = *next_frame; @@ -2815,6 +2837,7 @@ static int test_candidate_kf(TWO_PASS *twopass, 
#define FRAMES_TO_CHECK_DECAY 8 #define MIN_KF_TOT_BOOST 300 #define KF_BOOST_SCAN_MAX_FRAMES 32 +#define KF_ABS_ZOOM_THRESH 6.0 #ifdef AGGRESSIVE_VBR #define KF_MAX_FRAME_BOOST 80.0 @@ -2839,13 +2862,16 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double zero_motion_accumulator = 1.0; double boost_score = 0.0; double kf_mod_err = 0.0; + double kf_raw_err = 0.0; double kf_group_err = 0.0; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; double sr_accumulator = 0.0; + double abs_mv_in_out_accumulator = 0.0; const double av_err = get_distribution_av_err(cpi, twopass); vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; + rc->frames_since_key = 0; // Reset the GF group data structures. vp9_zero(*gf_group); @@ -2856,7 +2882,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Clear the alt ref active flag and last group multi arf flags as they // can never be set for a key frame. rc->source_alt_ref_active = 0; - cpi->multi_arf_last_grp_enabled = 0; // KF is always a GF so clear frames till next gf counter. rc->frames_till_gf_update_due = 0; @@ -2866,6 +2891,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_group_bits = 0; // Total bits available to kf group twopass->kf_group_error_left = 0.0; // Group modified error score. 
+ kf_raw_err = this_frame->intra_error; kf_mod_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); @@ -2950,18 +2976,6 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->next_key_frame_forced = 0; } - if (is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1) { - int count = (1 << (cpi->svc.number_temporal_layers - 1)) - 1; - int new_frame_to_key = (rc->frames_to_key + count) & (~count); - int j; - for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) { - if (EOF == input_stats(twopass, this_frame)) break; - kf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); - } - rc->frames_to_key = new_frame_to_key; - } - // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error. @@ -3001,13 +3015,22 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - if (i <= KF_BOOST_SCAN_MAX_FRAMES) { + // The zero motion test here insures that if we mark a kf group as static + // it is static throughout not just the first KF_BOOST_SCAN_MAX_FRAMES. + // It also allows for a larger boost on long static groups. + if ((i <= KF_BOOST_SCAN_MAX_FRAMES) || (zero_motion_accumulator >= 0.99)) { double frame_boost; double zm_factor; // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + // First frame in kf group the second ref indicator is invalid. + if (i > 0) { + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + } else { + zero_motion_accumulator = + next_frame.pcnt_inter - next_frame.pcnt_motion; + } // Factor 0.75-1.25 based on how much of frame is static. 
zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); @@ -3021,7 +3044,15 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { KF_MAX_FRAME_BOOST * zm_factor); boost_score += frame_boost; - if (frame_boost < 25.00) break; + + // Measure of zoom. Large zoom tends to indicate reduced boost. + abs_mv_in_out_accumulator += + fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion); + + if ((frame_boost < 25.00) || + (abs_mv_in_out_accumulator > KF_ABS_ZOOM_THRESH) || + (sr_accumulator > (kf_raw_err * 1.50))) + break; } else { break; } @@ -3036,10 +3067,16 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Apply various clamps for min and max boost - rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); - rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + // Special case for static / slide show content but dont apply + // if the kf group is very short. + if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { + rc->kf_boost = MAX_KF_TOT_BOOST; + } else { + // Apply various clamps for min and max boost + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); + } // Work out how many bits to allocate for the key frame itself. kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, @@ -3066,60 +3103,12 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } -// Define the reference buffers that will be updated post encode. 
-static void configure_buffer_updates(VP9_COMP *cpi) { - TWO_PASS *const twopass = &cpi->twopass; - - cpi->rc.is_src_frame_alt_ref = 0; - switch (twopass->gf_group.update_type[twopass->gf_group.index]) { - case KF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 1; - break; - case LF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 0; - cpi->refresh_alt_ref_frame = 0; - break; - case GF_UPDATE: - cpi->refresh_last_frame = 1; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 0; - break; - case OVERLAY_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 1; - cpi->refresh_alt_ref_frame = 0; - cpi->rc.is_src_frame_alt_ref = 1; - break; - case ARF_UPDATE: - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - cpi->refresh_alt_ref_frame = 1; - break; - default: assert(0); break; - } - if (is_two_pass_svc(cpi)) { - if (cpi->svc.temporal_layer_id > 0) { - cpi->refresh_last_frame = 0; - cpi->refresh_golden_frame = 0; - } - if (cpi->svc.layer_context[cpi->svc.spatial_layer_id].gold_ref_idx < 0) - cpi->refresh_golden_frame = 0; - if (cpi->alt_ref_source == NULL) cpi->refresh_alt_ref_frame = 0; - } -} - static int is_skippable_frame(const VP9_COMP *cpi) { // If the current frame does not have non-zero motion vector detected in the // first pass, and so do its previous and forward frames, then this frame // can be skipped for partition check, and the partition size is assigned // according to the variance - const SVC *const svc = &cpi->svc; - const TWO_PASS *const twopass = - is_two_pass_svc(cpi) ? 
&svc->layer_context[svc->spatial_layer_id].twopass - : &cpi->twopass; + const TWO_PASS *const twopass = &cpi->twopass; return (!frame_is_intra_only(&cpi->common) && twopass->stats_in - 2 > twopass->stats_in_start && @@ -3140,38 +3129,25 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { GF_GROUP *const gf_group = &twopass->gf_group; FIRSTPASS_STATS this_frame; - int target_rate; - LAYER_CONTEXT *const lc = - is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] - : 0; - if (!twopass->stats_in) return; // If this is an arf frame then we dont want to read the stats file or // advance the input pointer as we already have what we need. if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { int target_rate; - configure_buffer_updates(cpi); + + vp9_configure_buffer_updates(cpi, gf_group->index); + target_rate = gf_group->bit_allocation[gf_group->index]; target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); rc->base_frame_target = target_rate; cm->frame_type = INTER_FRAME; - if (lc != NULL) { - if (cpi->svc.spatial_layer_id == 0) { - lc->is_key_frame = 0; - } else { - lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; - - if (lc->is_key_frame) cpi->ref_frame_flags &= (~VP9_LAST_FLAG); - } - } - // Do the firstpass stats indicate that this frame is skippable for the // partition search? if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - (!cpi->use_svc || is_two_pass_svc(cpi))) { + !cpi->use_svc) { cpi->partition_search_skippable_frame = is_skippable_frame(cpi); } @@ -3182,12 +3158,9 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { if (cpi->oxcf.rc_mode == VPX_Q) { twopass->active_worst_quality = cpi->oxcf.cq_level; - } else if (cm->current_video_frame == 0 || - (lc != NULL && lc->current_video_frame_in_layer == 0)) { + } else if (cm->current_video_frame == 0) { const int frames_left = - (int)(twopass->total_stats.count - - ((lc != NULL) ? 
lc->current_video_frame_in_layer - : cm->current_video_frame)); + (int)(twopass->total_stats.count - cm->current_video_frame); // Special case code for first frame. const int section_target_bandwidth = (int)(twopass->bits_left / frames_left); @@ -3236,59 +3209,36 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { cm->frame_type = INTER_FRAME; } - if (lc != NULL) { - if (cpi->svc.spatial_layer_id == 0) { - lc->is_key_frame = (cm->frame_type == KEY_FRAME); - if (lc->is_key_frame) { - cpi->ref_frame_flags &= - (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); - lc->frames_from_key_frame = 0; - // Encode an intra only empty frame since we have a key frame. - cpi->svc.encode_intra_empty_frame = 1; - } - } else { - cm->frame_type = INTER_FRAME; - lc->is_key_frame = cpi->svc.layer_context[0].is_key_frame; - - if (lc->is_key_frame) { - cpi->ref_frame_flags &= (~VP9_LAST_FLAG); - lc->frames_from_key_frame = 0; - } - } - } - // Define a new GF/ARF group. (Should always enter here for key frames). if (rc->frames_till_gf_update_due == 0) { define_gf_group(cpi, &this_frame); rc->frames_till_gf_update_due = rc->baseline_gf_interval; - if (lc != NULL) cpi->refresh_golden_frame = 1; #if ARF_STATS_OUTPUT { FILE *fpfile; fpfile = fopen("arf.stt", "a"); ++arf_count; - fprintf(fpfile, "%10d %10ld %10d %10d %10ld\n", cm->current_video_frame, - rc->frames_till_gf_update_due, rc->kf_boost, arf_count, - rc->gfu_boost); + fprintf(fpfile, "%10d %10ld %10d %10d %10ld %10ld\n", + cm->current_video_frame, rc->frames_till_gf_update_due, + rc->kf_boost, arf_count, rc->gfu_boost, cm->frame_type); fclose(fpfile); } #endif } - configure_buffer_updates(cpi); + vp9_configure_buffer_updates(cpi, gf_group->index); // Do the firstpass stats indicate that this frame is skippable for the // partition search? 
if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2 && - (!cpi->use_svc || is_two_pass_svc(cpi))) { + !cpi->use_svc) { cpi->partition_search_skippable_frame = is_skippable_frame(cpi); } - target_rate = gf_group->bit_allocation[gf_group->index]; - rc->base_frame_target = target_rate; + rc->base_frame_target = gf_group->bit_allocation[gf_group->index]; // The multiplication by 256 reverses a scaling factor of (>> 8) // applied when combining MB error values for the frame. @@ -3329,8 +3279,7 @@ void vp9_twopass_postencode_update(VP9_COMP *cpi) { rc->rate_error_estimate = 0; } - if (cpi->common.frame_type != KEY_FRAME && - !vp9_is_upper_layer_key_frame(cpi)) { + if (cpi->common.frame_type != KEY_FRAME) { twopass->kf_group_bits -= bits_used; twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct; } diff --git a/libvpx/vp9/encoder/vp9_firstpass.h b/libvpx/vp9/encoder/vp9_firstpass.h index 000ecd779..0807097ac 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.h +++ b/libvpx/vp9/encoder/vp9_firstpass.h @@ -8,8 +8,10 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_FIRSTPASS_H_ -#define VP9_ENCODER_VP9_FIRSTPASS_H_ +#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ +#define VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ + +#include <assert.h> #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -41,6 +43,8 @@ typedef struct { #define INVALID_ROW -1 +#define MAX_ARF_LAYERS 6 + typedef struct { double frame_mb_intra_factor; double frame_mb_brightness_factor; @@ -107,7 +111,9 @@ typedef enum { GF_UPDATE = 2, ARF_UPDATE = 3, OVERLAY_UPDATE = 4, - FRAME_UPDATE_TYPES = 5 + MID_OVERLAY_UPDATE = 5, + USE_BUF_FRAME = 6, // Use show existing frame, no ref buffer update + FRAME_UPDATE_TYPES = 7 } FRAME_UPDATE_TYPE; #define FC_ANIMATION_THRESH 0.15 @@ -119,13 +125,23 @@ typedef enum { typedef struct { unsigned char index; - unsigned char first_inter_index; - RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1]; - FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1]; - unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1]; - int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1]; + RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 2]; + FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char layer_depth[MAX_STATIC_GF_GROUP_LENGTH + 2]; + unsigned char frame_gop_index[MAX_STATIC_GF_GROUP_LENGTH + 2]; + int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 2]; + int gfu_boost[MAX_STATIC_GF_GROUP_LENGTH + 2]; + + int frame_start; + int frame_end; + // TODO(jingning): The array size of arf_stack could be reduced. 
+ int arf_index_stack[MAX_LAG_BUFFERS * 2]; + int top_arf_idx; + int stack_size; + int gf_group_size; + int max_layer_depth; + int allowed_max_layer_depth; } GF_GROUP; typedef struct { @@ -182,7 +198,6 @@ struct ThreadData; struct TileDataEnc; void vp9_init_first_pass(struct VP9_COMP *cpi); -void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source); void vp9_end_first_pass(struct VP9_COMP *cpi); @@ -194,7 +209,6 @@ void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi, void vp9_init_second_pass(struct VP9_COMP *cpi); void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); -void vp9_twopass_postencode_update(struct VP9_COMP *cpi); // Post encode update of the rate control parameters for 2-pass void vp9_twopass_postencode_update(struct VP9_COMP *cpi); @@ -206,4 +220,4 @@ void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_FIRSTPASS_H_ +#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_H_ diff --git a/libvpx/vp9/encoder/vp9_job_queue.h b/libvpx/vp9/encoder/vp9_job_queue.h index 89c08f207..ad09c1119 100644 --- a/libvpx/vp9/encoder/vp9_job_queue.h +++ b/libvpx/vp9/encoder/vp9_job_queue.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_JOB_QUEUE_H_ -#define VP9_ENCODER_VP9_JOB_QUEUE_H_ +#ifndef VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ +#define VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ typedef enum { FIRST_PASS_JOB, @@ -43,4 +43,4 @@ typedef struct { int num_jobs_acquired; } JobQueueHandle; -#endif // VP9_ENCODER_VP9_JOB_QUEUE_H_ +#endif // VPX_VP9_ENCODER_VP9_JOB_QUEUE_H_ diff --git a/libvpx/vp9/encoder/vp9_lookahead.h b/libvpx/vp9/encoder/vp9_lookahead.h index 88be0ffcd..c627bede2 100644 --- a/libvpx/vp9/encoder/vp9_lookahead.h +++ b/libvpx/vp9/encoder/vp9_lookahead.h @@ -8,17 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_LOOKAHEAD_H_ -#define VP9_ENCODER_VP9_LOOKAHEAD_H_ +#ifndef VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ +#define VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ #include "vpx_scale/yv12config.h" #include "vpx/vpx_encoder.h" #include "vpx/vpx_integer.h" -#if CONFIG_SPATIAL_SVC -#include "vpx/vp8cx.h" -#endif - #ifdef __cplusplus extern "C" { #endif @@ -115,4 +111,4 @@ unsigned int vp9_lookahead_depth(struct lookahead_ctx *ctx); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_LOOKAHEAD_H_ +#endif // VPX_VP9_ENCODER_VP9_LOOKAHEAD_H_ diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c index 46d626def..831c79c17 100644 --- a/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libvpx/vp9/encoder/vp9_mbgraph.c @@ -57,11 +57,12 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, const MV *ref_mv, { uint32_t distortion; uint32_t sse; + // TODO(yunqing): may use higher tap interp filter than 2 taps if needed. cpi->find_fractional_mv_step( x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, - &v_fn_ptr, 0, mv_sf->subpel_iters_per_step, + &v_fn_ptr, 0, mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, - 0); + 0, USE_2_TAPS); } xd->mi[0]->mode = NEWMV; diff --git a/libvpx/vp9/encoder/vp9_mbgraph.h b/libvpx/vp9/encoder/vp9_mbgraph.h index df2fb98ef..7b629861d 100644 --- a/libvpx/vp9/encoder/vp9_mbgraph.h +++ b/libvpx/vp9/encoder/vp9_mbgraph.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_MBGRAPH_H_ -#define VP9_ENCODER_VP9_MBGRAPH_H_ +#ifndef VPX_VP9_ENCODER_VP9_MBGRAPH_H_ +#define VPX_VP9_ENCODER_VP9_MBGRAPH_H_ #ifdef __cplusplus extern "C" { @@ -25,7 +25,9 @@ typedef struct { } ref[MAX_REF_FRAMES]; } MBGRAPH_MB_STATS; -typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS; +typedef struct { + MBGRAPH_MB_STATS *mb_stats; +} MBGRAPH_FRAME_STATS; struct VP9_COMP; @@ -35,4 +37,4 @@ void vp9_update_mbgraph_stats(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_MBGRAPH_H_ +#endif // VPX_VP9_ENCODER_VP9_MBGRAPH_H_ diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c index 44f01be25..5a6717ab2 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libvpx/vp9/encoder/vp9_mcomp.c @@ -263,27 +263,6 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { } \ } -// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of -// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten -// later in the same way. 
-#define SECOND_LEVEL_CHECKS_BEST \ - { \ - unsigned int second; \ - int br0 = br; \ - int bc0 = bc; \ - assert(tr == br || tc == bc); \ - if (tr == br && tc != bc) { \ - kc = bc - tc; \ - } else if (tr != br && tc == bc) { \ - kr = br - tr; \ - } \ - CHECK_BETTER(second, br0 + kr, bc0); \ - CHECK_BETTER(second, br0, bc0 + kc); \ - if (br0 != br || bc0 != bc) { \ - CHECK_BETTER(second, br0 + kr, bc0 + kc); \ - } \ - } - #define SETUP_SUBPEL_SEARCH \ const uint8_t *const z = x->plane[0].src.buf; \ const int src_stride = x->plane[0].src.stride; \ @@ -329,8 +308,8 @@ static unsigned int setup_center_error( if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); - vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, - y_stride); + vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w, + h, CONVERT_TO_SHORTPTR(y + offset), y_stride); besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { @@ -388,14 +367,12 @@ static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { *ir = (int)divide_and_round(x1 * b, y1); } -uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, - const MV *ref_mv, int allow_hp, - int error_per_bit, - const vp9_variance_fn_ptr_t *vfp, - int forced_stop, int iters_per_step, - int *cost_list, int *mvjcost, int *mvcost[2], - uint32_t *distortion, uint32_t *sse1, - const uint8_t *second_pred, int w, int h) { +uint32_t vp9_skip_sub_pixel_tree( + const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, + int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, + int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], + uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, 
second_pred, w, h, @@ -418,6 +395,7 @@ uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, (void)sse; (void)thismse; (void)cost_list; + (void)use_accurate_subpel_search; return besterr; } @@ -427,7 +405,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, @@ -439,6 +417,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( (void)allow_hp; (void)forced_stop; (void)hstep; + (void)use_accurate_subpel_search; if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX && cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && @@ -492,8 +471,10 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_more( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); @@ -552,8 +533,10 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { SETUP_SUBPEL_SEARCH; + (void)use_accurate_subpel_search; + besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp, z, src_stride, y, 
y_stride, second_pred, w, h, offset, mvjcost, mvcost, sse1, distortion); @@ -638,12 +621,119 @@ static const MV search_step_table[12] = { }; /* clang-format on */ +static int accurate_sub_pel_search( + const MACROBLOCKD *xd, const MV *this_mv, const struct scale_factors *sf, + const InterpKernel *kernel, const vp9_variance_fn_ptr_t *vfp, + const uint8_t *const src_address, const int src_stride, + const uint8_t *const pre_address, int y_stride, const uint8_t *second_pred, + int w, int h, uint32_t *sse) { +#if CONFIG_VP9_HIGHBITDEPTH + uint64_t besterr; + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h != 0); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + DECLARE_ALIGNED(16, uint16_t, pred16[64 * 64]); + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(pre_address), y_stride, + pred16, w, this_mv, sf, w, h, 0, kernel, + MV_PRECISION_Q3, 0, 0, xd->bd); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]); + vpx_highbd_comp_avg_pred(comp_pred16, CONVERT_TO_SHORTPTR(second_pred), w, + h, pred16, w); + besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src_address, + src_stride, sse); + } else { + besterr = + vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src_address, src_stride, sse); + } + } else { + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + } + if (besterr >= UINT_MAX) return UINT_MAX; + return (int)besterr; +#else + int besterr; + DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); + assert(sf->x_step_q4 == 16 && sf->y_step_q4 == 16); + assert(w != 0 && h != 0); + (void)xd; + + vp9_build_inter_predictor(pre_address, 
y_stride, pred, w, this_mv, sf, w, h, + 0, kernel, MV_PRECISION_Q3, 0, 0); + if (second_pred != NULL) { + DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); + besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); + } else { + besterr = vfp->vf(pred, w, src_address, src_stride, sse); + } + return besterr; +#endif // CONFIG_VP9_HIGHBITDEPTH +} + +// TODO(yunqing): this part can be further refactored. +#if CONFIG_VP9_HIGHBITDEPTH +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + int64_t tmpmse; \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = \ + accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ + y, y_stride, second_pred, w, h, &sse); \ + tmpmse = thismse; \ + tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + if (tmpmse >= INT_MAX) { \ + v = INT_MAX; \ + } else if ((v = (uint32_t)tmpmse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } +#else +/* checks if (r, c) has better score than previous best */ +#define CHECK_BETTER1(v, r, c) \ + if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ + const MV mv = { r, c }; \ + const MV ref_mv = { rr, rc }; \ + thismse = \ + accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \ + y, y_stride, second_pred, w, h, &sse); \ + if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + thismse) < besterr) { \ + besterr = v; \ + br = r; \ + bc = c; \ + *distortion = thismse; \ + *sse1 = sse; \ + } \ + } else { \ + v = INT_MAX; \ + } + +#endif + uint32_t vp9_find_best_sub_pixel_tree( const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, 
int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { const uint8_t *const z = x->plane[0].src.buf; const uint8_t *const src_address = z; const int src_stride = x->plane[0].src.stride; @@ -671,6 +761,17 @@ uint32_t vp9_find_best_sub_pixel_tree( int kr, kc; MvLimits subpel_mv_limits; + // TODO(yunqing): need to add 4-tap filter optimization to speed up the + // encoder. + const InterpKernel *kernel = + (use_accurate_subpel_search > 0) + ? ((use_accurate_subpel_search == USE_4_TAPS) + ? vp9_filter_kernels[FOURTAP] + : ((use_accurate_subpel_search == USE_8_TAPS) + ? vp9_filter_kernels[EIGHTTAP] + : vp9_filter_kernels[EIGHTTAP_SHARP])) + : vp9_filter_kernels[BILINEAR]; + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); minc = subpel_mv_limits.col_min; maxc = subpel_mv_limits.col_max; @@ -695,16 +796,25 @@ uint32_t vp9_find_best_sub_pixel_tree( tr = br + search_step[idx].row; tc = bc + search_step[idx].col; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); MV this_mv; this_mv.row = tr; this_mv.col = tc; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), - src_address, src_stride, &sse, second_pred); + + if (use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, + y_stride, second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = + y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + cost_array[idx] = thismse + 
mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -726,14 +836,21 @@ uint32_t vp9_find_best_sub_pixel_tree( tc = bc + kc; tr = br + kr; if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) { - const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); MV this_mv = { tr, tc }; - if (second_pred == NULL) - thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse); - else - thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), src_address, - src_stride, &sse, second_pred); + if (use_accurate_subpel_search) { + thismse = accurate_sub_pel_search(xd, &this_mv, x->me_sf, kernel, vfp, + src_address, src_stride, y, y_stride, + second_pred, w, h, &sse); + } else { + const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3); + if (second_pred == NULL) + thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr), src_address, + src_stride, &sse); + else + thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr), + src_address, src_stride, &sse, second_pred); + } + cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); @@ -755,10 +872,48 @@ uint32_t vp9_find_best_sub_pixel_tree( bc = tc; } - if (iters_per_step > 1 && best_idx != -1) SECOND_LEVEL_CHECKS_BEST; + if (iters_per_step > 0 && best_idx != -1) { + unsigned int second; + const int br0 = br; + const int bc0 = bc; + assert(tr == br || tc == bc); + + if (tr == br && tc != bc) { + kc = bc - tc; + if (iters_per_step == 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0, bc0 + kc); + } else { + CHECK_BETTER(second, br0, bc0 + kc); + } + } + } else if (tr != br && tc == bc) { + kr = br - tr; + if (iters_per_step == 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + } else { + CHECK_BETTER(second, br0 + kr, bc0); + } + } + } - tr = br; - tc = bc; + if (iters_per_step > 1) { + if (use_accurate_subpel_search) { + CHECK_BETTER1(second, br0 + kr, bc0); + 
CHECK_BETTER1(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { + CHECK_BETTER1(second, br0 + kr, bc0 + kc); + } + } else { + CHECK_BETTER(second, br0 + kr, bc0); + CHECK_BETTER(second, br0, bc0 + kc); + if (br0 != br || bc0 != bc) { + CHECK_BETTER(second, br0 + kr, bc0 + kc); + } + } + } + } search_step += 4; hstep >>= 1; @@ -780,6 +935,7 @@ uint32_t vp9_find_best_sub_pixel_tree( } #undef CHECK_BETTER +#undef CHECK_BETTER1 static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col, int range) { @@ -1576,6 +1732,183 @@ static int exhuastive_mesh_search(const MACROBLOCK *x, MV *ref_mv, MV *best_mv, return best_sad; } +#if CONFIG_NON_GREEDY_MV +double vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_mvs, + int mv_num) { + int i; + int update = 0; + double best_cost = 0; + vpx_clear_system_state(); + for (i = 0; i < mv_num; ++i) { + if (nb_mvs[i].as_int != INVALID_MV) { + MV nb_mv = nb_mvs[i].as_mv; + const double row_diff = mv->row - nb_mv.row; + const double col_diff = mv->col - nb_mv.col; + double cost = row_diff * row_diff + col_diff * col_diff; + cost = log2(1 + cost); + if (update == 0) { + best_cost = cost; + update = 1; + } else { + best_cost = cost < best_cost ? 
cost : best_cost; + } + } + } + return best_cost; +} + +double vp9_diamond_search_sad_new(const MACROBLOCK *x, + const search_site_config *cfg, + const MV *init_full_mv, MV *best_full_mv, + double *best_mv_dist, double *best_mv_cost, + int search_param, double lambda, int *num00, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num) { + int i, j, step; + + const MACROBLOCKD *const xd = &x->e_mbd; + uint8_t *what = x->plane[0].src.buf; + const int what_stride = x->plane[0].src.stride; + const uint8_t *in_what; + const int in_what_stride = xd->plane[0].pre[0].stride; + const uint8_t *best_address; + + double bestsad; + int best_site = -1; + int last_site = -1; + + // search_param determines the length of the initial step and hence the number + // of iterations. + // 0 = initial step (MAX_FIRST_STEP) pel + // 1 = (MAX_FIRST_STEP/2) pel, + // 2 = (MAX_FIRST_STEP/4) pel... + // const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step]; + const MV *ss_mv = &cfg->ss_mv[search_param * cfg->searches_per_step]; + const intptr_t *ss_os = &cfg->ss_os[search_param * cfg->searches_per_step]; + const int tot_steps = cfg->total_steps - search_param; + vpx_clear_system_state(); + + *best_full_mv = *init_full_mv; + clamp_mv(best_full_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + *num00 = 0; + + // Work out the start point for the search + in_what = xd->plane[0].pre[0].buf + best_full_mv->row * in_what_stride + + best_full_mv->col; + best_address = in_what; + + // Check the starting position + *best_mv_dist = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); + *best_mv_cost = + vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num); + bestsad = (*best_mv_dist) + lambda * (*best_mv_cost); + + i = 0; + + for (step = 0; step < tot_steps; step++) { + int all_in = 1, t; + + // All_in is true if every one of the points we are checking are within + // the bounds of the image. 
+ all_in &= ((best_full_mv->row + ss_mv[i].row) > x->mv_limits.row_min); + all_in &= ((best_full_mv->row + ss_mv[i + 1].row) < x->mv_limits.row_max); + all_in &= ((best_full_mv->col + ss_mv[i + 2].col) > x->mv_limits.col_min); + all_in &= ((best_full_mv->col + ss_mv[i + 3].col) < x->mv_limits.col_max); + + // If all the pixels are within the bounds we don't check whether the + // search point is valid in this loop, otherwise we check each point + // for validity.. + if (all_in) { + unsigned int sad_array[4]; + + for (j = 0; j < cfg->searches_per_step; j += 4) { + unsigned char const *block_offset[4]; + + for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; + + fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); + + for (t = 0; t < 4; t++, i++) { + const MV this_mv = { best_full_mv->row + ss_mv[i].row, + best_full_mv->col + ss_mv[i].col }; + const double mv_dist = sad_array[t]; + const double mv_cost = + vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num); + double thissad = mv_dist + lambda * mv_cost; + if (thissad < bestsad) { + bestsad = thissad; + *best_mv_dist = mv_dist; + *best_mv_cost = mv_cost; + best_site = i; + } + } + } + } else { + for (j = 0; j < cfg->searches_per_step; j++) { + // Trap illegal vectors + const MV this_mv = { best_full_mv->row + ss_mv[i].row, + best_full_mv->col + ss_mv[i].col }; + + if (is_mv_in(&x->mv_limits, &this_mv)) { + const uint8_t *const check_here = ss_os[i] + best_address; + const double mv_dist = + fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + const double mv_cost = + vp9_nb_mvs_inconsistency(&this_mv, nb_full_mvs, full_mv_num); + double thissad = mv_dist + lambda * mv_cost; + if (thissad < bestsad) { + bestsad = thissad; + *best_mv_dist = mv_dist; + *best_mv_cost = mv_cost; + best_site = i; + } + } + i++; + } + } + if (best_site != last_site) { + best_full_mv->row += ss_mv[best_site].row; + best_full_mv->col += ss_mv[best_site].col; + best_address += 
ss_os[best_site]; + last_site = best_site; + } else if (best_address == in_what) { + (*num00)++; + } + } + return bestsad; +} + +void vp9_prepare_nb_full_mvs(const TplDepFrame *tpl_frame, int mi_row, + int mi_col, int rf_idx, BLOCK_SIZE bsize, + int_mv *nb_full_mvs) { + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int dirs[NB_MVS_NUM][2] = { { -1, 0 }, { 0, -1 }, { 1, 0 }, { 0, 1 } }; + int i; + for (i = 0; i < NB_MVS_NUM; ++i) { + int r = dirs[i][0] * mi_height; + int c = dirs[i][1] * mi_width; + if (mi_row + r >= 0 && mi_row + r < tpl_frame->mi_rows && mi_col + c >= 0 && + mi_col + c < tpl_frame->mi_cols) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[(mi_row + r) * tpl_frame->stride + mi_col + c]; + int_mv *mv = + get_pyramid_mv(tpl_frame, rf_idx, bsize, mi_row + r, mi_col + c); + if (tpl_ptr->ready[rf_idx]) { + nb_full_mvs[i].as_mv = get_full_mv(&mv->as_mv); + } else { + nb_full_mvs[i].as_int = INVALID_MV; + } + } else { + nb_full_mvs[i].as_int = INVALID_MV; + } + } +} +#endif // CONFIG_NON_GREEDY_MV + int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, @@ -1785,12 +2118,15 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { } static const MV search_pos[4] = { - { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 }, + { -1, 0 }, + { 0, -1 }, + { 0, 1 }, + { 1, 0 }, }; unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col) { + int mi_col, const MV *ref_mv) { MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; @@ -1812,6 +2148,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, const int norm_factor = 3 + (bw >> 5); const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + 
MvLimits subpel_mv_limits; if (scaled_ref_frame) { int i; @@ -1876,7 +2213,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, { const uint8_t *const pos[4] = { - ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride, + ref_buf - ref_stride, + ref_buf - 1, + ref_buf + 1, + ref_buf + ref_stride, }; cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad); @@ -1911,6 +2251,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, tmp_mv->row *= 8; tmp_mv->col *= 8; + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max, + subpel_mv_limits.row_min, subpel_mv_limits.row_max); + if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; @@ -1919,6 +2263,74 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, return best_sad; } +#if CONFIG_NON_GREEDY_MV +// Runs sequence of diamond searches in smaller steps for RD. +/* do_refine: If last step (1-away) of n-step search doesn't pick the center + point as the best match, we will do a final 1-away diamond + refining search */ +double vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, double lambda, + int do_refine, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num, + MV *best_mv, double *best_mv_dist, + double *best_mv_cost) { + int n, num00 = 0; + double thissme; + double bestsme; + const int further_steps = MAX_MVSEARCH_STEPS - 1 - step_param; + vpx_clear_system_state(); + bestsme = vp9_diamond_search_sad_new( + x, &cpi->ss_cfg, mvp_full, best_mv, best_mv_dist, best_mv_cost, + step_param, lambda, &n, fn_ptr, nb_full_mvs, full_mv_num); + + // If there won't be more n-step search, check to see if refining search is + // needed. 
+ if (n > further_steps) do_refine = 0; + + while (n < further_steps) { + ++n; + if (num00) { + num00--; + } else { + MV temp_mv; + double mv_dist; + double mv_cost; + thissme = vp9_diamond_search_sad_new( + x, &cpi->ss_cfg, mvp_full, &temp_mv, &mv_dist, &mv_cost, + step_param + n, lambda, &num00, fn_ptr, nb_full_mvs, full_mv_num); + // check to see if refining search is needed. + if (num00 > further_steps - n) do_refine = 0; + + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = temp_mv; + *best_mv_dist = mv_dist; + *best_mv_cost = mv_cost; + } + } + } + + // final 1-away diamond refining search + if (do_refine) { + const int search_range = 8; + MV temp_mv = *best_mv; + double mv_dist; + double mv_cost; + thissme = vp9_refining_search_sad_new(x, &temp_mv, &mv_dist, &mv_cost, + lambda, search_range, fn_ptr, + nb_full_mvs, full_mv_num); + if (thissme < bestsme) { + bestsme = thissme; + *best_mv = temp_mv; + *best_mv_dist = mv_dist; + *best_mv_cost = mv_cost; + } + } + return bestsme; +} +#endif // CONFIG_NON_GREEDY_MV + // Runs sequence of diamond searches in smaller steps for RD. 
/* do_refine: If last step (1-away) of n-step search doesn't pick the center point as the best match, we will do a final 1-away diamond @@ -2042,6 +2454,90 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, return bestsme; } +#if CONFIG_NON_GREEDY_MV +double vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, + double *best_mv_dist, double *best_mv_cost, + double lambda, int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num) { + const MACROBLOCKD *const xd = &x->e_mbd; + const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; + const struct buf_2d *const what = &x->plane[0].src; + const struct buf_2d *const in_what = &xd->plane[0].pre[0]; + const uint8_t *best_address = get_buf_from_mv(in_what, best_full_mv); + double best_sad; + int i, j; + vpx_clear_system_state(); + *best_mv_dist = + fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride); + *best_mv_cost = + vp9_nb_mvs_inconsistency(best_full_mv, nb_full_mvs, full_mv_num); + best_sad = (*best_mv_dist) + lambda * (*best_mv_cost); + + for (i = 0; i < search_range; i++) { + int best_site = -1; + const int all_in = ((best_full_mv->row - 1) > x->mv_limits.row_min) & + ((best_full_mv->row + 1) < x->mv_limits.row_max) & + ((best_full_mv->col - 1) > x->mv_limits.col_min) & + ((best_full_mv->col + 1) < x->mv_limits.col_max); + + if (all_in) { + unsigned int sads[4]; + const uint8_t *const positions[4] = { best_address - in_what->stride, + best_address - 1, best_address + 1, + best_address + in_what->stride }; + + fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); + + for (j = 0; j < 4; ++j) { + const MV mv = { best_full_mv->row + neighbors[j].row, + best_full_mv->col + neighbors[j].col }; + const double mv_dist = sads[j]; + const double mv_cost = + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + const double thissad = mv_dist + lambda * mv_cost; + if (thissad < best_sad) { + best_sad 
= thissad; + *best_mv_dist = mv_dist; + *best_mv_cost = mv_cost; + best_site = j; + } + } + } else { + for (j = 0; j < 4; ++j) { + const MV mv = { best_full_mv->row + neighbors[j].row, + best_full_mv->col + neighbors[j].col }; + + if (is_mv_in(&x->mv_limits, &mv)) { + const double mv_dist = + fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); + const double mv_cost = + vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num); + const double thissad = mv_dist + lambda * mv_cost; + if (thissad < best_sad) { + best_sad = thissad; + *best_mv_dist = mv_dist; + *best_mv_cost = mv_cost; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + best_full_mv->row += neighbors[best_site].row; + best_full_mv->col += neighbors[best_site].col; + best_address = get_buf_from_mv(in_what, best_full_mv); + } + } + + return best_sad; +} +#endif // CONFIG_NON_GREEDY_MV + int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const vp9_variance_fn_ptr_t *fn_ptr, @@ -2175,6 +2671,8 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const SEARCH_METHODS method = (SEARCH_METHODS)search_method; vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize]; int var = 0; + int run_exhaustive_search = 0; + if (cost_list) { cost_list[0] = INT_MAX; cost_list[1] = INT_MAX; @@ -2205,35 +2703,38 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, fn_ptr, 1, ref_mv, tmp_mv); break; case NSTEP: + case MESH: var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, MAX_MVSEARCH_STEPS - 1 - step_param, 1, cost_list, fn_ptr, ref_mv, tmp_mv); - - // Should we allow a follow on exhaustive search? 
- if ((sf->exhaustive_searches_thresh < INT_MAX) && - !cpi->rc.is_src_frame_alt_ref) { - int64_t exhuastive_thr = sf->exhaustive_searches_thresh; - exhuastive_thr >>= - 8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - - // Threshold variance for an exhaustive full search. - if (var > exhuastive_thr) { - int var_ex; - MV tmp_mv_ex; - var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, - cost_list, fn_ptr, ref_mv, &tmp_mv_ex); - - if (var_ex < var) { - var = var_ex; - *tmp_mv = tmp_mv_ex; - } - } - } break; - default: assert(0 && "Invalid search method."); + default: assert(0 && "Unknown search method"); + } + + if (method == NSTEP) { + if (sf->exhaustive_searches_thresh < INT_MAX && + !cpi->rc.is_src_frame_alt_ref) { + const int64_t exhuastive_thr = + sf->exhaustive_searches_thresh >> + (8 - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize])); + if (var > exhuastive_thr) run_exhaustive_search = 1; + } + } else if (method == MESH) { + run_exhaustive_search = 1; } - if (method != NSTEP && rd && var < var_max) + if (run_exhaustive_search) { + int var_ex; + MV tmp_mv_ex; + var_ex = full_pixel_exhaustive(cpi, x, tmp_mv, error_per_bit, cost_list, + fn_ptr, ref_mv, &tmp_mv_ex); + if (var_ex < var) { + var = var_ex; + *tmp_mv = tmp_mv_ex; + } + } + + if (method != NSTEP && method != MESH && rd && var < var_max) var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1); return var; @@ -2274,7 +2775,8 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, (void)tc; \ (void)sse; \ (void)thismse; \ - (void)cost_list; + (void)cost_list; \ + (void)use_accurate_subpel_search; // Return the maximum MV. 
uint32_t vp9_return_max_sub_pixel_mv( @@ -2282,7 +2784,7 @@ uint32_t vp9_return_max_sub_pixel_mv( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { COMMON_MV_TEST; (void)minr; @@ -2304,7 +2806,7 @@ uint32_t vp9_return_min_sub_pixel_mv( int error_per_bit, const vp9_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h) { + int h, int use_accurate_subpel_search) { COMMON_MV_TEST; (void)maxr; diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h index b8db2c353..ab69afdcd 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libvpx/vp9/encoder/vp9_mcomp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_MCOMP_H_ -#define VP9_ENCODER_VP9_MCOMP_H_ +#ifndef VPX_VP9_ENCODER_VP9_MCOMP_H_ +#define VPX_VP9_ENCODER_VP9_MCOMP_H_ #include "vp9/encoder/vp9_block.h" #include "vpx_dsp/variance.h" @@ -59,14 +59,15 @@ struct SPEED_FEATURES; int vp9_init_search_range(int size); int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, - int sad_per_bit, int distance, + int error_per_bit, int search_range, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); // Perform integral projection based motion estimation. 
unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col); + int mi_row, int mi_col, + const MV *ref_mv); typedef uint32_t(fractional_mv_step_fp)( const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, @@ -74,7 +75,7 @@ typedef uint32_t(fractional_mv_step_fp)( int forced_stop, // 0 - full, 1 - qtr only, 2 - half only int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2], uint32_t *distortion, uint32_t *sse1, const uint8_t *second_pred, int w, - int h); + int h, int use_accurate_subpel_search); extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree; extern fractional_mv_step_fp vp9_find_best_sub_pixel_tree_pruned; @@ -106,6 +107,9 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, struct VP9_COMP; +// "mvp_full" is the MV search starting point; +// "ref_mv" is the context reference MV; +// "tmp_mv" is the searched best MV. int vp9_full_pixel_search(struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, MV *mvp_full, int step_param, int search_method, int error_per_bit, int *cost_list, const MV *ref_mv, @@ -115,8 +119,38 @@ void vp9_set_subpel_mv_search_range(MvLimits *subpel_mv_limits, const MvLimits *umv_window_limits, const MV *ref_mv); +#if CONFIG_NON_GREEDY_MV +#define NB_MVS_NUM 4 +struct TplDepStats; +double vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, + double *best_mv_dist, double *best_mv_cost, + double lambda, int search_range, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num); + +double vp9_full_pixel_diamond_new(const struct VP9_COMP *cpi, MACROBLOCK *x, + MV *mvp_full, int step_param, double lambda, + int do_refine, + const vp9_variance_fn_ptr_t *fn_ptr, + const int_mv *nb_full_mvs, int full_mv_num, + MV *best_mv, double *best_mv_dist, + double *best_mv_cost); + +double vp9_nb_mvs_inconsistency(const MV *mv, const int_mv *nb_mvs, int mv_num); +static INLINE MV 
get_full_mv(const MV *mv) { + MV out_mv; + out_mv.row = mv->row >> 3; + out_mv.col = mv->col >> 3; + return out_mv; +} + +struct TplDepFrame; +void vp9_prepare_nb_full_mvs(const struct TplDepFrame *tpl_frame, int mi_row, + int mi_col, int rf_idx, BLOCK_SIZE bsize, + int_mv *nb_full_mvs); +#endif // CONFIG_NON_GREEDY_MV #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_MCOMP_H_ +#endif // VPX_VP9_ENCODER_VP9_MCOMP_H_ diff --git a/libvpx/vp9/encoder/vp9_multi_thread.c b/libvpx/vp9/encoder/vp9_multi_thread.c index da06fb151..c66c03549 100644 --- a/libvpx/vp9/encoder/vp9_multi_thread.c +++ b/libvpx/vp9/encoder/vp9_multi_thread.c @@ -13,6 +13,7 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_multi_thread.h" +#include "vp9/encoder/vp9_temporal_filter.h" void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, int tile_id) { @@ -50,6 +51,20 @@ void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, return job_info; } +void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, + TileDataEnc *const this_tile) { + VP9_COMMON *const cm = &cpi->common; + const int sb_rows = + (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; + int i; + + this_tile->row_base_thresh_freq_fact = + (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, + sizeof(*(this_tile->row_base_thresh_freq_fact))); + for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) + this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; +} + void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { struct VP9Common *cm = &cpi->common; MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; @@ -59,6 +74,8 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; int jobs_per_tile_col, total_jobs; + // Allocate memory that is large enough for all row_mt stages. First pass + // uses 16x16 block size. 
jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows); // Calculate the total number of jobs total_jobs = jobs_per_tile_col * tile_cols; @@ -83,14 +100,11 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { TileDataEnc *this_tile = &cpi->tile_data[tile_col]; vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col); if (cpi->sf.adaptive_rd_thresh_row_mt) { - const int sb_rows = - (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1; - int i; - this_tile->row_base_thresh_freq_fact = - (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, - sizeof(*(this_tile->row_base_thresh_freq_fact))); - for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) - this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT; + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; + } + vp9_row_mt_alloc_rd_thresh(cpi, this_tile); } } @@ -146,11 +160,9 @@ void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { TileDataEnc *this_tile = &cpi->tile_data[tile_row * multi_thread_ctxt->allocated_tile_cols + tile_col]; - if (cpi->sf.adaptive_rd_thresh_row_mt) { - if (this_tile->row_base_thresh_freq_fact != NULL) { - vpx_free(this_tile->row_base_thresh_freq_fact); - this_tile->row_base_thresh_freq_fact = NULL; - } + if (this_tile->row_base_thresh_freq_fact != NULL) { + vpx_free(this_tile->row_base_thresh_freq_fact); + this_tile->row_base_thresh_freq_fact = NULL; } } } @@ -219,11 +231,19 @@ void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) { MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; JobQueue *job_queue = multi_thread_ctxt->job_queue; const int tile_cols = 1 << cm->log2_tile_cols; - int job_row_num, jobs_per_tile, jobs_per_tile_col, total_jobs; + int job_row_num, jobs_per_tile, jobs_per_tile_col = 0, total_jobs; const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; int tile_col, i; - jobs_per_tile_col = (job_type != ENCODE_JOB) ? 
cm->mb_rows : sb_rows; + switch (job_type) { + case ENCODE_JOB: jobs_per_tile_col = sb_rows; break; + case FIRST_PASS_JOB: jobs_per_tile_col = cm->mb_rows; break; + case ARNR_JOB: + jobs_per_tile_col = ((cm->mi_rows + TF_ROUND) >> TF_SHIFT); + break; + default: assert(0); + } + total_jobs = jobs_per_tile_col * tile_cols; multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col; diff --git a/libvpx/vp9/encoder/vp9_multi_thread.h b/libvpx/vp9/encoder/vp9_multi_thread.h index bfc0c0ae4..a2276f4fe 100644 --- a/libvpx/vp9/encoder/vp9_multi_thread.h +++ b/libvpx/vp9/encoder/vp9_multi_thread.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_MULTI_THREAD_H -#define VP9_ENCODER_VP9_MULTI_THREAD_H +#ifndef VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ +#define VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_job_queue.h" @@ -29,10 +29,13 @@ void vp9_multi_thread_tile_init(VP9_COMP *cpi); void vp9_row_mt_mem_alloc(VP9_COMP *cpi); +void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, + TileDataEnc *const this_tile); + void vp9_row_mt_mem_dealloc(VP9_COMP *cpi); int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, int *tile_completion_status, int *cur_tile_id, int tile_cols); -#endif // VP9_ENCODER_VP9_MULTI_THREAD_H +#endif // VPX_VP9_ENCODER_VP9_MULTI_THREAD_H_ diff --git a/libvpx/vp9/encoder/vp9_noise_estimate.c b/libvpx/vp9/encoder/vp9_noise_estimate.c index 276a0c785..fc189dbb1 100644 --- a/libvpx/vp9/encoder/vp9_noise_estimate.c +++ b/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -148,7 +148,9 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->last_h = cm->height; } return; - } else if (cm->current_video_frame > 60 && + } else if (frame_counter > 60 && cpi->svc.num_encoded_top_layer > 1 && + cpi->rc.frames_since_key > cpi->svc.number_spatial_layers && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && cpi->rc.avg_frame_low_motion < 
(low_res ? 70 : 50)) { // Force noise estimation to 0 and denoiser off if content has high motion. ne->level = kLowLow; @@ -157,7 +159,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && cpi->svc.current_superframe > 1) { - vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); + vp9_denoiser_set_noise_level(cpi, ne->level); copy_frame(&cpi->denoiser.last_source, cpi->Source); } #endif @@ -258,7 +260,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // Normalize. avg_est = avg_est / num_samples; // Update noise estimate. - ne->value = (int)((15 * ne->value + avg_est) >> 4); + ne->value = (int)((3 * ne->value + avg_est) >> 2); ne->count++; if (ne->count == ne->num_frames_estimate) { // Reset counter and check noise level condition. @@ -267,7 +269,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->level = vp9_noise_estimate_extract_level(ne); #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) - vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); + vp9_denoiser_set_noise_level(cpi, ne->level); #endif } } diff --git a/libvpx/vp9/encoder/vp9_noise_estimate.h b/libvpx/vp9/encoder/vp9_noise_estimate.h index 335cdbe64..574b7c337 100644 --- a/libvpx/vp9/encoder/vp9_noise_estimate.h +++ b/libvpx/vp9/encoder/vp9_noise_estimate.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_NOISE_ESTIMATE_H_ -#define VP9_ENCODER_NOISE_ESTIMATE_H_ +#ifndef VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ +#define VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ #include "vp9/encoder/vp9_block.h" #include "vp9/encoder/vp9_skin_detection.h" @@ -48,4 +48,4 @@ void vp9_update_noise_estimate(struct VP9_COMP *const cpi); } // extern "C" #endif -#endif // VP9_ENCODER_NOISE_ESTIMATE_H_ +#endif // VPX_VP9_ENCODER_VP9_NOISE_ESTIMATE_H_ diff --git a/libvpx/vp9/encoder/vp9_partition_models.h b/libvpx/vp9/encoder/vp9_partition_models.h new file mode 100644 index 000000000..904d21400 --- /dev/null +++ b/libvpx/vp9/encoder/vp9_partition_models.h @@ -0,0 +1,1172 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ +#define VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#define NN_MAX_HIDDEN_LAYERS 10 +#define NN_MAX_NODES_PER_LAYER 128 + +// Neural net model config. It defines the layout of a neural net model, such as +// the number of inputs/outputs, number of layers, the number of nodes in each +// layer, as well as the weights and bias of each node. +typedef struct { + int num_inputs; // Number of input nodes, i.e. features. + int num_outputs; // Number of output nodes. + int num_hidden_layers; // Number of hidden layers, maximum 10. + // Number of nodes for each hidden layer. + int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS]; + // Weight parameters, indexed by layer. + const float *weights[NN_MAX_HIDDEN_LAYERS + 1]; + // Bias parameters, indexed by layer. 
+ const float *bias[NN_MAX_HIDDEN_LAYERS + 1]; +} NN_CONFIG; + +// Partition search breakout model. +#define FEATURES 4 +#define Q_CTX 3 +#define RESOLUTION_CTX 2 +static const float + vp9_partition_breakout_weights_64[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.016673f, + -0.001025f, + -0.000032f, + 0.000833f, + 1.94261885f - 2.1f, + }, + { + -0.160867f, + -0.002101f, + 0.000011f, + 0.002448f, + 1.65738142f - 2.5f, + }, + { + -0.628934f, + -0.011459f, + -0.000009f, + 0.013833f, + 1.47982645f - 1.6f, + }, + }, + { + { + -0.064309f, + -0.006121f, + 0.000232f, + 0.005778f, + 0.7989465f - 5.0f, + }, + { + -0.314957f, + -0.009346f, + -0.000225f, + 0.010072f, + 2.80695581f - 5.5f, + }, + { + -0.635535f, + -0.015135f, + 0.000091f, + 0.015247f, + 2.90381241f - 5.0f, + }, + }, + }; + +static const float + vp9_partition_breakout_weights_32[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.010554f, + -0.003081f, + -0.000134f, + 0.004491f, + 1.68445992f - 3.5f, + }, + { + -0.051489f, + -0.007609f, + 0.000016f, + 0.009792f, + 1.28089404f - 2.5f, + }, + { + -0.163097f, + -0.013081f, + 0.000022f, + 0.019006f, + 1.36129403f - 3.2f, + }, + }, + { + { + -0.024629f, + -0.006492f, + -0.000254f, + 0.004895f, + 1.27919173f - 4.5f, + }, + { + -0.083936f, + -0.009827f, + -0.000200f, + 0.010399f, + 2.73731065f - 4.5f, + }, + { + -0.279052f, + -0.013334f, + 0.000289f, + 0.023203f, + 2.43595719f - 3.5f, + }, + }, + }; + +static const float + vp9_partition_breakout_weights_16[RESOLUTION_CTX][Q_CTX][FEATURES + 1] = { + { + { + -0.013154f, + -0.002404f, + -0.000977f, + 0.008450f, + 2.57404566f - 5.5f, + }, + { + -0.019146f, + -0.004018f, + 0.000064f, + 0.008187f, + 2.15043926f - 2.5f, + }, + { + -0.075755f, + -0.010858f, + 0.000030f, + 0.024505f, + 2.06848121f - 2.5f, + }, + }, + { + { + -0.007636f, + -0.002751f, + -0.000682f, + 0.005968f, + 0.19225763f - 4.5f, + }, + { + -0.047306f, + -0.009113f, + -0.000518f, + 0.016007f, + 2.61068869f - 4.0f, + }, + { + -0.069336f, + 
-0.010448f, + -0.001120f, + 0.023083f, + 1.47591054f - 5.5f, + }, + }, + }; + +static const float vp9_partition_breakout_weights_8[RESOLUTION_CTX][Q_CTX] + [FEATURES + 1] = { + { + { + -0.011807f, + -0.009873f, + -0.000931f, + 0.034768f, + 1.32254851f - 2.0f, + }, + { + -0.003861f, + -0.002701f, + 0.000100f, + 0.013876f, + 1.96755111f - 1.5f, + }, + { + -0.013522f, + -0.008677f, + -0.000562f, + 0.034468f, + 1.53440356f - 1.5f, + }, + }, + { + { + -0.003221f, + -0.002125f, + 0.000993f, + 0.012768f, + 0.03541421f - 2.0f, + }, + { + -0.006069f, + -0.007335f, + 0.000229f, + 0.026104f, + 0.17135315f - 1.5f, + }, + { + -0.039894f, + -0.011419f, + 0.000070f, + 0.061817f, + 0.6739977f - 1.5f, + }, + }, + }; +#undef FEATURES +#undef Q_CTX +#undef RESOLUTION_CTX + +// Rectangular partition search pruning model. +#define FEATURES 17 +#define LABELS 4 +static const float vp9_rect_part_nn_weights_16_layer0[FEATURES * 32] = { + 1.262885f, -0.533345f, -0.161280f, 0.106098f, 0.194799f, 0.003600f, + 0.394783f, -0.053954f, 0.264474f, -0.016651f, 0.376765f, 0.221471f, + 0.489799f, 0.054924f, 0.018292f, 0.037633f, -0.053430f, 1.092426f, + 0.205791f, -0.055661f, -0.227335f, 0.301274f, -0.169917f, 0.100426f, + 0.254388f, 0.103465f, 0.189560f, 0.116479f, 1.647195f, -0.667044f, + 0.067795f, -0.044580f, 0.019428f, 0.072938f, -0.797569f, -0.077539f, + -0.225636f, 0.262883f, -1.048009f, 0.210118f, -0.416156f, -0.143741f, + -0.296985f, 0.205918f, -0.517383f, -0.118527f, -0.396606f, -0.113128f, + -0.279468f, 0.096141f, -0.342051f, -0.337036f, 0.143222f, -0.860280f, + 0.137169f, 0.339767f, -0.336076f, 0.071988f, 0.251557f, -0.004068f, + 0.170734f, 0.237283f, -0.332443f, 0.073643f, 0.375357f, 0.220407f, + 0.150708f, -0.176979f, 0.265786f, -0.105878f, -0.337465f, -0.000491f, + 0.234308f, -0.098973f, 0.129038f, -0.205936f, -0.034793f, -0.106981f, + 0.009974f, 0.037861f, -0.282874f, -0.354414f, 0.023021f, -0.266749f, + -0.041762f, -0.721725f, 0.182262f, -0.273945f, 0.123722f, -0.036749f, + 
-0.788645f, -0.081560f, -0.472226f, 0.004654f, -0.756766f, -0.132186f, + 1.085412f, -0.221324f, -0.072577f, -0.172834f, -0.104831f, -1.391641f, + -0.345893f, 0.194442f, -0.306583f, -0.041813f, -0.267635f, -0.218568f, + -0.178452f, 0.044421f, -0.128042f, -0.094797f, -0.253724f, 0.273931f, + 0.144843f, -0.401416f, -0.014354f, -0.348929f, 0.123550f, 0.494504f, + -0.007050f, -0.143830f, 0.111292f, 0.211057f, -1.579988f, 0.117744f, + -1.732487f, 0.009320f, -1.162696f, 0.176687f, -0.705609f, 0.524827f, + 0.089822f, 0.082976f, -0.023681f, 0.006120f, -0.907175f, -0.026273f, + 0.019027f, 0.027170f, -0.462563f, -0.535335f, 0.202231f, 0.709803f, + -0.112251f, -1.213869f, 0.225714f, 0.323785f, -0.518254f, -0.014235f, + -0.070790f, -0.369589f, 0.373399f, 0.002738f, 0.175113f, 0.084529f, + -0.101586f, -0.018978f, 0.773392f, -0.673230f, -0.549279f, 0.790196f, + 0.658609f, -0.826831f, -0.514211f, 0.575341f, -0.711311f, 0.276289f, + -0.435715f, 0.392986f, -0.079298f, -0.318719f, 0.188429f, -0.114366f, + 0.172527f, -0.261721f, -0.216761f, 0.163822f, -0.189374f, -0.391901f, + 0.142013f, -0.135046f, 0.144419f, 0.053887f, 0.074673f, -0.290791f, + -0.039560f, -0.103830f, -0.330263f, -0.042091f, 0.050646f, -0.057466f, + -0.069064f, -0.412864f, 0.071097f, 0.126693f, 0.175397f, -0.168485f, + 0.018129f, -0.419188f, -0.272024f, -0.436859f, -0.425711f, -0.024382f, + 0.248042f, -0.169090f, -0.346878f, -0.070926f, 0.292278f, -0.197610f, + -0.218286f, 0.290846f, 0.297843f, 0.247394f, -0.160736f, 0.110314f, + 0.276000f, -0.301676f, -0.232816f, -0.127576f, -0.174457f, -0.124503f, + 0.264880f, -0.332379f, 0.012659f, -0.197333f, 0.604700f, 0.801582f, + 0.758702f, 0.691880f, 0.440917f, 0.773548f, 0.064242f, 1.147508f, + -0.127543f, -0.189628f, -0.122994f, -0.226776f, -0.053531f, -0.187548f, + 0.226554f, -0.273451f, 0.011751f, 0.009133f, 0.185091f, 0.003031f, + 0.000525f, 0.221829f, 0.331550f, -0.202558f, -0.286550f, 0.100683f, + 0.268818f, 0.179971f, -0.050016f, 0.579665f, 0.015911f, 0.033068f, + 
0.077768f, -0.017757f, -1.411251f, 0.051519f, -1.745767f, 0.011258f, + -1.947372f, 0.111396f, -1.112755f, -0.008989f, -0.006211f, -0.002098f, + -0.015236f, -0.095697f, -0.095820f, 0.044622f, -0.112096f, 0.060000f, + 0.138957f, -0.462708f, 0.590790f, -0.021405f, -0.283744f, -1.141749f, + 0.213121f, -0.332311f, -0.314090f, -0.789311f, 0.157605f, -0.438019f, + 0.642189f, -0.340764f, -0.996025f, 0.109871f, 0.106128f, -0.010505f, + -0.117233f, -0.223194f, 0.344105f, -0.308754f, 0.386020f, -0.305270f, + -0.538281f, -0.270720f, -0.101688f, 0.207580f, 0.237153f, -0.055730f, + 0.842779f, 0.393543f, 0.007886f, -0.318167f, 0.603768f, 0.388241f, + 0.421536f, 0.632080f, 0.423965f, 0.371472f, 0.456827f, 0.488134f, + 0.358997f, 0.032621f, -0.017104f, 0.032198f, 0.113266f, -0.312277f, + 0.178189f, 0.234180f, 0.134271f, -0.414889f, 0.774141f, -0.225043f, + 0.614052f, -0.279921f, 1.329141f, -0.140827f, 0.797267f, -0.171361f, + 0.066205f, 0.339976f, 0.015223f, 0.193725f, -0.245067f, -0.035578f, + -0.084043f, 0.086756f, 0.029478f, -0.845370f, 0.388613f, -1.215236f, + 0.304573f, -0.439884f, -0.293969f, -0.107988f, -0.267837f, -0.695339f, + -0.702099f, 0.359047f, 0.511730f, 1.429516f, 0.216959f, -0.313828f, + 0.068062f, -0.124917f, -0.648327f, -0.308411f, -0.378467f, -0.429288f, + -0.032415f, -0.357005f, 0.170068f, 0.161167f, -0.250280f, -0.320468f, + -0.408987f, -0.201496f, -0.155996f, 0.021067f, 0.141083f, -0.202733f, + -0.130953f, -0.278148f, -0.042051f, 0.070576f, 0.009982f, -0.044326f, + -0.346851f, -0.255397f, -0.346456f, 0.281781f, 0.001618f, 0.120648f, + 0.297140f, 0.198343f, 0.186104f, 0.183548f, -0.344482f, 0.182258f, + 0.291003f, -0.330228f, -0.048174f, 0.133694f, 0.264582f, 0.229671f, + -0.167251f, -0.316040f, 0.191829f, 0.153417f, -0.345158f, -0.212790f, + -0.878872f, -0.313099f, -0.028368f, 0.065869f, -0.695388f, 1.102812f, + -0.605539f, 0.400680f, -0.350120f, -0.432965f, 0.034553f, -0.693476f, + -0.045708f, 0.492409f, -0.043825f, -0.430522f, 0.071159f, -0.317376f, + 
-1.164842f, 0.112394f, 0.034137f, -0.611882f, 0.251020f, -0.245113f, + 0.286093f, -0.187883f, 0.340263f, -0.211592f, -0.065706f, -0.332148f, + 0.104026f, -0.003206f, 0.036397f, 0.206499f, 0.161962f, 0.037663f, + -0.313039f, -0.199837f, 0.117952f, -0.182145f, -0.343724f, 0.017625f, + 0.033427f, -0.288075f, -0.101873f, -0.083378f, 0.147870f, 0.049598f, + -0.241824f, 0.070494f, 0.140942f, -0.013795f, 0.020023f, -0.192213f, + -0.320505f, -0.193072f, 0.147260f, 0.311352f, 0.053486f, 0.183716f, + 0.142535f, 0.294333f, -0.054853f, 0.293314f, -0.025398f, 0.190815f, + -0.137574f, -0.191864f, -0.190950f, -0.205988f, -0.199046f, -0.017582f, + -0.149347f, 0.131040f, 0.006854f, -0.350732f, 0.113301f, -0.194371f, + -0.296885f, -0.249199f, -0.193946f, 0.116150f, -0.310411f, -0.325851f, + -0.053275f, -0.063419f, 0.204170f, -0.091940f, -0.146229f, 0.298173f, + 0.053349f, -0.368540f, 0.235629f, -0.317825f, -0.107304f, -0.114618f, + 0.058709f, -0.272070f, 0.076224f, 0.110668f, -0.193282f, -0.135440f, + -0.267950f, -0.102285f, 0.102699f, -0.159082f, 0.262721f, -0.263227f, + 0.094509f, -0.113405f, 0.069888f, -0.169665f, 0.070800f, 0.035432f, + 0.054243f, 0.264229f, 0.117416f, 0.091568f, -0.022069f, -0.069214f, + 0.124543f, 0.070413f, -0.039343f, 0.082823f, -0.838348f, 0.153727f, + -0.000947f, 0.270348f, -1.404952f, -0.159680f, -0.234320f, 0.061023f, + 0.271660f, -0.541834f, 0.570828f, -0.277254f, +}; + +static const float vp9_rect_part_nn_bias_16_layer0[32] = { + 0.045740f, 0.292685f, -0.754007f, -0.150412f, -0.006171f, 0.005915f, + 0.000167f, 0.322797f, -0.381793f, 0.349786f, 0.003878f, -0.307203f, + 0.000000f, 0.029122f, 0.000000f, 0.625494f, 0.302105f, -0.362807f, + -0.034002f, -0.573278f, 0.240021f, 0.083965f, 0.000000f, -0.018979f, + -0.147739f, -0.036990f, 0.000000f, 0.000000f, -0.026790f, -0.000036f, + -0.073448f, 0.398328f, +}; + +static const float vp9_rect_part_nn_weights_16_layer1[32 * LABELS] = { + 0.095090f, 0.831754f, 0.484433f, 0.472945f, 0.086165f, -0.442388f, + 
0.176263f, -0.760247f, 0.419932f, -0.131377f, 0.075814f, 0.089844f, + -0.294718f, 0.299808f, -0.318435f, -0.623205f, -0.346703f, 0.494356f, + 0.949221f, 0.524653f, 0.044095f, 0.428540f, 0.402571f, -0.216920f, + 0.423915f, 1.023334f, -0.366449f, 0.395057f, 0.057576f, 0.094019f, + 0.247685f, -0.007200f, -0.420023f, -0.728965f, -0.063040f, -0.071321f, + 0.209298f, 0.486625f, -0.244375f, 0.263219f, -0.250463f, -0.260301f, + 0.068579f, 0.177644f, -0.155311f, -0.027606f, -0.101614f, 0.553046f, + -0.462729f, -0.237568f, -0.589316f, 0.045182f, 0.551759f, -0.196872f, + 0.183040f, 0.054341f, 0.252784f, -0.536486f, -0.024425f, 0.154942f, + -0.086636f, 0.360416f, 0.214773f, -0.170876f, -0.363522f, -0.464099f, + 0.145494f, -0.099329f, 0.343718f, 0.286427f, 0.085540f, -0.105182f, + 0.155543f, 0.290939f, -0.067069f, 0.228399f, 0.178247f, 0.113031f, + -0.067336f, 0.441062f, 0.132364f, -0.263403f, -0.263925f, -0.083613f, + -0.268577f, -0.204442f, 0.052526f, 0.334787f, -0.064285f, -0.197875f, + 0.296405f, 0.396440f, 0.033231f, 0.229087f, 0.118289f, 0.490894f, + -0.527582f, -0.897206f, -0.325708f, -0.433018f, -0.053989f, 0.223814f, + -0.352319f, 0.772440f, -0.108648f, -0.082859f, -0.342718f, 0.033022f, + -0.309199f, -0.560337f, 0.208476f, 0.520309f, -0.241035f, -0.560391f, + -1.268968f, -0.267567f, 0.129461f, -0.385547f, 0.080142f, 0.065785f, + -0.159324f, -0.580704f, -0.315150f, -0.224900f, -0.110807f, -0.230163f, + 0.307266f, 0.153446f, +}; + +static const float vp9_rect_part_nn_bias_16_layer1[LABELS] = { + -0.455437f, + 0.255310f, + 0.452974f, + -0.278733f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_16_layer0, + vp9_rect_part_nn_weights_16_layer1, + }, + { + vp9_rect_part_nn_bias_16_layer0, + vp9_rect_part_nn_bias_16_layer1, + }, +}; + +static const float vp9_rect_part_nn_weights_32_layer0[FEATURES * 32] = { + 0.735110f, 
-0.238477f, 0.101978f, 0.311671f, -0.123833f, 1.596506f, + -0.341982f, -0.480170f, -0.247587f, 0.613159f, -0.279899f, -0.740856f, + 0.499051f, 0.039041f, 0.056763f, 0.258874f, 0.470812f, -0.121635f, + -0.318852f, -0.098677f, -0.214714f, -0.159974f, -0.305400f, -0.344477f, + -0.260653f, -0.007737f, -0.053016f, -0.158079f, 0.151911f, -0.057685f, + -0.230948f, -0.165940f, -0.127591f, -0.192084f, 1.890390f, -0.315123f, + -0.714531f, -0.015355f, 0.186437f, 0.305504f, 0.035343f, -0.556783f, + 0.239364f, -0.297789f, 0.202735f, -0.707576f, 0.710250f, 0.223346f, + -0.291511f, 0.235778f, 0.455338f, -0.059402f, 0.084530f, -0.115117f, + -0.103696f, -0.192821f, 0.114579f, -0.223487f, 0.306864f, 0.021887f, + -0.028040f, 0.087866f, 0.038870f, -0.081742f, -0.056052f, -0.130837f, + 0.201058f, 0.293391f, 1.880344f, 0.339162f, 0.040928f, -0.503942f, + 0.476333f, 0.259272f, 0.629416f, 0.869369f, 0.622841f, 1.012843f, + 0.715795f, 1.958844f, -1.697462f, 0.071334f, 0.074189f, 0.014585f, + -0.002536f, 0.021900f, 0.151883f, 0.169501f, -0.333018f, -0.247512f, + -0.418575f, -0.473960f, -0.004501f, -0.280939f, -0.162188f, -0.355632f, + 0.136654f, -0.100967f, -0.350435f, -0.135386f, 0.037237f, 0.136982f, + -0.084157f, -0.073248f, 0.021792f, 0.077429f, -0.083042f, -3.169569f, + 0.016261f, -3.351328f, 0.021120f, -3.572247f, 0.023870f, -4.312754f, + 0.040973f, -0.038328f, -0.015052f, 0.017702f, 0.101427f, 0.115458f, + -0.304792f, 0.021826f, -0.157998f, 0.341022f, -0.013465f, 0.105076f, + -0.261465f, 0.318730f, 0.065701f, 0.314879f, -0.064785f, 0.282824f, + 0.100542f, 0.057260f, -0.003756f, -0.026214f, -0.264641f, 0.275545f, + -0.049201f, -0.283015f, -0.057363f, 0.183570f, 0.243161f, -0.255764f, + 0.099747f, -0.156157f, -0.262494f, 0.231521f, -0.262617f, -0.186096f, + 0.171720f, 0.018983f, -0.145545f, 0.197662f, -0.001502f, -0.267526f, + 0.001960f, 0.003260f, 0.045237f, -0.377174f, -0.042499f, -0.015278f, + -0.196779f, -0.262797f, -0.318427f, -0.126092f, -0.339723f, 0.205288f, + -0.544284f, 
-0.507896f, -0.316622f, -0.090312f, -0.250917f, -0.337263f, + -0.220199f, -0.296591f, -0.116816f, 0.052381f, 0.145681f, 0.016521f, + -0.093549f, -0.097822f, 0.023140f, -0.010346f, 0.036181f, 0.145826f, + -0.139123f, -0.462638f, -0.007315f, 0.156533f, -0.102787f, 0.143586f, + -0.092094f, -0.144220f, -0.168994f, -0.045833f, 0.021628f, -0.421794f, + -0.055857f, 0.217931f, -0.061937f, -0.028768f, -0.078250f, -0.426939f, + -0.223118f, -0.230080f, -0.194988f, -0.197673f, -0.020918f, 0.139945f, + 0.186951f, -0.071317f, -0.084007f, -0.138597f, 0.101950f, 0.093870f, + 0.153226f, 0.017799f, -0.088539f, -0.037796f, 0.340412f, 0.183305f, + 0.391880f, -1.127417f, 0.132762f, -0.228565f, 0.399035f, 0.017483f, + -0.041619f, 0.017849f, 0.092340f, 0.054204f, 0.681185f, 0.421034f, + 0.112520f, -0.040618f, -0.040148f, -0.360647f, 0.053555f, 0.192854f, + 0.076968f, -0.179224f, -0.081617f, -0.287661f, -0.191072f, -0.310227f, + -0.332226f, -0.039786f, -0.247795f, -0.232201f, -0.333533f, -0.077995f, + -0.471732f, 0.051829f, 0.090488f, 0.142465f, -0.120490f, -0.286151f, + -0.049117f, -0.251082f, 0.211884f, -0.223366f, 0.063565f, 0.229938f, + -0.059348f, -0.029573f, -0.064303f, -0.156148f, 0.086958f, -0.297613f, + -0.125107f, 0.062718f, 0.339137f, -0.218896f, -0.057290f, -0.236670f, + -0.143783f, -0.119429f, 0.242320f, -0.323464f, -0.178377f, 0.238275f, + -0.025042f, 0.074798f, 0.111329f, -0.299773f, -0.151748f, -0.261607f, + 0.215626f, 0.202243f, -0.121896f, -0.024283f, -0.293854f, -0.018232f, + -0.012629f, -0.199297f, -0.060595f, 0.432339f, -0.158735f, -0.028380f, + 0.326639f, 0.222546f, -0.218135f, -0.495955f, -0.015055f, -0.104206f, + -0.268823f, 0.116765f, 0.041769f, -0.187095f, 0.225090f, 0.198195f, + 0.001502f, -0.219212f, -0.244779f, -0.017690f, -0.033197f, -0.339813f, + -0.325453f, 0.002499f, -0.066113f, 0.043235f, 0.324275f, -0.630642f, + -1.440551f, 0.174527f, 0.124619f, -1.187345f, 1.372693f, -0.278393f, + -0.058673f, -0.286338f, 1.708757f, -0.325094f, -0.543172f, -0.229411f, + 
0.169927f, 0.175064f, 0.198321f, 0.117351f, 0.220882f, 0.138078f, + -0.158000f, -0.286708f, 0.096046f, -0.321788f, 0.206949f, -0.014473f, + -0.321234f, 0.100033f, -0.108266f, 0.166824f, 0.032904f, -0.065760f, + -0.303896f, 0.180342f, -0.301145f, -0.352554f, 0.149089f, 0.013277f, + 0.256019f, -0.109770f, 1.832588f, -0.132568f, 1.527658f, -0.164252f, + -0.857880f, -0.242694f, -0.553797f, 0.334023f, -0.332759f, -0.166203f, + -0.223175f, 0.007953f, -0.175865f, -0.134590f, -0.023858f, -0.011983f, + 0.054403f, -0.147054f, -0.176901f, -0.166893f, -0.292662f, -0.010569f, + -0.041744f, -0.060398f, -0.237584f, 0.154246f, -0.083270f, -0.314016f, + -0.374736f, 0.100063f, 0.048401f, -0.061952f, -0.178816f, 0.157243f, + 0.221991f, -0.065035f, 0.098517f, -0.190704f, -0.210613f, -0.274884f, + -0.341442f, -0.205281f, 0.073644f, 0.130667f, 0.149194f, -0.018172f, + 1.796154f, -1.017806f, -0.169655f, 0.104239f, 0.344313f, 0.643042f, + 0.730177f, 0.270776f, 0.581631f, -1.090649f, 0.707472f, 1.411035f, + 0.268739f, 0.178860f, -0.062251f, -0.118611f, -0.215759f, 0.023485f, + -0.105320f, 0.036396f, -0.059604f, 0.090024f, 0.095224f, -0.053497f, + -0.084040f, 0.055836f, 0.111678f, 0.014886f, -0.178380f, 0.079662f, + -0.123580f, 0.057379f, -0.409844f, -0.305386f, -0.987808f, -0.291094f, + 0.063966f, 0.263709f, -0.337221f, 0.720093f, 0.105030f, 0.848950f, + 0.071835f, 0.228972f, 0.057705f, -2.154561f, -0.201303f, -0.058856f, + -0.020081f, 0.029375f, 0.234837f, -0.001063f, 0.042527f, 0.014567f, + -0.299420f, -0.289117f, 0.275219f, 0.263596f, -0.186026f, -0.111364f, + -0.118393f, -0.318778f, 0.010710f, -0.286836f, -0.070330f, -0.049497f, + 0.093162f, -0.298085f, 0.204761f, -0.206633f, -0.009057f, -0.235372f, + 0.185300f, -0.271814f, 0.281732f, 0.268149f, -0.018967f, 0.162748f, + -0.086694f, -0.063839f, -0.097473f, -0.280120f, 0.324688f, 0.157911f, + -0.064794f, -0.266017f, -0.305608f, -0.196854f, -0.185767f, 0.199455f, + 0.102264f, 0.070866f, 0.172045f, 0.266433f, -0.176167f, 0.251657f, + 
-0.239220f, 0.229667f, 0.156115f, -0.221345f, 0.270720f, 0.109367f, + 0.230352f, -0.384561f, -0.026329f, 0.005928f, -0.087685f, -0.097995f, + -0.153864f, 0.117211f, -0.226492f, -0.379832f, -0.201714f, 0.049707f, + -0.292120f, 0.114074f, -0.085307f, -0.485356f, -0.347405f, 0.089361f, + -0.419273f, -0.320764f, -0.107254f, -0.274615f, -0.292991f, 0.095602f, + -0.078789f, 0.138927f, 0.270813f, 0.205814f, 0.065003f, 0.169171f, + 0.056142f, -0.005792f, 0.059483f, 0.060149f, +}; + +static const float vp9_rect_part_nn_bias_32_layer0[32] = { + -1.749808f, 0.000000f, 0.239736f, -0.000424f, 0.431792f, -0.150833f, + 2.866760f, 0.000000f, 0.000000f, -0.281434f, 0.000000f, -0.150086f, + 0.000000f, -0.008346f, -0.204104f, -0.006581f, 0.000000f, -0.197006f, + 0.000000f, -0.735287f, -0.028345f, -1.180116f, -0.106524f, 0.000000f, + 0.075879f, -0.150966f, -2.438914f, 0.000000f, -0.011775f, -0.024204f, + -0.138235f, -0.123763f, +}; + +static const float vp9_rect_part_nn_weights_32_layer1[32 * LABELS] = { + 0.622235f, 0.264894f, -0.424216f, 0.103989f, 1.401192f, -0.063838f, + -5.216846f, 0.329234f, -0.293113f, 0.457519f, -0.271899f, 0.043771f, + -0.203823f, 0.573535f, -0.192703f, 0.054939f, 0.163019f, 0.124803f, + 0.160664f, 0.385406f, -0.091403f, 0.320204f, 0.101181f, -0.157792f, + -0.095555f, -0.255011f, 1.326614f, -0.138076f, -0.082434f, -0.342442f, + 0.184067f, -0.076395f, 0.050263f, 0.251065f, 0.291743f, 0.197838f, + -0.950922f, 0.280202f, 2.904905f, -0.219434f, 0.284386f, 0.375005f, + 0.193817f, -0.298663f, -0.255364f, -0.297545f, 0.030518f, -0.023892f, + -0.396120f, -0.253027f, 0.237235f, -0.550249f, -0.076817f, -0.201374f, + 0.292708f, 0.341936f, -0.532215f, 0.180634f, -0.943291f, -0.217179f, + 0.251611f, -0.306310f, 0.229054f, -0.350337f, -0.192707f, 0.146781f, + 0.409007f, 0.279088f, -0.307357f, 0.199059f, 2.780962f, 0.163723f, + -0.226445f, 0.242830f, 0.220356f, -0.057621f, 0.196677f, -0.179975f, + -0.314636f, 0.218271f, -0.278653f, -0.226286f, 0.034275f, -0.320149f, + 
0.154779f, 0.074937f, -0.015650f, -0.281735f, -0.495227f, -0.075036f, + -0.871024f, -0.350643f, 0.343468f, 0.095665f, 0.447121f, -0.059040f, + 0.244757f, 0.223122f, 0.272544f, 0.129678f, -1.700183f, 0.254869f, + 2.528983f, 0.217362f, 0.327765f, -0.129369f, -0.003560f, -0.532537f, + 0.080216f, -0.739488f, -0.299813f, 0.185421f, 0.265994f, 0.152268f, + -0.401829f, -0.901380f, 0.347747f, -0.524845f, -0.201163f, 0.063585f, + -0.517479f, -0.077816f, -0.735739f, -0.161411f, -0.113607f, -0.306188f, + 0.190817f, -0.362567f, +}; + +static const float vp9_rect_part_nn_bias_32_layer1[LABELS] = { + -0.833530f, + 0.860502f, + 0.708645f, + -1.083700f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_32_layer0, + vp9_rect_part_nn_weights_32_layer1, + }, + { + vp9_rect_part_nn_bias_32_layer0, + vp9_rect_part_nn_bias_32_layer1, + }, +}; + +static const float vp9_rect_part_nn_weights_64_layer0[FEATURES * 32] = { + 0.029424f, -0.295893f, -0.313259f, -0.090484f, -0.104946f, 0.121361f, + 0.137971f, -0.137984f, -0.328158f, -0.137280f, -0.276995f, -0.153118f, + 0.187893f, 0.105787f, -0.236591f, -0.114325f, -0.000708f, 1.936191f, + 0.048491f, -0.026048f, -0.206916f, 0.830237f, -0.152354f, 0.074191f, + -0.153813f, 0.148942f, -0.103457f, 0.028252f, 1.758264f, -2.123016f, + 0.120182f, 0.049954f, 0.110450f, -0.199360f, 0.642198f, 0.040225f, + -0.140886f, 0.091833f, -0.122788f, 1.172115f, -0.833333f, -0.505218f, + 0.736050f, -0.109958f, -0.839030f, -0.399916f, 1.029718f, 0.408977f, + -0.836882f, 0.389683f, -1.134413f, -1.529672f, -0.146351f, 0.089298f, + 0.083772f, -0.697869f, 1.683311f, -0.882446f, 0.494428f, -0.122128f, + 0.659819f, -0.057178f, -0.915390f, -0.192412f, 0.046613f, 0.010697f, + 0.040782f, 0.110807f, -0.225332f, -0.327730f, -0.114825f, 0.063511f, + 0.050503f, 0.023602f, 0.006524f, -0.274547f, -0.607145f, -0.143812f, + 
-0.327689f, -0.333072f, -0.017138f, -0.183992f, -0.200622f, -0.262463f, + -0.132799f, -0.018155f, -0.534214f, -0.385994f, 0.116278f, -0.752879f, + -0.090734f, -0.249152f, 0.071716f, 0.029603f, -0.382456f, -0.122894f, + 1.349552f, -0.885192f, 0.257903f, -0.265945f, -0.045579f, 0.112247f, + -0.122810f, -0.258285f, -0.145427f, -0.127442f, 0.072778f, 0.072549f, + 0.182149f, 0.239403f, 0.167205f, -0.291616f, -0.281237f, 0.335735f, + 0.208511f, -0.239628f, -0.022236f, -0.177370f, 0.207808f, 0.023535f, + 0.137455f, 0.016406f, -0.138685f, 0.188732f, 0.205513f, 0.209787f, + 0.060592f, 0.239954f, -0.128341f, -0.291585f, 0.022141f, -0.311201f, + -0.010199f, -0.314224f, -0.351915f, -0.079775f, -0.260028f, -0.015953f, + 0.007404f, 0.051589f, 0.019771f, -2.337926f, 0.024596f, -2.512399f, + -0.023138f, -2.421380f, 0.016515f, -3.269775f, 0.026844f, -0.053660f, + -0.013213f, -0.029248f, 0.114357f, 0.259100f, -0.141749f, -0.106802f, + -0.117323f, -0.294698f, -0.316012f, -0.328013f, 0.016459f, 0.136175f, + 0.223327f, 0.322312f, -0.297297f, 0.118286f, -0.317197f, -0.116692f, + 0.262236f, -0.032443f, -0.392128f, -0.199989f, -0.383621f, 0.008347f, + -0.079302f, -0.005529f, 0.049261f, 0.145948f, -0.263592f, -0.317109f, + 0.260015f, -0.499341f, -0.171764f, -0.017815f, 0.149186f, 0.178294f, + -0.492198f, 0.016956f, 0.008067f, -0.057734f, -0.189979f, -0.131489f, + -0.163303f, 0.121378f, -0.172272f, 0.125891f, 0.120654f, 0.071314f, + 0.117423f, -0.242167f, 0.047170f, 0.234302f, -0.355370f, -0.336112f, + -0.255471f, -0.267792f, -0.135367f, -0.284411f, 0.254592f, 0.098749f, + 0.224989f, 0.258450f, -0.306878f, 0.153551f, -0.175806f, -0.244459f, + -0.274922f, 0.254346f, 0.110309f, 0.036054f, 0.095133f, -0.589646f, + 0.080543f, 0.154155f, 0.133797f, -0.401518f, 0.798127f, 0.066742f, + 1.449216f, 0.282498f, 1.210638f, -0.280643f, 0.572386f, -0.308133f, + -0.053143f, 0.008437f, 0.269565f, 0.347616f, 0.087180f, -0.771104f, + 0.200800f, 0.157578f, 0.474128f, -0.971488f, 0.193451f, 0.340339f, + 
-0.123425f, 0.560754f, -0.139621f, -0.281721f, -0.100162f, 0.250926f, + 0.281100f, 0.197680f, 0.138629f, 1.045823f, 0.339047f, 0.036698f, + -0.159210f, 0.727869f, -1.371850f, 0.116241f, -2.180194f, 0.214055f, + -0.213691f, 0.447957f, -1.129966f, 0.543598f, 0.147599f, 0.060034f, + -0.049415f, -0.095858f, 0.290599f, 0.059512f, 0.198343f, -0.211903f, + 0.158736f, -0.090220f, -0.221992f, 0.198320f, 0.028632f, -0.408238f, + -0.368266f, -0.218740f, -0.379023f, -0.173573f, -0.035179f, 0.240176f, + 0.237714f, -0.417132f, -0.184989f, 0.046818f, -0.016965f, -0.524012f, + -0.094848f, -0.225678f, 0.021766f, -0.028366f, 0.072343f, -0.039980f, + 0.023334f, -0.392397f, 0.164450f, -0.201650f, -0.519754f, -0.023352f, + -4.559466f, -0.115996f, 0.135844f, 0.152599f, -0.111570f, 1.870310f, + 0.003522f, 1.893098f, -0.134055f, 1.850787f, 0.085160f, -2.203354f, + 0.380799f, -0.074047f, 0.023760f, 0.077310f, 0.273381f, -1.163135f, + -0.024976f, 0.093252f, 0.011445f, -0.129009f, -2.200677f, -0.013703f, + -1.964109f, -0.027246f, -2.135679f, 0.049465f, -3.879032f, 0.195114f, + -0.018085f, 0.016755f, 0.036330f, 0.169138f, 0.003548f, -0.028565f, + -0.178196f, -0.020577f, -0.104330f, -0.270961f, -0.282822f, -0.228735f, + -0.292561f, 0.271648f, 0.129171f, 0.376168f, -0.265005f, -0.093002f, + -0.185514f, 0.025598f, 0.055265f, -0.212784f, -0.249005f, 0.051507f, + -0.267868f, 0.162227f, -0.237365f, 0.267479f, -0.051543f, -0.288800f, + -0.246119f, 0.216296f, 0.226888f, -0.123005f, 0.068040f, -0.096630f, + -0.100500f, 0.161640f, -0.349187f, -0.061229f, 0.042915f, 0.024949f, + -0.083086f, -0.407249f, -0.428306f, -0.381137f, -0.508822f, 0.354796f, + -0.612346f, -0.230076f, -0.734103f, -0.550571f, -0.318788f, -0.300091f, + -0.336045f, -0.494406f, -0.206900f, 0.079942f, 0.149065f, -0.533360f, + 0.940431f, -0.078860f, 1.418633f, -0.117527f, 1.349170f, 0.242658f, + 0.559328f, 0.258770f, -0.014508f, -0.204775f, -0.292631f, 0.498345f, + -0.274918f, 0.051670f, 0.157748f, -0.179721f, -0.183330f, -0.393550f, + 
-0.208848f, 0.060742f, -0.159654f, 0.047757f, -0.400256f, -0.084606f, + -0.080619f, -0.359664f, -0.078305f, -0.455653f, 0.227624f, -0.385606f, + -0.060326f, -0.209831f, -0.077008f, 0.148862f, 0.209908f, 0.047655f, + -0.342292f, -0.088375f, -0.115465f, 0.082700f, 0.036465f, -0.001792f, + -0.285730f, 0.114632f, 0.239254f, -0.348543f, 0.044916f, -0.299003f, + -0.244756f, -0.180802f, 0.314253f, -0.127788f, -0.221512f, 0.034787f, + -0.208388f, 0.349156f, 0.265975f, -0.068335f, 0.261372f, 0.146705f, + -0.098729f, 0.293699f, -0.111342f, 0.207402f, -0.038772f, 0.124135f, + -0.237450f, -0.191511f, -0.052240f, -0.237151f, 0.005013f, 0.139441f, + -0.153634f, -0.021596f, -0.036220f, -0.077873f, -0.085995f, -0.254555f, + -0.204382f, -0.082362f, 0.941796f, 0.253800f, -0.957468f, 0.095795f, + 0.122046f, -0.310364f, 0.087301f, 0.012704f, 0.193265f, -0.058303f, + 0.250452f, 0.835269f, 0.507383f, 0.109957f, -0.145028f, -0.114419f, + -0.225618f, 0.132387f, -0.063335f, -0.325776f, -0.346173f, -0.006653f, + -0.133534f, -0.085549f, -0.050177f, 0.173103f, 0.025421f, 0.105512f, + 0.258036f, 0.153116f, 0.290202f, -0.333699f, -0.072405f, -0.124069f, + -0.241933f, -0.313318f, 0.013623f, -0.237440f, -0.232228f, -0.170850f, + -0.039212f, 0.162468f, -0.330162f, -0.218462f, -0.287064f, -0.181673f, + -0.161059f, 0.024664f, -0.108642f, -0.231707f, 0.217994f, -1.128878f, + 0.093010f, 0.101513f, 0.055895f, -0.354538f, 0.844174f, 0.254335f, + 1.920298f, -0.230777f, 0.798144f, 0.206425f, 0.580655f, -0.177645f, + -0.412061f, 0.112629f, -0.476438f, 0.209436f, +}; + +static const float vp9_rect_part_nn_bias_64_layer0[32] = { + 0.000000f, 0.345406f, -0.499542f, -1.718246f, -0.147443f, -0.408843f, + -0.008997f, -0.107946f, 2.117510f, 0.000000f, -0.141830f, -0.049079f, + 0.000000f, -1.331136f, -1.417843f, -0.485054f, -0.100856f, -0.230750f, + -2.574372f, 2.310627f, -0.030363f, 0.000000f, -0.310119f, -1.314316f, + -0.108766f, -0.107918f, 0.000000f, 0.000000f, 0.093643f, 0.000000f, + 0.000000f, -0.902343f, 
+}; + +static const float vp9_rect_part_nn_weights_64_layer1[32 * LABELS] = { + 0.404567f, 1.168492f, 0.051714f, 0.827941f, 0.135334f, 0.456922f, + -0.370524f, 0.062865f, -3.076300f, -0.290613f, 0.280029f, -0.101778f, + 0.250216f, 0.347721f, 0.466400f, 0.030845f, 0.114570f, 0.089456f, + 1.519938f, -3.493788f, 0.264212f, -0.109125f, 0.306644f, 0.368206f, + -0.052168f, -0.229630f, -0.339932f, -0.080472f, 0.319845f, 0.143818f, + -0.172595f, 0.372777f, -0.082072f, -0.505781f, -0.288321f, -0.473028f, + -0.027567f, -0.034329f, -0.291965f, -0.063262f, 1.721741f, 0.118914f, + 0.183681f, 0.041611f, 0.266371f, 0.005896f, -0.484705f, 0.665535f, + -0.240945f, -0.017963f, -1.409440f, 2.031976f, 0.240327f, -0.116604f, + 0.273245f, -0.170570f, -0.085491f, -0.340315f, -0.209651f, -0.217460f, + -0.249373f, 0.009193f, 0.009467f, -0.272909f, 0.308472f, -0.551173f, + 0.168374f, -0.583229f, 0.140082f, -0.585715f, -0.010929f, 0.159779f, + 1.438104f, 0.293111f, -0.053339f, -0.101828f, -0.280573f, -0.211265f, + -0.323605f, -0.540908f, 0.101366f, -0.005288f, -1.517046f, 2.078767f, + 0.215597f, 0.144012f, 0.315888f, -0.251324f, 0.150482f, -0.137871f, + 0.235116f, -0.194202f, -0.153475f, -0.312384f, -0.375510f, 0.336488f, + -0.379837f, -1.004979f, -0.312587f, -0.406174f, 0.154290f, -0.539766f, + -0.230074f, 0.303564f, 0.719439f, -0.235108f, -0.204978f, 0.399229f, + 0.290222f, -0.278713f, -0.667069f, -0.420550f, 0.164893f, -0.459689f, + -1.035368f, 0.818909f, 0.275137f, -0.291006f, -0.061505f, 0.052737f, + -0.084871f, -0.348335f, 0.312544f, 0.120753f, -0.707222f, -0.010050f, + -0.137148f, -0.351765f, +}; + +static const float vp9_rect_part_nn_bias_64_layer1[LABELS] = { + -0.926768f, + 0.765832f, + 0.663683f, + -0.621865f, +}; + +static const NN_CONFIG vp9_rect_part_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 32, + }, // num_hidden_nodes + { + vp9_rect_part_nn_weights_64_layer0, + vp9_rect_part_nn_weights_64_layer1, + }, + { + 
vp9_rect_part_nn_bias_64_layer0, + vp9_rect_part_nn_bias_64_layer1, + }, +}; +#undef FEATURES +#undef LABELS + +#define FEATURES 7 +// Partition pruning model(neural nets). +static const float vp9_partition_nn_weights_64x64_layer0[FEATURES * 8] = { + -3.571348f, 0.014835f, -3.255393f, -0.098090f, -0.013120f, 0.000221f, + 0.056273f, 0.190179f, -0.268130f, -1.828242f, -0.010655f, 0.937244f, + -0.435120f, 0.512125f, 1.610679f, 0.190816f, -0.799075f, -0.377348f, + -0.144232f, 0.614383f, -0.980388f, 1.754150f, -0.185603f, -0.061854f, + -0.807172f, 1.240177f, 1.419531f, -0.438544f, -5.980774f, 0.139045f, + -0.032359f, -0.068887f, -1.237918f, 0.115706f, 0.003164f, 2.924212f, + 1.246838f, -0.035833f, 0.810011f, -0.805894f, 0.010966f, 0.076463f, + -4.226380f, -2.437764f, -0.010619f, -0.020935f, -0.451494f, 0.300079f, + -0.168961f, -3.326450f, -2.731094f, 0.002518f, 0.018840f, -1.656815f, + 0.068039f, 0.010586f, +}; + +static const float vp9_partition_nn_bias_64x64_layer0[8] = { + -3.469882f, 0.683989f, 0.194010f, 0.313782f, + -3.153335f, 2.245849f, -1.946190f, -3.740020f, +}; + +static const float vp9_partition_nn_weights_64x64_layer1[8] = { + -8.058566f, 0.108306f, -0.280620f, -0.818823f, + -6.445117f, 0.865364f, -1.127127f, -8.808660f, +}; + +static const float vp9_partition_nn_bias_64x64_layer1[1] = { + 6.46909416f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_64x64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_64x64_layer0, + vp9_partition_nn_weights_64x64_layer1, + }, + { + vp9_partition_nn_bias_64x64_layer0, + vp9_partition_nn_bias_64x64_layer1, + }, +}; + +static const float vp9_partition_nn_weights_32x32_layer0[FEATURES * 8] = { + -0.295437f, -4.002648f, -0.205399f, -0.060919f, 0.708037f, 0.027221f, + -0.039137f, -0.907724f, -3.151662f, 0.007106f, 0.018726f, -0.534928f, + 0.022744f, 0.000159f, -1.717189f, -3.229031f, -0.027311f, 0.269863f, + -0.400747f, -0.394366f, 
-0.108878f, 0.603027f, 0.455369f, -0.197170f, + 1.241746f, -1.347820f, -0.575636f, -0.462879f, -2.296426f, 0.196696f, + -0.138347f, -0.030754f, -0.200774f, 0.453795f, 0.055625f, -3.163116f, + -0.091003f, -0.027028f, -0.042984f, -0.605185f, 0.143240f, -0.036439f, + -0.801228f, 0.313409f, -0.159942f, 0.031267f, 0.886454f, -1.531644f, + -0.089655f, 0.037683f, -0.163441f, -0.130454f, -0.058344f, 0.060011f, + 0.275387f, 1.552226f, +}; + +static const float vp9_partition_nn_bias_32x32_layer0[8] = { + -0.838372f, -2.609089f, -0.055763f, 1.329485f, + -1.297638f, -2.636622f, -0.826909f, 1.012644f, +}; + +static const float vp9_partition_nn_weights_32x32_layer1[8] = { + -1.792632f, -7.322353f, -0.683386f, 0.676564f, + -1.488118f, -7.527719f, 1.240163f, 0.614309f, +}; + +static const float vp9_partition_nn_bias_32x32_layer1[1] = { + 4.97422546f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_32x32 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_32x32_layer0, + vp9_partition_nn_weights_32x32_layer1, + }, + { + vp9_partition_nn_bias_32x32_layer0, + vp9_partition_nn_bias_32x32_layer1, + }, +}; + +static const float vp9_partition_nn_weights_16x16_layer0[FEATURES * 8] = { + -1.717673f, -4.718130f, -0.125725f, -0.183427f, -0.511764f, 0.035328f, + 0.130891f, -3.096753f, 0.174968f, -0.188769f, -0.640796f, 1.305661f, + 1.700638f, -0.073806f, -4.006781f, -1.630999f, -0.064863f, -0.086410f, + -0.148617f, 0.172733f, -0.018619f, 2.152595f, 0.778405f, -0.156455f, + 0.612995f, -0.467878f, 0.152022f, -0.236183f, 0.339635f, -0.087119f, + -3.196610f, -1.080401f, -0.637704f, -0.059974f, 1.706298f, -0.793705f, + -6.399260f, 0.010624f, -0.064199f, -0.650621f, 0.338087f, -0.001531f, + 1.023655f, -3.700272f, -0.055281f, -0.386884f, 0.375504f, -0.898678f, + 0.281156f, -0.314611f, 0.863354f, -0.040582f, -0.145019f, 0.029329f, + -2.197880f, -0.108733f, +}; + +static const float 
vp9_partition_nn_bias_16x16_layer0[8] = { + 0.411516f, -2.143737f, -3.693192f, 2.123142f, + -1.356910f, -3.561016f, -0.765045f, -2.417082f, +}; + +static const float vp9_partition_nn_weights_16x16_layer1[8] = { + -0.619755f, -2.202391f, -4.337171f, 0.611319f, + 0.377677f, -4.998723f, -1.052235f, 1.949922f, +}; + +static const float vp9_partition_nn_bias_16x16_layer1[1] = { + 3.20981717f, +}; + +static const NN_CONFIG vp9_partition_nnconfig_16x16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_partition_nn_weights_16x16_layer0, + vp9_partition_nn_weights_16x16_layer1, + }, + { + vp9_partition_nn_bias_16x16_layer0, + vp9_partition_nn_bias_16x16_layer1, + }, +}; +#undef FEATURES + +#if CONFIG_ML_VAR_PARTITION +#define FEATURES 6 +static const float vp9_var_part_nn_weights_64_layer0[FEATURES * 8] = { + -0.249572f, 0.205532f, -2.175608f, 1.094836f, -2.986370f, 0.193160f, + -0.143823f, 0.378511f, -1.997788f, -2.166866f, -1.930158f, -1.202127f, + -0.611875f, -0.506422f, -0.432487f, 0.071205f, 0.578172f, -0.154285f, + -0.051830f, 0.331681f, -1.457177f, -2.443546f, -2.000302f, -1.389283f, + 0.372084f, -0.464917f, 2.265235f, 2.385787f, 2.312722f, 2.127868f, + -0.403963f, -0.177860f, -0.436751f, -0.560539f, 0.254903f, 0.193976f, + -0.305611f, 0.256632f, 0.309388f, -0.437439f, 1.702640f, -5.007069f, + -0.323450f, 0.294227f, 1.267193f, 1.056601f, 0.387181f, -0.191215f, +}; + +static const float vp9_var_part_nn_bias_64_layer0[8] = { + -0.044396f, -0.938166f, 0.000000f, -0.916375f, + 1.242299f, 0.000000f, -0.405734f, 0.014206f, +}; + +static const float vp9_var_part_nn_weights_64_layer1[8] = { + 1.635945f, 0.979557f, 0.455315f, 1.197199f, + -2.251024f, -0.464953f, 1.378676f, -0.111927f, +}; + +static const float vp9_var_part_nn_bias_64_layer1[1] = { + -0.37972447f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_64 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, 
// num_hidden_nodes + { + vp9_var_part_nn_weights_64_layer0, + vp9_var_part_nn_weights_64_layer1, + }, + { + vp9_var_part_nn_bias_64_layer0, + vp9_var_part_nn_bias_64_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_32_layer0[FEATURES * 8] = { + 0.067243f, -0.083598f, -2.191159f, 2.726434f, -3.324013f, 3.477977f, + 0.323736f, -0.510199f, 2.960693f, 2.937661f, 2.888476f, 2.938315f, + -0.307602f, -0.503353f, -0.080725f, -0.473909f, -0.417162f, 0.457089f, + 0.665153f, -0.273210f, 0.028279f, 0.972220f, -0.445596f, 1.756611f, + -0.177892f, -0.091758f, 0.436661f, -0.521506f, 0.133786f, 0.266743f, + 0.637367f, -0.160084f, -1.396269f, 1.020841f, -1.112971f, 0.919496f, + -0.235883f, 0.651954f, 0.109061f, -0.429463f, 0.740839f, -0.962060f, + 0.299519f, -0.386298f, 1.550231f, 2.464915f, 1.311969f, 2.561612f, +}; + +static const float vp9_var_part_nn_bias_32_layer0[8] = { + 0.368242f, 0.736617f, 0.000000f, 0.757287f, + 0.000000f, 0.613248f, -0.776390f, 0.928497f, +}; + +static const float vp9_var_part_nn_weights_32_layer1[8] = { + 0.939884f, -2.420850f, -0.410489f, -0.186690f, + 0.063287f, -0.522011f, 0.484527f, -0.639625f, +}; + +static const float vp9_var_part_nn_bias_32_layer1[1] = { + -0.6455006f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_32 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_32_layer0, + vp9_var_part_nn_weights_32_layer1, + }, + { + vp9_var_part_nn_bias_32_layer0, + vp9_var_part_nn_bias_32_layer1, + }, +}; + +static const float vp9_var_part_nn_weights_16_layer0[FEATURES * 8] = { + 0.742567f, -0.580624f, -0.244528f, 0.331661f, -0.113949f, -0.559295f, + -0.386061f, 0.438653f, 1.467463f, 0.211589f, 0.513972f, 1.067855f, + -0.876679f, 0.088560f, -0.687483f, -0.380304f, -0.016412f, 0.146380f, + 0.015318f, 0.000351f, -2.764887f, 3.269717f, 2.752428f, -2.236754f, + 0.561539f, -0.852050f, -0.084667f, 0.202057f, 0.197049f, 0.364922f, + -0.463801f, 
0.431790f, 1.872096f, -0.091887f, -0.055034f, 2.443492f, + -0.156958f, -0.189571f, -0.542424f, -0.589804f, -0.354422f, 0.401605f, + 0.642021f, -0.875117f, 2.040794f, 1.921070f, 1.792413f, 1.839727f, +}; + +static const float vp9_var_part_nn_bias_16_layer0[8] = { + 2.901234f, -1.940932f, -0.198970f, -0.406524f, + 0.059422f, -1.879207f, -0.232340f, 2.979821f, +}; + +static const float vp9_var_part_nn_weights_16_layer1[8] = { + -0.528731f, 0.375234f, -0.088422f, 0.668629f, + 0.870449f, 0.578735f, 0.546103f, -1.957207f, +}; + +static const float vp9_var_part_nn_bias_16_layer1[1] = { + -1.95769405f, +}; + +static const NN_CONFIG vp9_var_part_nnconfig_16 = { + FEATURES, // num_inputs + 1, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_part_nn_weights_16_layer0, + vp9_var_part_nn_weights_16_layer1, + }, + { + vp9_var_part_nn_bias_16_layer0, + vp9_var_part_nn_bias_16_layer1, + }, +}; +#undef FEATURES +#endif // CONFIG_ML_VAR_PARTITION + +#define FEATURES 6 +#define LABELS 1 +static const float vp9_var_rd_part_nn_weights_64_layer0[FEATURES * 8] = { + -0.100129f, 0.128867f, -1.375086f, -2.268096f, -1.470368f, -2.296274f, + 0.034445f, -0.062993f, -2.151904f, 0.523215f, 1.611269f, 1.530051f, + 0.418182f, -1.330239f, 0.828388f, 0.386546f, -0.026188f, -0.055459f, + -0.474437f, 0.861295f, -2.208743f, -0.652991f, -2.985873f, -1.728956f, + 0.388052f, -0.420720f, 2.015495f, 1.280342f, 3.040914f, 1.760749f, + -0.009062f, 0.009623f, 1.579270f, -2.012891f, 1.629662f, -1.796016f, + -0.279782f, -0.288359f, 1.875618f, 1.639855f, 0.903020f, 0.906438f, + 0.553394f, -1.621589f, 0.185063f, 0.605207f, -0.133560f, 0.588689f, +}; + +static const float vp9_var_rd_part_nn_bias_64_layer0[8] = { + 0.659717f, 0.120912f, 0.329894f, -1.586385f, + 1.715839f, 0.085754f, 2.038774f, 0.268119f, +}; + +static const float vp9_var_rd_part_nn_weights_64_layer1[8 * LABELS] = { + -3.445586f, 2.375620f, 1.236970f, 0.804030f, + -2.448384f, 2.827254f, 2.291478f, 0.790252f, 
+}; + +static const float vp9_var_rd_part_nn_bias_64_layer1[LABELS] = { + -1.16608453f, +}; + +static const NN_CONFIG vp9_var_rd_part_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_64_layer0, + vp9_var_rd_part_nn_weights_64_layer1, + }, + { + vp9_var_rd_part_nn_bias_64_layer0, + vp9_var_rd_part_nn_bias_64_layer1, + }, +}; + +static const float vp9_var_rd_part_nn_weights_32_layer0[FEATURES * 8] = { + 0.022420f, -0.032201f, 1.228065f, -2.767655f, 1.928743f, 0.566863f, + 0.459229f, 0.422048f, 0.833395f, 0.822960f, -0.232227f, 0.586895f, + 0.442856f, -0.018564f, 0.227672f, -1.291306f, 0.119428f, -0.776563f, + -0.042947f, 0.183129f, 0.592231f, 1.174859f, -0.503868f, 0.270102f, + -0.330537f, -0.036340f, 1.144630f, 1.783710f, 1.216929f, 2.038085f, + 0.373782f, -0.430258f, 1.957002f, 1.383908f, 2.012261f, 1.585693f, + -0.394399f, -0.337523f, -0.238335f, 0.007819f, -0.368294f, 0.437875f, + -0.318923f, -0.242000f, 2.276263f, 1.501432f, 0.645706f, 0.344774f, +}; + +static const float vp9_var_rd_part_nn_bias_32_layer0[8] = { + -0.023846f, -1.348117f, 1.365007f, -1.644164f, + 0.062992f, 1.257980f, -0.098642f, 1.388472f, +}; + +static const float vp9_var_rd_part_nn_weights_32_layer1[8 * LABELS] = { + 3.016729f, 0.622684f, -1.021302f, 1.490383f, + 1.702046f, -2.964618f, 0.689045f, 1.711754f, +}; + +static const float vp9_var_rd_part_nn_bias_32_layer1[LABELS] = { + -1.28798676f, +}; + +static const NN_CONFIG vp9_var_rd_part_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_32_layer0, + vp9_var_rd_part_nn_weights_32_layer1, + }, + { + vp9_var_rd_part_nn_bias_32_layer0, + vp9_var_rd_part_nn_bias_32_layer1, + }, +}; + +static const float vp9_var_rd_part_nn_weights_16_layer0[FEATURES * 8] = { + -0.726813f, -0.026748f, 1.376946f, 1.467961f, 1.961810f, 1.690412f, + 
0.596484f, -0.261486f, -0.310905f, -0.366311f, -1.300086f, -0.534336f, + 0.040520f, -0.032391f, -1.194214f, 2.438063f, -3.915334f, 1.997270f, + 0.673696f, -0.676393f, 1.654886f, 1.553838f, 1.129691f, 1.360201f, + 0.255001f, 0.336442f, -0.487759f, -0.634555f, 0.479170f, -0.110475f, + -0.661852f, -0.158872f, -0.350243f, -0.303957f, -0.045018f, 0.586151f, + -0.262463f, 0.228079f, -1.688776f, -1.594502f, -2.261078f, -1.802535f, + 0.034748f, -0.028476f, 2.713258f, 0.212446f, -1.529202f, -2.560178f, +}; + +static const float vp9_var_rd_part_nn_bias_16_layer0[8] = { + 0.495983f, 1.858545f, 0.162974f, 1.992247f, + -2.698863f, 0.110020f, 0.550830f, 0.420941f, +}; + +static const float vp9_var_rd_part_nn_weights_16_layer1[8 * LABELS] = { + 1.768409f, -1.394240f, 1.076846f, -1.762808f, + 1.517405f, 0.535195f, -0.426827f, 1.002272f, +}; + +static const float vp9_var_rd_part_nn_bias_16_layer1[LABELS] = { + -1.65894794f, +}; + +static const NN_CONFIG vp9_var_rd_part_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_16_layer0, + vp9_var_rd_part_nn_weights_16_layer1, + }, + { + vp9_var_rd_part_nn_bias_16_layer0, + vp9_var_rd_part_nn_bias_16_layer1, + }, +}; + +static const float vp9_var_rd_part_nn_weights_8_layer0[FEATURES * 8] = { + -0.804900f, -1.214983f, 0.840202f, 0.686566f, 0.155804f, 0.025542f, + -1.244635f, -0.368403f, 0.364150f, 1.081073f, 0.552387f, 0.452715f, + 0.652968f, -0.293058f, 0.048967f, 0.021240f, -0.662981f, 0.424700f, + 0.008293f, -0.013088f, 0.747007f, -1.453907f, -1.498226f, 1.593252f, + -0.239557f, -0.143766f, 0.064311f, 1.320998f, -0.477411f, 0.026374f, + 0.730884f, -0.675124f, 0.965521f, 0.863658f, 0.809186f, 0.812280f, + 0.513131f, 0.185102f, 0.211354f, 0.793666f, 0.121714f, -0.015383f, + -0.650980f, -0.046581f, 0.911141f, 0.806319f, 0.974773f, 0.815893f, +}; + +static const float vp9_var_rd_part_nn_bias_8_layer0[8] = { + 0.176134f, 0.651308f, 
2.007761f, 0.068812f, + 1.061517f, 1.487161f, -2.308147f, 1.099828f, +}; + +static const float vp9_var_rd_part_nn_weights_8_layer1[8 * LABELS] = { + 0.683032f, 1.326393f, -1.661539f, 1.438920f, + 1.118023f, -2.237380f, 1.518468f, 2.010416f, +}; + +static const float vp9_var_rd_part_nn_bias_8_layer1[LABELS] = { + -1.65423989f, +}; + +static const NN_CONFIG vp9_var_rd_part_nnconfig_8 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_8_layer0, + vp9_var_rd_part_nn_weights_8_layer1, + }, + { + vp9_var_rd_part_nn_bias_8_layer0, + vp9_var_rd_part_nn_bias_8_layer1, + }, +}; +#undef FEATURES +#undef LABELS + +// Partition pruning model(linear). +static const float vp9_partition_feature_mean[24] = { + 303501.697372f, 3042630.372158f, 24.694696f, 1.392182f, + 689.413511f, 162.027012f, 1.478213f, 0.0, + 135382.260230f, 912738.513263f, 28.845217f, 1.515230f, + 544.158492f, 131.807995f, 1.436863f, 0.0f, + 43682.377587f, 208131.711766f, 28.084737f, 1.356677f, + 138.254122f, 119.522553f, 1.252322f, 0.0f, +}; + +static const float vp9_partition_feature_std[24] = { + 673689.212982f, 5996652.516628f, 0.024449f, 1.989792f, + 985.880847f, 0.014638f, 2.001898f, 0.0f, + 208798.775332f, 1812548.443284f, 0.018693f, 1.838009f, + 396.986910f, 0.015657f, 1.332541f, 0.0f, + 55888.847031f, 448587.962714f, 0.017900f, 1.904776f, + 98.652832f, 0.016598f, 1.320992f, 0.0f, +}; + +// Error tolerance: 0.01%-0.0.05%-0.1% +static const float vp9_partition_linear_weights[24] = { + 0.111736f, 0.289977f, 0.042219f, 0.204765f, 0.120410f, -0.143863f, + 0.282376f, 0.847811f, 0.637161f, 0.131570f, 0.018636f, 0.202134f, + 0.112797f, 0.028162f, 0.182450f, 1.124367f, 0.386133f, 0.083700f, + 0.050028f, 0.150873f, 0.061119f, 0.109318f, 0.127255f, 0.625211f, +}; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_PARTITION_MODELS_H_ diff --git a/libvpx/vp9/encoder/vp9_picklpf.c 
b/libvpx/vp9/encoder/vp9_picklpf.c index 1c2c55b9e..f3c11700f 100644 --- a/libvpx/vp9/encoder/vp9_picklpf.c +++ b/libvpx/vp9/encoder/vp9_picklpf.c @@ -150,7 +150,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, VP9_COMMON *const cm = &cpi->common; struct loopfilter *const lf = &cm->lf; - lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness; + lf->sharpness_level = 0; if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) { lf->filter_level = 0; @@ -169,14 +169,10 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, case VPX_BITS_10: filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); break; - default: - assert(0 && - "bit_depth should be VPX_BITS_8, VPX_BITS_10 " - "or VPX_BITS_12"); - return; } #else int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); diff --git a/libvpx/vp9/encoder/vp9_picklpf.h b/libvpx/vp9/encoder/vp9_picklpf.h index cecca058b..8881b44da 100644 --- a/libvpx/vp9/encoder/vp9_picklpf.h +++ b/libvpx/vp9/encoder/vp9_picklpf.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_PICKLPF_H_ -#define VP9_ENCODER_VP9_PICKLPF_H_ +#ifndef VPX_VP9_ENCODER_VP9_PICKLPF_H_ +#define VPX_VP9_ENCODER_VP9_PICKLPF_H_ #ifdef __cplusplus extern "C" { @@ -26,4 +26,4 @@ void vp9_pick_filter_level(const struct yv12_buffer_config *sd, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_PICKLPF_H_ +#endif // VPX_VP9_ENCODER_VP9_PICKLPF_H_ diff --git a/libvpx/vp9/encoder/vp9_pickmode.c b/libvpx/vp9/encoder/vp9_pickmode.c index f2f323a28..a3240513f 100644 --- a/libvpx/vp9/encoder/vp9_pickmode.c +++ b/libvpx/vp9/encoder/vp9_pickmode.c @@ -41,6 +41,17 @@ typedef struct { int in_use; } PRED_BUFFER; +typedef struct { + PRED_BUFFER *best_pred; + PREDICTION_MODE best_mode; + TX_SIZE best_tx_size; + TX_SIZE best_intra_tx_size; + MV_REFERENCE_FRAME best_ref_frame; + MV_REFERENCE_FRAME best_second_ref_frame; + uint8_t best_mode_skip_txfm; + INTERP_FILTER best_pred_filter; +} BEST_PICKMODE; + static const int pos_shift_16x16[4][4] = { { 9, 10, 13, 14 }, { 11, 12, 15, 16 }, { 17, 18, 21, 22 }, { 19, 20, 23, 24 } }; @@ -222,13 +233,22 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } if (rv && search_subpel) { - int subpel_force_stop = cpi->sf.mv.subpel_force_stop; - if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2; + SUBPEL_FORCE_STOP subpel_force_stop = cpi->sf.mv.subpel_force_stop; + if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = HALF_PEL; + if (cpi->sf.mv.enable_adaptive_subpel_force_stop) { + const int mv_thresh = cpi->sf.mv.adapt_subpel_force_stop.mv_thresh; + if (abs(tmp_mv->as_mv.row) >= mv_thresh || + abs(tmp_mv->as_mv.col) >= mv_thresh) + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_above; + else + subpel_force_stop = cpi->sf.mv.adapt_subpel_force_stop.force_stop_below; + } cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, 
cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); } @@ -326,6 +346,35 @@ static int ac_thr_factor(const int speed, const int width, const int height, return 1; } +static TX_SIZE calculate_tx_size(VP9_COMP *const cpi, BLOCK_SIZE bsize, + MACROBLOCKD *const xd, unsigned int var, + unsigned int sse, int64_t ac_thr) { + TX_SIZE tx_size; + if (cpi->common.tx_mode == TX_MODE_SELECT) { + if (sse > (var << 2)) + tx_size = VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + else + tx_size = TX_8X8; + + if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) + tx_size = TX_8X8; + else if (tx_size > TX_16X16) + tx_size = TX_16X16; + + // For screen-content force 4X4 tx_size over 8X8, for large variance. 
+ if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && tx_size == TX_8X8 && + bsize <= BLOCK_16X16 && var > (ac_thr << 6)) + tx_size = TX_4X4; + } else { + tx_size = VPXMIN(max_txsize_lookup[bsize], + tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); + } + + return tx_size; +} + static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, @@ -342,7 +391,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, struct macroblockd_plane *const pd = &xd->plane[0]; const uint32_t dc_quant = pd->dequant[0]; const uint32_t ac_quant = pd->dequant[1]; - const int64_t dc_thr = dc_quant * dc_quant >> 6; + int64_t dc_thr = dc_quant * dc_quant >> 6; int64_t ac_thr = ac_quant * ac_quant >> 6; unsigned int var; int sum; @@ -386,26 +435,16 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, cpi->common.height, abs(sum) >> (bw + bh)); #endif - if (cpi->common.tx_mode == TX_MODE_SELECT) { - if (sse > (var << 2)) - tx_size = VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - else - tx_size = TX_8X8; - - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && - cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) - tx_size = TX_8X8; - else if (tx_size > TX_16X16) - tx_size = TX_16X16; - } else { - tx_size = VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - } - - assert(tx_size >= TX_8X8); + tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr); + // The code below for setting skip flag assumes tranform size of at least 8x8, + // so force this lower limit on transform. + if (tx_size < TX_8X8) tx_size = TX_8X8; xd->mi[0]->tx_size = tx_size; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && x->zero_temp_sad_source && + x->source_variance == 0) + dc_thr = dc_thr << 1; + // Evaluate if the partition block is a skippable block in Y plane. 
{ unsigned int sse16x16[16] = { 0 }; @@ -563,24 +602,7 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, *var_y = var; *sse_y = sse; - if (cpi->common.tx_mode == TX_MODE_SELECT) { - if (sse > (var << 2)) - xd->mi[0]->tx_size = - VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - else - xd->mi[0]->tx_size = TX_8X8; - - if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && - cyclic_refresh_segment_id_boosted(xd->mi[0]->segment_id)) - xd->mi[0]->tx_size = TX_8X8; - else if (xd->mi[0]->tx_size > TX_16X16) - xd->mi[0]->tx_size = TX_16X16; - } else { - xd->mi[0]->tx_size = - VPXMIN(max_txsize_lookup[bsize], - tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); - } + xd->mi[0]->tx_size = calculate_tx_size(cpi, bsize, xd, var, sse, ac_thr); // Evaluate if the partition block is a skippable block in Y plane. { @@ -726,13 +748,13 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } *skippable &= (*eob == 0); eob_cost += 1; @@ -876,6 +898,7 @@ static void encode_breakout_test( // Skipping threshold for dc. 
unsigned int thresh_dc; int motion_low = 1; + if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return; if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 || mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64) @@ -1292,18 +1315,16 @@ static void vp9_pickmode_ctx_den_update( VP9_PICKMODE_CTX_DEN *ctx_den, int64_t zero_last_cost_orig, int ref_frame_cost[MAX_REF_FRAMES], int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int reuse_inter_pred, - TX_SIZE best_tx_size, PREDICTION_MODE best_mode, - MV_REFERENCE_FRAME best_ref_frame, INTERP_FILTER best_pred_filter, - uint8_t best_mode_skip_txfm) { + BEST_PICKMODE *bp) { ctx_den->zero_last_cost_orig = zero_last_cost_orig; ctx_den->ref_frame_cost = ref_frame_cost; ctx_den->frame_mv = frame_mv; ctx_den->reuse_inter_pred = reuse_inter_pred; - ctx_den->best_tx_size = best_tx_size; - ctx_den->best_mode = best_mode; - ctx_den->best_ref_frame = best_ref_frame; - ctx_den->best_pred_filter = best_pred_filter; - ctx_den->best_mode_skip_txfm = best_mode_skip_txfm; + ctx_den->best_tx_size = bp->best_tx_size; + ctx_den->best_mode = bp->best_mode; + ctx_den->best_ref_frame = bp->best_ref_frame; + ctx_den->best_pred_filter = bp->best_pred_filter; + ctx_den->best_mode_skip_txfm = bp->best_mode_skip_txfm; } static void recheck_zeromv_after_denoising( @@ -1332,6 +1353,7 @@ static void recheck_zeromv_after_denoising( mi->ref_frame[1] = NONE; mi->mv[0].as_int = 0; mi->interp_filter = EIGHTTAP; + if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR; xd->plane[0].pre[0] = yv12_mb[LAST_FRAME][0]; vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); model_rd_for_sb_y(cpi, bsize, x, xd, &rate, &dist, &var_y, &sse_y); @@ -1416,27 +1438,200 @@ static INLINE int get_force_skip_low_temp_var(uint8_t *variance_low, int mi_row, return force_skip_low_temp_var; } +static void search_filter_ref(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, + int mi_row, int mi_col, PRED_BUFFER *tmp, + BLOCK_SIZE bsize, int 
reuse_inter_pred, + PRED_BUFFER **this_mode_pred, unsigned int *var_y, + unsigned int *sse_y) { + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + struct macroblockd_plane *const pd = &xd->plane[0]; + const int bw = num_4x4_blocks_wide_lookup[bsize] << 2; + + int pf_rate[3] = { 0 }; + int64_t pf_dist[3] = { 0 }; + int curr_rate[3] = { 0 }; + unsigned int pf_var[3] = { 0 }; + unsigned int pf_sse[3] = { 0 }; + TX_SIZE pf_tx_size[3] = { 0 }; + int64_t best_cost = INT64_MAX; + INTERP_FILTER best_filter = SWITCHABLE, filter; + PRED_BUFFER *current_pred = *this_mode_pred; + uint8_t skip_txfm = SKIP_TXFM_NONE; + + for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) { + int64_t cost; + mi->interp_filter = filter; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], + &pf_var[filter], &pf_sse[filter]); + curr_rate[filter] = pf_rate[filter]; + pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); + cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]); + pf_tx_size[filter] = mi->tx_size; + if (cost < best_cost) { + best_filter = filter; + best_cost = cost; + skip_txfm = x->skip_txfm[0]; + + if (reuse_inter_pred) { + if (*this_mode_pred != current_pred) { + free_pred_buffer(*this_mode_pred); + *this_mode_pred = current_pred; + } + current_pred = &tmp[get_pred_buffer(tmp, 3)]; + pd->dst.buf = current_pred->data; + pd->dst.stride = bw; + } + } + } + + if (reuse_inter_pred && *this_mode_pred != current_pred) + free_pred_buffer(current_pred); + + mi->interp_filter = best_filter; + mi->tx_size = pf_tx_size[best_filter]; + this_rdc->rate = curr_rate[best_filter]; + this_rdc->dist = pf_dist[best_filter]; + *var_y = pf_var[best_filter]; + *sse_y = pf_sse[best_filter]; + x->skip_txfm[0] = skip_txfm; + if (reuse_inter_pred) { + pd->dst.buf = (*this_mode_pred)->data; + pd->dst.stride = (*this_mode_pred)->stride; + } +} + +static int search_new_mv(VP9_COMP 
*cpi, MACROBLOCK *x, + int_mv frame_mv[][MAX_REF_FRAMES], + MV_REFERENCE_FRAME ref_frame, int gf_temporal_ref, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int best_pred_sad, int *rate_mv, + unsigned int best_sse_sofar, RD_COST *best_rdc) { + SVC *const svc = &cpi->svc; + MACROBLOCKD *const xd = &x->e_mbd; + MODE_INFO *const mi = xd->mi[0]; + SPEED_FEATURES *const sf = &cpi->sf; + + if (ref_frame > LAST_FRAME && gf_temporal_ref && + cpi->oxcf.rc_mode == VPX_CBR) { + int tmp_sad; + uint32_t dis; + int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + + if (bsize < BLOCK_16X16) return -1; + + tmp_sad = vp9_int_pro_motion_estimation( + cpi, x, bsize, mi_row, mi_col, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv); + + if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) return -1; + if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) return -1; + + frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int; + *rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, + x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); + frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; + frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; + + cpi->find_fractional_mv_step( + x, &frame_mv[NEWMV][ref_frame].as_mv, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, + cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); + } else if (svc->use_base_mv && svc->spatial_layer_id) { + if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { + const int pre_stride = xd->plane[0].pre[0].stride; + unsigned int base_mv_sse = UINT_MAX; + int scale = (cpi->rc.avg_frame_low_motion > 60) ? 
2 : 4; + const uint8_t *const pre_buf = + xd->plane[0].pre[0].buf + + (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + + (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); + cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + pre_buf, pre_stride, &base_mv_sse); + + // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, + // for SVC encoding. + if (cpi->use_svc && svc->use_base_mv && bsize < BLOCK_16X16 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + return -1; + + // Exit NEWMV search if base_mv_sse is large. + if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) + return -1; + if (base_mv_sse < (best_sse_sofar << 1)) { + // Base layer mv is good. + // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since + // (0, 0) mode is already tested. + unsigned int base_mv_sse_normalized = + base_mv_sse >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && + base_mv_sse_normalized < 400 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + return -1; + if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 1)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, + &frame_mv[NEWMV][ref_frame], rate_mv, + best_rdc->rdcost, 0)) { + return -1; + } + + return 0; +} + +static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { + bp->best_mode = ZEROMV; + bp->best_ref_frame = LAST_FRAME; + bp->best_tx_size = TX_SIZES; + bp->best_intra_tx_size = TX_SIZES; + 
bp->best_pred_filter = EIGHTTAP; + bp->best_mode_skip_txfm = SKIP_TXFM_NONE; + bp->best_second_ref_frame = NONE; + bp->best_pred = NULL; +} + void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int mi_row, int mi_col, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *const cm = &cpi->common; SPEED_FEATURES *const sf = &cpi->sf; - const SVC *const svc = &cpi->svc; + SVC *const svc = &cpi->svc; MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; struct macroblockd_plane *const pd = &xd->plane[0]; - PREDICTION_MODE best_mode = ZEROMV; - MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; + + BEST_PICKMODE best_pickmode; + + MV_REFERENCE_FRAME ref_frame; MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame; - TX_SIZE best_tx_size = TX_SIZES; - INTERP_FILTER best_pred_filter = EIGHTTAP; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; RD_COST this_rdc, best_rdc; - uint8_t skip_txfm = SKIP_TXFM_NONE, best_mode_skip_txfm = SKIP_TXFM_NONE; // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; @@ -1472,7 +1667,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, DECLARE_ALIGNED(16, uint16_t, pred_buf_16[3 * 64 * 64]); #endif struct buf_2d orig_dst = pd->dst; - PRED_BUFFER *best_pred = NULL; PRED_BUFFER *this_mode_pred = NULL; const int pixels_in_block = bh * bw; int reuse_inter_pred = cpi->sf.reuse_inter_pred_sby && ctx->pred_pixel_ready; @@ -1488,22 +1682,79 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int skip_ref_find_pred[4] = { 0 }; unsigned int sse_zeromv_normalized = UINT_MAX; unsigned int best_sse_sofar = UINT_MAX; - unsigned int thresh_svc_skip_golden = 500; + int gf_temporal_ref = 0; 
#if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; int denoise_svc_pickmode = 1; #endif INTERP_FILTER filter_gf_svc = EIGHTTAP; - MV_REFERENCE_FRAME best_second_ref_frame = NONE; + MV_REFERENCE_FRAME inter_layer_ref = GOLDEN_FRAME; + const struct segmentation *const seg = &cm->seg; int comp_modes = 0; int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES; int flag_svc_subpel = 0; int svc_mv_col = 0; int svc_mv_row = 0; + int no_scaling = 0; + unsigned int thresh_svc_skip_golden = 500; + int scene_change_detected = + cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe); - init_ref_frame_cost(cm, xd, ref_frame_cost); + init_best_pickmode(&best_pickmode); + + x->encode_breakout = seg->enabled + ? cpi->segment_encode_breakout[mi->segment_id] + : cpi->encode_breakout; + + x->source_variance = UINT_MAX; + if (cpi->sf.default_interp_filter == BILINEAR) { + best_pickmode.best_pred_filter = BILINEAR; + filter_gf_svc = BILINEAR; + } + if (cpi->use_svc && svc->spatial_layer_id > 0) { + int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id - 1, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1; + } + if (svc->spatial_layer_id > 0 && + (svc->high_source_sad_superframe || no_scaling)) + thresh_svc_skip_golden = 0; + // Lower the skip threshold if lower spatial layer is better quality relative + // to current layer. + else if (svc->spatial_layer_id > 0 && cm->base_qindex > 150 && + cm->base_qindex > svc->lower_layer_qindex + 15) + thresh_svc_skip_golden = 100; + // Increase skip threshold if lower spatial layer is lower quality relative + // to current layer. 
+ else if (svc->spatial_layer_id > 0 && cm->base_qindex < 140 && + cm->base_qindex < svc->lower_layer_qindex - 20) + thresh_svc_skip_golden = 1000; + + if (!cpi->use_svc || + (svc->use_gf_temporal_ref_current_layer && + !svc->layer_context[svc->temporal_layer_id].is_key_frame)) { + struct scale_factors *const sf_last = &cm->frame_refs[LAST_FRAME - 1].sf; + struct scale_factors *const sf_golden = + &cm->frame_refs[GOLDEN_FRAME - 1].sf; + gf_temporal_ref = 1; + // For temporal long term prediction, check that the golden reference + // is same scale as last reference, otherwise disable. + if ((sf_last->x_scale_fp != sf_golden->x_scale_fp) || + (sf_last->y_scale_fp != sf_golden->y_scale_fp)) { + gf_temporal_ref = 0; + } else { + if (cpi->rc.avg_frame_low_motion > 70) + thresh_svc_skip_golden = 500; + else + thresh_svc_skip_golden = 0; + } + } + init_ref_frame_cost(cm, xd, ref_frame_cost); memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES); if (reuse_inter_pred) { @@ -1532,12 +1783,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // filter_ref, we use a less strict condition on assigning filter_ref. // This is to reduce the probabily of entering the flow of not assigning // filter_ref and then skip filter search. 
- if (xd->above_mi && is_inter_block(xd->above_mi)) - filter_ref = xd->above_mi->interp_filter; - else if (xd->left_mi && is_inter_block(xd->left_mi)) - filter_ref = xd->left_mi->interp_filter; - else - filter_ref = cm->interp_filter; + filter_ref = cm->interp_filter; + if (cpi->sf.default_interp_filter != BILINEAR) { + if (xd->above_mi && is_inter_block(xd->above_mi)) + filter_ref = xd->above_mi->interp_filter; + else if (xd->left_mi && is_inter_block(xd->left_mi)) + filter_ref = xd->left_mi->interp_filter; + } // initialize mode decisions vp9_rd_cost_reset(&best_rdc); @@ -1558,23 +1810,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, #endif // CONFIG_VP9_HIGHBITDEPTH x->source_variance = vp9_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize); + + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && mi->segment_id > 0 && + x->zero_temp_sad_source && x->source_variance == 0) { + mi->segment_id = 0; + vp9_init_plane_quantizers(cpi, x); + } } #if CONFIG_VP9_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { - if (cpi->use_svc) { - int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - denoise_svc_pickmode = denoise_svc(cpi) && !lc->is_key_frame; - } + if (cpi->use_svc) denoise_svc_pickmode = vp9_denoise_svc_non_key(cpi); if (cpi->denoiser.denoising_level > kDenLowLow && denoise_svc_pickmode) vp9_denoiser_reset_frame_stats(ctx); } #endif - if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc && + if (cpi->rc.frames_since_golden == 0 && gf_temporal_ref && !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) { usable_ref_frame = LAST_FRAME; } else { @@ -1601,14 +1853,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // For svc mode, on spatial_layer_id > 0: if the reference has different scale // constrain the inter mode to only test zero motion. 
if (cpi->use_svc && svc->force_zero_mode_spatial_ref && - cpi->svc.spatial_layer_id > 0) { + svc->spatial_layer_id > 0 && !gf_temporal_ref) { if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; - if (vp9_is_scaled(sf)) svc_force_zero_mode[LAST_FRAME - 1] = 1; + if (vp9_is_scaled(sf)) { + svc_force_zero_mode[LAST_FRAME - 1] = 1; + inter_layer_ref = LAST_FRAME; + } } if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) { struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; - if (vp9_is_scaled(sf)) svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; + if (vp9_is_scaled(sf)) { + svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; + inter_layer_ref = GOLDEN_FRAME; + } } } @@ -1624,6 +1882,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + if (sf->disable_golden_ref && (x->content_state_sb != kVeryHighSad || + cpi->rc.avg_frame_low_motion < 60)) + usable_ref_frame = LAST_FRAME; + if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var)) use_golden_nonzeromv = 0; @@ -1638,7 +1900,21 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) comp_modes = 2; + // If the segment reference frame feature is enabled and it's set to GOLDEN + // reference, then make sure we don't skip checking GOLDEN, this is to + // prevent possibility of not picking any mode. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) { + usable_ref_frame = GOLDEN_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 0; + thresh_svc_skip_golden = 0; + } + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { + // Skip find_predictor if the reference frame is not in the + // ref_frame_flags (i.e., not used as a reference for this frame). 
+ skip_ref_find_pred[ref_frame] = + !(cpi->ref_frame_flags & flag_list[ref_frame]); if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, &ref_frame_skip_mask, flag_list, tile_data, mi_row, @@ -1652,11 +1928,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used // an averaging filter for downsampling (phase = 8). If so, we will test - // a nonzero motion mode on the spatial (goldeen) reference. + // a nonzero motion mode on the spatial reference. // The nonzero motion is half pixel shifted to left and top (-4, -4). - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && - svc_force_zero_mode[GOLDEN_FRAME - 1] && - cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) { + if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc_force_zero_mode[inter_layer_ref - 1] && + svc->downsample_filter_phase[svc->spatial_layer_id - 1] == 8 && + !gf_temporal_ref) { svc_mv_col = -4; svc_mv_row = -4; flag_svc_subpel = 1; @@ -1675,7 +1952,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int inter_mv_mode = 0; int skip_this_mv = 0; int comp_pred = 0; - int force_gf_mv = 0; + int force_mv_inter_layer = 0; PREDICTION_MODE this_mode; second_ref_frame = NONE; @@ -1699,8 +1976,19 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (ref_frame > usable_ref_frame) continue; if (skip_ref_find_pred[ref_frame]) continue; - if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) { - force_gf_mv = 1; + if (svc->previous_frame_is_intra_only) { + if (ref_frame != LAST_FRAME || frame_mv[this_mode][ref_frame].as_int != 0) + continue; + } + + // If the segment reference frame feature is enabled then do nothing if the + // current ref frame is not allowed. 
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) + continue; + + if (flag_svc_subpel && ref_frame == inter_layer_ref) { + force_mv_inter_layer = 1; // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), // otherwise set NEWMV to (svc_mv_col, svc_mv_row). if (this_mode == NEWMV) { @@ -1713,7 +2001,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (comp_pred) { - const struct segmentation *const seg = &cm->seg; if (!cpi->allow_comp_inter_inter) continue; // Skip compound inter modes if ARF is not available. if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; @@ -1728,8 +2015,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, sse_zeromv_normalized < thresh_svc_skip_golden) continue; + if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; + if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != NEARESTMV) { + (frame_mv[this_mode][ref_frame].as_int != 0 || + (cpi->oxcf.content == VP9E_CONTENT_SCREEN && !svc->spatial_layer_id && + !x->zero_temp_sad_source))) { continue; } @@ -1759,14 +2050,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; } - if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; - if (const_motion[ref_frame] && this_mode == NEARMV) continue; // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // later. 
- if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + if (!force_mv_inter_layer && force_skip_low_temp_var && + ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { continue; } @@ -1780,34 +2070,39 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (cpi->use_svc) { - if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] && + if (!force_mv_inter_layer && svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } - if (sf->reference_masking && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == LAST_FRAME)) { - if (usable_ref_frame < ALTREF_FRAME) { - if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { - i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i])) - if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) - ref_frame_skip_mask |= (1 << ref_frame); + // Disable this drop out case if the ref frame segment level feature is + // enabled for this segment. This is to prevent the possibility that we end + // up unable to pick any mode. + if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) { + if (sf->reference_masking && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == LAST_FRAME)) { + if (usable_ref_frame < ALTREF_FRAME) { + if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { + i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; + if ((cpi->ref_frame_flags & flag_list[i])) + if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + ref_frame_skip_mask |= (1 << ref_frame); + } + } else if (!cpi->rc.is_src_frame_alt_ref && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == ALTREF_FRAME)) { + int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; + int ref2 = (ref_frame == ALTREF_FRAME) ? 
LAST_FRAME : ALTREF_FRAME; + if (((cpi->ref_frame_flags & flag_list[ref1]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || + ((cpi->ref_frame_flags & flag_list[ref2]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) + ref_frame_skip_mask |= (1 << ref_frame); } - } else if (!cpi->rc.is_src_frame_alt_ref && - !(frame_mv[this_mode][ref_frame].as_int == 0 && - ref_frame == ALTREF_FRAME)) { - int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; - int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; - if (((cpi->ref_frame_flags & flag_list[ref1]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || - ((cpi->ref_frame_flags & flag_list[ref2]) && - (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) - ref_frame_skip_mask |= (1 << ref_frame); } + if (ref_frame_skip_mask & (1 << ref_frame)) continue; } - if (ref_frame_skip_mask & (1 << ref_frame)) continue; // Select prediction reference frames. for (i = 0; i < MAX_MB_PLANE; i++) { @@ -1820,8 +2115,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; - mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1 - : rd_threshes[mode_index]; + mode_rd_thresh = best_pickmode.best_mode_skip_txfm + ? rd_threshes[mode_index] << 1 + : rd_threshes[mode_index]; // Increase mode_rd_thresh value for GOLDEN_FRAME for improved encoding // speed with little/no subjective quality loss. 
@@ -1835,92 +2131,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->sf.adaptive_rd_thresh_row_mt && rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, &rd_thresh_freq_fact[mode_index]))) - continue; + if (frame_mv[this_mode][ref_frame].as_int != 0) continue; - if (this_mode == NEWMV && !force_gf_mv) { - if (ref_frame > LAST_FRAME && !cpi->use_svc && - cpi->oxcf.rc_mode == VPX_CBR) { - int tmp_sad; - uint32_t dis; - int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; - - if (bsize < BLOCK_16X16) continue; - - tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); - - if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) continue; - if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) - continue; - - frame_mv[NEWMV][ref_frame].as_int = mi->mv[0].as_int; - rate_mv = vp9_mv_bit_cost(&frame_mv[NEWMV][ref_frame].as_mv, - &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, - x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - frame_mv[NEWMV][ref_frame].as_mv.row >>= 3; - frame_mv[NEWMV][ref_frame].as_mv.col >>= 3; - - cpi->find_fractional_mv_step( - x, &frame_mv[NEWMV][ref_frame].as_mv, - &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv, - cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref_frame], NULL, 0, - 0); - } else if (svc->use_base_mv && svc->spatial_layer_id) { - if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { - const int pre_stride = xd->plane[0].pre[0].stride; - unsigned int base_mv_sse = UINT_MAX; - int scale = (cpi->rc.avg_frame_low_motion > 60) ? 
2 : 4; - const uint8_t *const pre_buf = - xd->plane[0].pre[0].buf + - (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + - (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); - cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, - pre_buf, pre_stride, &base_mv_sse); - - // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, - // for SVC encoding. - if (cpi->use_svc && cpi->svc.use_base_mv && bsize < BLOCK_16X16 && - frame_mv[NEWMV][ref_frame].as_mv.row == 0 && - frame_mv[NEWMV][ref_frame].as_mv.col == 0) - continue; - - // Exit NEWMV search if base_mv_sse is large. - if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) - continue; - if (base_mv_sse < (best_sse_sofar << 1)) { - // Base layer mv is good. - // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since - // (0, 0) mode is already tested. - unsigned int base_mv_sse_normalized = - base_mv_sse >> - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); - if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && - base_mv_sse_normalized < 400 && - frame_mv[NEWMV][ref_frame].as_mv.row == 0 && - frame_mv[NEWMV][ref_frame].as_mv.col == 0) - continue; - if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], &rate_mv, - best_rdc.rdcost, 1)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], - &rate_mv, best_rdc.rdcost, 0)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], - &rate_mv, best_rdc.rdcost, 0)) { - continue; - } - } else if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, - &frame_mv[NEWMV][ref_frame], &rate_mv, - best_rdc.rdcost, 0)) { + if (this_mode == NEWMV && !force_mv_inter_layer) { + if (search_new_mv(cpi, x, frame_mv, ref_frame, gf_temporal_ref, bsize, + mi_row, mi_col, best_pred_sad, &rate_mv, best_sse_sofar, + &best_rdc)) continue; - } } // TODO(jianj): 
Skipping the testing of (duplicate) non-zero motion vector @@ -1978,61 +2195,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search && (ref_frame == LAST_FRAME || - (ref_frame == GOLDEN_FRAME && !force_gf_mv && + (ref_frame == GOLDEN_FRAME && !force_mv_inter_layer && (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { - int pf_rate[3]; - int64_t pf_dist[3]; - int curr_rate[3]; - unsigned int pf_var[3]; - unsigned int pf_sse[3]; - TX_SIZE pf_tx_size[3]; - int64_t best_cost = INT64_MAX; - INTERP_FILTER best_filter = SWITCHABLE, filter; - PRED_BUFFER *current_pred = this_mode_pred; rd_computed = 1; - - for (filter = EIGHTTAP; filter <= EIGHTTAP_SMOOTH; ++filter) { - int64_t cost; - mi->interp_filter = filter; - vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); - model_rd_for_sb_y(cpi, bsize, x, xd, &pf_rate[filter], &pf_dist[filter], - &pf_var[filter], &pf_sse[filter]); - curr_rate[filter] = pf_rate[filter]; - pf_rate[filter] += vp9_get_switchable_rate(cpi, xd); - cost = RDCOST(x->rdmult, x->rddiv, pf_rate[filter], pf_dist[filter]); - pf_tx_size[filter] = mi->tx_size; - if (cost < best_cost) { - best_filter = filter; - best_cost = cost; - skip_txfm = x->skip_txfm[0]; - - if (reuse_inter_pred) { - if (this_mode_pred != current_pred) { - free_pred_buffer(this_mode_pred); - this_mode_pred = current_pred; - } - current_pred = &tmp[get_pred_buffer(tmp, 3)]; - pd->dst.buf = current_pred->data; - pd->dst.stride = bw; - } - } - } - - if (reuse_inter_pred && this_mode_pred != current_pred) - free_pred_buffer(current_pred); - - mi->interp_filter = best_filter; - mi->tx_size = pf_tx_size[best_filter]; - this_rdc.rate = curr_rate[best_filter]; - this_rdc.dist = pf_dist[best_filter]; - var_y = pf_var[best_filter]; - sse_y = pf_sse[best_filter]; - x->skip_txfm[0] = skip_txfm; - if (reuse_inter_pred) { - 
pd->dst.buf = this_mode_pred->data; - pd->dst.stride = this_mode_pred->stride; - } + search_filter_ref(cpi, x, &this_rdc, mi_row, mi_col, tmp, bsize, + reuse_inter_pred, &this_mode_pred, &var_y, &sse_y); } else { // For low motion content use x->sb_is_skin in addition to VeryHighSad // for setting large_block. @@ -2138,7 +2306,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skipping checking: test to see if this block can be reconstructed by // prediction only. - if (cpi->allow_encode_breakout) { + if (cpi->allow_encode_breakout && !xd->lossless && !scene_change_detected) { encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode, var_y, sse_y, yv12_mb, &this_rdc.rate, &this_rdc.dist, flag_preduv_computed); @@ -2165,17 +2333,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (this_rdc.rdcost < best_rdc.rdcost || x->skip) { best_rdc = this_rdc; - best_mode = this_mode; - best_pred_filter = mi->interp_filter; - best_tx_size = mi->tx_size; - best_ref_frame = ref_frame; - best_mode_skip_txfm = x->skip_txfm[0]; best_early_term = this_early_term; - best_second_ref_frame = second_ref_frame; + best_pickmode.best_mode = this_mode; + best_pickmode.best_pred_filter = mi->interp_filter; + best_pickmode.best_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = ref_frame; + best_pickmode.best_mode_skip_txfm = x->skip_txfm[0]; + best_pickmode.best_second_ref_frame = second_ref_frame; if (reuse_inter_pred) { - free_pred_buffer(best_pred); - best_pred = this_mode_pred; + free_pred_buffer(best_pickmode.best_pred); + best_pickmode.best_pred = this_mode_pred; } } else { if (reuse_inter_pred) free_pred_buffer(this_mode_pred); @@ -2185,38 +2353,50 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // If early termination flag is 1 and at least 2 modes are checked, // the mode search is terminated. 
- if (best_early_term && idx > 0) { + if (best_early_term && idx > 0 && !scene_change_detected) { x->skip = 1; break; } } - mi->mode = best_mode; - mi->interp_filter = best_pred_filter; - mi->tx_size = best_tx_size; - mi->ref_frame[0] = best_ref_frame; - mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; + mi->mode = best_pickmode.best_mode; + mi->interp_filter = best_pickmode.best_pred_filter; + mi->tx_size = best_pickmode.best_tx_size; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->mv[0].as_int = + frame_mv[best_pickmode.best_mode][best_pickmode.best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int; - x->skip_txfm[0] = best_mode_skip_txfm; - mi->ref_frame[1] = best_second_ref_frame; + x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; // For spatial enhancemanent layer: perform intra prediction only if base // layer is chosen as the reference. Always perform intra prediction if - // LAST is the only reference or is_key_frame is set. - if (cpi->svc.spatial_layer_id) { + // LAST is the only reference, or is_key_frame is set, or on base + // temporal layer. 
+ if (svc->spatial_layer_id && !gf_temporal_ref) { perform_intra_pred = - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame || + svc->temporal_layer_id == 0 || + svc->layer_context[svc->temporal_layer_id].is_key_frame || !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) || - (!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame && - svc_force_zero_mode[best_ref_frame - 1]); + (!svc->layer_context[svc->temporal_layer_id].is_key_frame && + svc_force_zero_mode[best_pickmode.best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } - if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && - cpi->rc.is_src_frame_alt_ref) + if ((cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) || + svc->previous_frame_is_intra_only) perform_intra_pred = 0; + + // If the segment reference frame feature is enabled and set then + // skip the intra prediction. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) && + get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0) + perform_intra_pred = 0; + // Perform intra prediction search, if the best SAD is above a certain // threshold. 
if (best_rdc.rdcost == INT64_MAX || + (scene_change_detected && perform_intra_pred) || ((!force_skip_low_temp_var || bsize < BLOCK_32X32 || x->content_state_sb == kVeryHighSad) && perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh && @@ -2224,7 +2404,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, !x->lowvar_highsumdiff)) { struct estimate_block_intra_args args = { cpi, x, DC_PRED, 1, 0 }; int i; - TX_SIZE best_intra_tx_size = TX_SIZES; + PRED_BUFFER *const best_pred = best_pickmode.best_pred; TX_SIZE intra_tx_size = VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cpi->common.tx_mode]); @@ -2249,7 +2429,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_mode_pred->data, this_mode_pred->stride, NULL, 0, 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH - best_pred = this_mode_pred; + best_pickmode.best_pred = this_mode_pred; } } pd->dst = orig_dst; @@ -2309,36 +2489,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (this_rdc.rdcost < best_rdc.rdcost) { best_rdc = this_rdc; - best_mode = this_mode; - best_intra_tx_size = mi->tx_size; - best_ref_frame = INTRA_FRAME; - best_second_ref_frame = NONE; + best_pickmode.best_mode = this_mode; + best_pickmode.best_intra_tx_size = mi->tx_size; + best_pickmode.best_ref_frame = INTRA_FRAME; + best_pickmode.best_second_ref_frame = NONE; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; mi->mv[1].as_int = INVALID_MV; - best_mode_skip_txfm = x->skip_txfm[0]; + best_pickmode.best_mode_skip_txfm = x->skip_txfm[0]; } } // Reset mb_mode_info to the best inter mode. 
- if (best_ref_frame != INTRA_FRAME) { - mi->tx_size = best_tx_size; + if (best_pickmode.best_ref_frame != INTRA_FRAME) { + mi->tx_size = best_pickmode.best_tx_size; } else { - mi->tx_size = best_intra_tx_size; + mi->tx_size = best_pickmode.best_intra_tx_size; } } pd->dst = orig_dst; - mi->mode = best_mode; - mi->ref_frame[0] = best_ref_frame; - mi->ref_frame[1] = best_second_ref_frame; - x->skip_txfm[0] = best_mode_skip_txfm; + mi->mode = best_pickmode.best_mode; + mi->ref_frame[0] = best_pickmode.best_ref_frame; + mi->ref_frame[1] = best_pickmode.best_second_ref_frame; + x->skip_txfm[0] = best_pickmode.best_mode_skip_txfm; if (!is_inter_block(mi)) { mi->interp_filter = SWITCHABLE_FILTERS; } - if (reuse_inter_pred && best_pred != NULL) { + if (reuse_inter_pred && best_pickmode.best_pred != NULL) { + PRED_BUFFER *const best_pred = best_pickmode.best_pred; if (best_pred->data != orig_dst.buf && is_inter_mode(mi->mode)) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) @@ -2367,25 +2548,26 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Remove this condition when the issue is resolved. 
if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1; vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost, - frame_mv, reuse_inter_pred, best_tx_size, - best_mode, best_ref_frame, best_pred_filter, - best_mode_skip_txfm); - vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision); + frame_mv, reuse_inter_pred, &best_pickmode); + vp9_denoiser_denoise(cpi, x, mi_row, mi_col, bsize, ctx, &decision, + gf_temporal_ref); recheck_zeromv_after_denoising(cpi, mi, x, xd, decision, &ctx_den, yv12_mb, &best_rdc, bsize, mi_row, mi_col); - best_ref_frame = ctx_den.best_ref_frame; + best_pickmode.best_ref_frame = ctx_den.best_ref_frame; } #endif - if (best_ref_frame == ALTREF_FRAME || best_second_ref_frame == ALTREF_FRAME) + if (best_pickmode.best_ref_frame == ALTREF_FRAME || + best_pickmode.best_second_ref_frame == ALTREF_FRAME) x->arf_frame_usage++; - else if (best_ref_frame != INTRA_FRAME) + else if (best_pickmode.best_ref_frame != INTRA_FRAME) x->lastgolden_frame_usage++; if (cpi->sf.adaptive_rd_thresh) { - THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)]; + THR_MODES best_mode_idx = + mode_idx[best_pickmode.best_ref_frame][mode_offset(mi->mode)]; - if (best_ref_frame == INTRA_FRAME) { + if (best_pickmode.best_ref_frame == INTRA_FRAME) { // Only consider the modes that are included in the intra_mode_list. 
int intra_modes = sizeof(intra_mode_list) / sizeof(PREDICTION_MODE); int i; @@ -2405,7 +2587,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } else { for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ++ref_frame) { PREDICTION_MODE this_mode; - if (best_ref_frame != ref_frame) continue; + if (best_pickmode.best_ref_frame != ref_frame) continue; for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { if (cpi->sf.adaptive_rd_thresh_row_mt) update_thresh_freq_fact_row_mt(cpi, tile_data, x->source_variance, @@ -2585,9 +2767,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, x, &tmp_mv, &mbmi_ext->ref_mvs[ref_frame][0].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, - cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, - &dummy_dist, &x->pred_sse[ref_frame], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dummy_dist, + &x->pred_sse[ref_frame], NULL, 0, 0, + cpi->sf.use_accurate_subpel_search); xd->mi[0]->bmi[i].as_mv[0].as_mv = tmp_mv; } else { diff --git a/libvpx/vp9/encoder/vp9_pickmode.h b/libvpx/vp9/encoder/vp9_pickmode.h index 9aa00c4fa..15207e6cf 100644 --- a/libvpx/vp9/encoder/vp9_pickmode.h +++ b/libvpx/vp9/encoder/vp9_pickmode.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_PICKMODE_H_ -#define VP9_ENCODER_VP9_PICKMODE_H_ +#ifndef VPX_VP9_ENCODER_VP9_PICKMODE_H_ +#define VPX_VP9_ENCODER_VP9_PICKMODE_H_ #include "vp9/encoder/vp9_encoder.h" @@ -32,4 +32,4 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_PICKMODE_H_ +#endif // VPX_VP9_ENCODER_VP9_PICKMODE_H_ diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index 09f61ead2..75f3a8ba7 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -204,10 +204,9 @@ static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); case VPX_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); - case VPX_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80); default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + return q == 0 ? 64 : (quant < 2368 ? 84 : 80); } #else (void)bit_depth; @@ -221,13 +220,20 @@ void vp9_init_quantizer(VP9_COMP *cpi) { int i, q, quant; for (q = 0; q < QINDEX_RANGE; q++) { - const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); - const int qrounding_factor = q == 0 ? 64 : 48; + int qzbin_factor = get_qzbin_factor(q, cm->bit_depth); + int qrounding_factor = q == 0 ? 64 : 48; + const int sharpness_adjustment = 16 * (7 - cpi->oxcf.sharpness) / 7; + + if (cpi->oxcf.sharpness > 0 && q > 0) { + qzbin_factor = 64 + sharpness_adjustment; + qrounding_factor = 64 - sharpness_adjustment; + } for (i = 0; i < 2; ++i) { int qrounding_factor_fp = i == 0 ? 48 : 42; if (q == 0) qrounding_factor_fp = 64; - + if (cpi->oxcf.sharpness > 0) + qrounding_factor_fp = 64 - sharpness_adjustment; // y quant = i == 0 ? 
vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth) : vp9_ac_quant(q, 0, cm->bit_depth); diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h index 61320361b..ed9b84958 100644 --- a/libvpx/vp9/encoder/vp9_quantize.h +++ b/libvpx/vp9/encoder/vp9_quantize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_QUANTIZE_H_ -#define VP9_ENCODER_VP9_QUANTIZE_H_ +#ifndef VPX_VP9_ENCODER_VP9_QUANTIZE_H_ +#define VPX_VP9_ENCODER_VP9_QUANTIZE_H_ #include "./vpx_config.h" #include "vp9/encoder/vp9_block.h" @@ -59,4 +59,4 @@ int vp9_qindex_to_quantizer(int qindex); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_QUANTIZE_H_ +#endif // VPX_VP9_ENCODER_VP9_QUANTIZE_H_ diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c index b7f3a0e89..5ad68e2e5 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libvpx/vp9/encoder/vp9_ratectrl.c @@ -31,10 +31,13 @@ #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_ratectrl.h" -// Max rate target for 1080P and below encodes under normal circumstances -// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB +// Max rate per frame for 1080P and below encodes if no level requirement given. +// For larger formats limit to MAX_MB_RATE bits per MB +// 4Mbits is derived from the level requirement for level 4 (1080P 30) which +// requires that HW can sustain a rate of 16Mbits over a 4 frame group. +// If a lower level requirement is specified then this may over ride this value. 
#define MAX_MB_RATE 250 -#define MAXRATE_1080P 2025000 +#define MAXRATE_1080P 4000000 #define DEFAULT_KF_BOOST 2000 #define DEFAULT_GF_BOOST 2000 @@ -45,18 +48,16 @@ #define MAX_BPB_FACTOR 50 #if CONFIG_VP9_HIGHBITDEPTH -#define ASSIGN_MINQ_TABLE(bit_depth, name) \ - do { \ - switch (bit_depth) { \ - case VPX_BITS_8: name = name##_8; break; \ - case VPX_BITS_10: name = name##_10; break; \ - case VPX_BITS_12: name = name##_12; break; \ - default: \ - assert(0 && \ - "bit_depth should be VPX_BITS_8, VPX_BITS_10" \ - " or VPX_BITS_12"); \ - name = NULL; \ - } \ +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case VPX_BITS_8: name = name##_8; break; \ + case VPX_BITS_10: name = name##_10; break; \ + default: \ + assert(bit_depth == VPX_BITS_12); \ + name = name##_12; \ + break; \ + } \ } while (0) #else #define ASSIGN_MINQ_TABLE(bit_depth, name) \ @@ -97,8 +98,8 @@ static int kf_low = 400; #else static int gf_high = 2000; static int gf_low = 400; -static int kf_high = 5000; -static int kf_low = 400; +static int kf_high = 4800; +static int kf_low = 300; #endif // Functions to compute the active minq lookup table entries based on a @@ -128,7 +129,7 @@ static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low, for (i = 0; i < QINDEX_RANGE; i++) { const double maxq = vp9_convert_qindex_to_q(i, bit_depth); kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth); - kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth); #ifdef AGGRESSIVE_VBR arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.275, bit_depth); inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.80, bit_depth); @@ -164,10 +165,9 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; case VPX_BITS_10: return vp9_ac_quant(qindex, 0, 
bit_depth) / 16.0; - case VPX_BITS_12: return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1.0; + assert(bit_depth == VPX_BITS_12); + return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; } #else return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; @@ -247,20 +247,65 @@ int vp9_rc_clamp_iframe_target_size(const VP9_COMP *const cpi, int target) { return target; } +// Update the buffer level before encoding with the per-frame-bandwidth, +static void update_buffer_level_preencode(VP9_COMP *cpi) { + RATE_CONTROL *const rc = &cpi->rc; + rc->bits_off_target += rc->avg_frame_bandwidth; + // Clip the buffer level to the maximum specified buffer size. + rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size); + rc->buffer_level = rc->bits_off_target; +} + +// Update the buffer level before encoding with the per-frame-bandwidth +// for SVC. The current and all upper temporal layers are updated, needed +// for the layered rate control which involves cumulative buffer levels for +// the temporal layers. Allow for using the timestamp(pts) delta for the +// framerate when the set_ref_frame_config is used. +static void update_buffer_level_svc_preencode(VP9_COMP *cpi) { + SVC *const svc = &cpi->svc; + int i; + // Set this to 1 to use timestamp delta for "framerate" under + // ref_frame_config usage. 
+ int use_timestamp = 1; + const int64_t ts_delta = + svc->time_stamp_superframe - svc->time_stamp_prev[svc->spatial_layer_id]; + for (i = svc->temporal_layer_id; i < svc->number_temporal_layers; ++i) { + const int layer = + LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + if (use_timestamp && cpi->svc.use_set_ref_frame_config && + svc->number_temporal_layers == 1 && ts_delta > 0 && + svc->current_superframe > 0) { + // TODO(marpan): This may need to be modified for temporal layers. + const double framerate_pts = 10000000.0 / ts_delta; + lrc->bits_off_target += (int)(lc->target_bandwidth / framerate_pts); + } else { + lrc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); + } + // Clip buffer level to maximum buffer size for the layer. + lrc->bits_off_target = + VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); + lrc->buffer_level = lrc->bits_off_target; + if (i == svc->temporal_layer_id) { + cpi->rc.bits_off_target = lrc->bits_off_target; + cpi->rc.buffer_level = lrc->buffer_level; + } + } +} + // Update the buffer level for higher temporal layers, given the encoded current // temporal layer. 
-static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { +static void update_layer_buffer_level_postencode(SVC *svc, + int encoded_frame_size) { int i = 0; - int current_temporal_layer = svc->temporal_layer_id; + const int current_temporal_layer = svc->temporal_layer_id; for (i = current_temporal_layer + 1; i < svc->number_temporal_layers; ++i) { const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); LAYER_CONTEXT *lc = &svc->layer_context[layer]; RATE_CONTROL *lrc = &lc->rc; - int bits_off_for_this_layer = - (int)(lc->target_bandwidth / lc->framerate - encoded_frame_size); - lrc->bits_off_target += bits_off_for_this_layer; - + lrc->bits_off_target -= encoded_frame_size; // Clip buffer level to maximum buffer size for the layer. lrc->bits_off_target = VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); @@ -268,21 +313,13 @@ static void update_layer_buffer_level(SVC *svc, int encoded_frame_size) { } } -// Update the buffer level: leaky bucket model. -static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { - const VP9_COMMON *const cm = &cpi->common; +// Update the buffer level after encoding with encoded frame size. +static void update_buffer_level_postencode(VP9_COMP *cpi, + int encoded_frame_size) { RATE_CONTROL *const rc = &cpi->rc; - - // Non-viewable frames are a special case and are treated as pure overhead. - if (!cm->show_frame) { - rc->bits_off_target -= encoded_frame_size; - } else { - rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size; - } - + rc->bits_off_target -= encoded_frame_size; // Clip the buffer level to the maximum specified buffer size. rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size); - // For screen-content mode, and if frame-dropper is off, don't let buffer // level go below threshold, given here as -rc->maximum_ buffer_size. 
if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && @@ -292,7 +329,7 @@ static void update_buffer_level(VP9_COMP *cpi, int encoded_frame_size) { rc->buffer_level = rc->bits_off_target; if (is_one_pass_cbr_svc(cpi)) { - update_layer_buffer_level(&cpi->svc, encoded_frame_size); + update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size); } } @@ -355,6 +392,9 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->high_source_sad = 0; rc->reset_high_source_sad = 0; rc->high_source_sad_lagindex = -1; + rc->high_num_blocks_with_motion = 0; + rc->hybrid_intra_scene_change = 0; + rc->re_encode_maxq_scene_change = 0; rc->alt_ref_gf_group = 0; rc->last_frame_is_src_altref = 0; rc->fac_active_worst_inter = 150; @@ -377,6 +417,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { rc->rate_correction_factors[i] = 1.0; + rc->damped_adjustment[i] = 0; } rc->min_gf_interval = oxcf->min_gf_interval; @@ -388,27 +429,110 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( oxcf->init_framerate, rc->min_gf_interval); rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2; + + rc->force_max_q = 0; + rc->last_post_encode_dropped_scene_change = 0; + rc->use_post_encode_drop = 0; + rc->ext_use_post_encode_drop = 0; } -int vp9_rc_drop_frame(VP9_COMP *cpi) { +static int check_buffer_above_thresh(VP9_COMP *cpi, int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->use_svc || cpi->svc.framedrop_mode != FULL_SUPERFRAME_DROP) { + RATE_CONTROL *const rc = &cpi->rc; + return (rc->buffer_level > drop_mark); + } else { + int i; + // For SVC in the FULL_SUPERFRAME_DROP): the condition on + // buffer (if its above threshold, so no drop) is checked on current and + // upper spatial layers. If any spatial layer is not above threshold then + // we return 0. 
+ for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + // Exclude check for layer whose bitrate is 0. + if (lc->target_bandwidth > 0) { + const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * + lrc->optimal_buffer_level / 100); + if (!(lrc->buffer_level > drop_mark_layer)) return 0; + } + } + return 1; + } +} + +static int check_buffer_below_thresh(VP9_COMP *cpi, int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->use_svc || cpi->svc.framedrop_mode == LAYER_DROP) { + RATE_CONTROL *const rc = &cpi->rc; + return (rc->buffer_level <= drop_mark); + } else { + int i; + // For SVC in the constrained framedrop mode (svc->framedrop_mode = + // CONSTRAINED_LAYER_DROP or FULL_SUPERFRAME_DROP): the condition on + // buffer (if its below threshold, so drop frame) is checked on current + // and upper spatial layers. For FULL_SUPERFRAME_DROP mode if any + // spatial layer is <= threshold, then we return 1 (drop). + for (i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + // Exclude check for layer whose bitrate is 0. 
+ if (lc->target_bandwidth > 0) { + const int drop_mark_layer = (int)(cpi->svc.framedrop_thresh[i] * + lrc->optimal_buffer_level / 100); + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) { + if (lrc->buffer_level <= drop_mark_layer) return 1; + } else { + if (!(lrc->buffer_level <= drop_mark_layer)) return 0; + } + } + } + if (cpi->svc.framedrop_mode == FULL_SUPERFRAME_DROP) + return 0; + else + return 1; + } +} + +static int drop_frame(VP9_COMP *cpi) { const VP9EncoderConfig *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; - if (!oxcf->drop_frames_water_mark || - (is_one_pass_cbr_svc(cpi) && - cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode)) { + SVC *svc = &cpi->svc; + int drop_frames_water_mark = oxcf->drop_frames_water_mark; + if (cpi->use_svc) { + // If we have dropped max_consec_drop frames, then we don't + // drop this spatial layer, and reset counter to 0. + if (svc->drop_count[svc->spatial_layer_id] == svc->max_consec_drop) { + svc->drop_count[svc->spatial_layer_id] = 0; + return 0; + } else { + drop_frames_water_mark = svc->framedrop_thresh[svc->spatial_layer_id]; + } + } + if (!drop_frames_water_mark || + (svc->spatial_layer_id > 0 && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { return 0; } else { - if (rc->buffer_level < 0) { + if ((rc->buffer_level < 0 && svc->framedrop_mode != FULL_SUPERFRAME_DROP) || + (check_buffer_below_thresh(cpi, -1) && + svc->framedrop_mode == FULL_SUPERFRAME_DROP)) { // Always drop if buffer is below 0. return 1; } else { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. 
int drop_mark = - (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100); - if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { + (int)(drop_frames_water_mark * rc->optimal_buffer_level / 100); + if (check_buffer_above_thresh(cpi, drop_mark) && + (rc->decimation_factor > 0)) { --rc->decimation_factor; - } else if (rc->buffer_level <= drop_mark && rc->decimation_factor == 0) { + } else if (check_buffer_below_thresh(cpi, drop_mark) && + rc->decimation_factor == 0) { rc->decimation_factor = 1; } if (rc->decimation_factor > 0) { @@ -427,11 +551,129 @@ int vp9_rc_drop_frame(VP9_COMP *cpi) { } } +int post_encode_drop_cbr(VP9_COMP *cpi, size_t *size) { + size_t frame_size = *size << 3; + int64_t new_buffer_level = + cpi->rc.buffer_level + cpi->rc.avg_frame_bandwidth - (int64_t)frame_size; + + // For now we drop if new buffer level (given the encoded frame size) goes + // below 0. + if (new_buffer_level < 0) { + *size = 0; + vp9_rc_postencode_update_drop_frame(cpi); + // Update flag to use for next frame. + if (cpi->rc.high_source_sad || + (cpi->use_svc && cpi->svc.high_source_sad_superframe)) + cpi->rc.last_post_encode_dropped_scene_change = 1; + // Force max_q on next fame. + cpi->rc.force_max_q = 1; + cpi->rc.avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality; + cpi->last_frame_dropped = 1; + cpi->ext_refresh_frame_flags_pending = 0; + if (cpi->use_svc) { + SVC *svc = &cpi->svc; + int sl = 0; + int tl = 0; + svc->last_layer_dropped[svc->spatial_layer_id] = 1; + svc->drop_spatial_layer[svc->spatial_layer_id] = 1; + svc->drop_count[svc->spatial_layer_id]++; + svc->skip_enhancement_layer = 1; + // Postencode drop is only checked on base spatial layer, + // for now if max-q is set on base we force it on all layers. 
+ for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->force_max_q = 1; + lrc->avg_frame_qindex[INTER_FRAME] = cpi->rc.worst_quality; + } + } + } + return 1; + } + + cpi->rc.force_max_q = 0; + cpi->rc.last_post_encode_dropped_scene_change = 0; + return 0; +} + +int vp9_rc_drop_frame(VP9_COMP *cpi) { + SVC *svc = &cpi->svc; + int svc_prev_layer_dropped = 0; + // In the constrained or full_superframe framedrop mode for svc + // (framedrop_mode != LAYER_DROP), if the previous spatial layer was + // dropped, drop the current spatial layer. + if (cpi->use_svc && svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) + svc_prev_layer_dropped = 1; + if ((svc_prev_layer_dropped && svc->framedrop_mode != LAYER_DROP) || + drop_frame(cpi)) { + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + if (cpi->use_svc) { + svc->last_layer_dropped[svc->spatial_layer_id] = 1; + svc->drop_spatial_layer[svc->spatial_layer_id] = 1; + svc->drop_count[svc->spatial_layer_id]++; + svc->skip_enhancement_layer = 1; + if (svc->framedrop_mode == LAYER_DROP || + svc->drop_spatial_layer[0] == 0) { + // For the case of constrained drop mode where the base is dropped + // (drop_spatial_layer[0] == 1), which means full superframe dropped, + // we don't increment the svc frame counters. In particular temporal + // layer counter (which is incremented in vp9_inc_frame_in_layer()) + // won't be incremented, so on a dropped frame we try the same + // temporal_layer_id on next incoming frame. This is to avoid an + // issue with temporal alignement with full superframe dropping. 
+ vp9_inc_frame_in_layer(cpi); + } + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { + int i; + int all_layers_drop = 1; + for (i = 0; i < svc->spatial_layer_id; i++) { + if (svc->drop_spatial_layer[i] == 0) { + all_layers_drop = 0; + break; + } + } + if (all_layers_drop == 1) svc->skip_enhancement_layer = 0; + } + } + return 1; + } + return 0; +} + +static int adjust_q_cbr(const VP9_COMP *cpi, int q) { + // This makes sure q is between oscillating Qs to prevent resonance. + if (!cpi->rc.reset_high_source_sad && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && + (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && + cpi->rc.q_1_frame != cpi->rc.q_2_frame) { + int qclamp = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), + VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); + // If the previous frame had overshoot and the current q needs to increase + // above the clamped value, reduce the clamp for faster reaction to + // overshoot. + if (cpi->rc.rc_1_frame == -1 && q > qclamp) + q = (q + qclamp) >> 1; + else + q = qclamp; + } + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + vp9_cyclic_refresh_limit_q(cpi, &q); + return q; +} + static double get_rate_correction_factor(const VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; + const VP9_COMMON *const cm = &cpi->common; double rcf; - if (cpi->common.frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { rcf = rc->rate_correction_factors[KF_STD]; } else if (cpi->oxcf.pass == 2) { RATE_FACTOR_LEVEL rf_lvl = @@ -451,13 +693,14 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) { static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { RATE_CONTROL *const rc = &cpi->rc; + const VP9_COMMON *const cm = &cpi->common; // Normalize RCF to account for the size-dependent scaling factor. 
factor /= rcf_mult[cpi->rc.frame_size_selector]; factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR); - if (cpi->common.frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { rc->rate_correction_factors[KF_STD] = factor; } else if (cpi->oxcf.pass == 2) { RATE_FACTOR_LEVEL rf_lvl = @@ -478,6 +721,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { int correction_factor = 100; double rate_correction_factor = get_rate_correction_factor(cpi); double adjustment_limit; + RATE_FACTOR_LEVEL rf_lvl = + cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index]; int projected_size_based_on_q = 0; @@ -494,8 +739,9 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { projected_size_based_on_q = vp9_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor); } else { + FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type; projected_size_based_on_q = - vp9_estimate_bits_at_q(cpi->common.frame_type, cm->base_qindex, cm->MBs, + vp9_estimate_bits_at_q(frame_type, cm->base_qindex, cm->MBs, rate_correction_factor, cm->bit_depth); } // Work out a size correction factor. @@ -503,10 +749,16 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi) { correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) / projected_size_based_on_q); - // More heavily damped adjustment used if we have been oscillating either side - // of target. - adjustment_limit = - 0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor))); + // Do not use damped adjustment for the first frame of each frame type + if (!cpi->rc.damped_adjustment[rf_lvl]) { + adjustment_limit = 1.0; + cpi->rc.damped_adjustment[rf_lvl] = 1; + } else { + // More heavily damped adjustment used if we have been oscillating either + // side of target. 
+ adjustment_limit = + 0.25 + 0.5 * VPXMIN(1, fabs(log10(0.01 * correction_factor))); + } cpi->rc.q_2_frame = cpi->rc.q_1_frame; cpi->rc.q_1_frame = cm->base_qindex; @@ -569,8 +821,9 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, bits_per_mb_at_this_q = (int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor); } else { + FRAME_TYPE frame_type = cm->intra_only ? KEY_FRAME : cm->frame_type; bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb( - cm->frame_type, i, correction_factor, cm->bit_depth); + frame_type, i, correction_factor, cm->bit_depth); } if (bits_per_mb_at_this_q <= target_bits_per_mb) { @@ -585,16 +838,9 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, } } while (++i <= active_worst_quality); - // In CBR mode, this makes sure q is between oscillating Qs to prevent - // resonance. - if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad && - (!cpi->oxcf.gf_cbr_boost_pct || - !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && - (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && - cpi->rc.q_1_frame != cpi->rc.q_2_frame) { - q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), - VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); - } + // Adjustment to q for CBR mode. + if (cpi->oxcf.rc_mode == VPX_CBR) return adjust_q_cbr(cpi, q); + return q; } @@ -623,13 +869,19 @@ static int get_kf_active_quality(const RATE_CONTROL *const rc, int q, kf_low_motion_minq, kf_high_motion_minq); } -static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, +static int get_gf_active_quality(const VP9_COMP *const cpi, int q, vpx_bit_depth_t bit_depth) { + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; + const RATE_CONTROL *const rc = &cpi->rc; + int *arfgf_low_motion_minq; int *arfgf_high_motion_minq; + const int gfu_boost = cpi->multi_layer_arf + ? 
gf_group->gfu_boost[gf_group->index] + : rc->gfu_boost; ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); - return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, + return get_active_quality(q, gfu_boost, gf_low, gf_high, arfgf_low_motion_minq, arfgf_high_motion_minq); } @@ -674,7 +926,7 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { int active_worst_quality; int ambient_qp; unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers; - if (cm->frame_type == KEY_FRAME || rc->reset_high_source_sad) + if (frame_is_intra_only(cm) || rc->reset_high_source_sad || rc->force_max_q) return rc->worst_quality; // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] // for the first few frames following key frame. These are both initialized @@ -685,6 +937,7 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { ? VPXMIN(rc->avg_frame_qindex[INTER_FRAME], rc->avg_frame_qindex[KEY_FRAME]) : rc->avg_frame_qindex[INTER_FRAME]; + active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 5) >> 2); // For SVC if the current base spatial layer was key frame, use the QP from // that base layer for ambient_qp. if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { @@ -694,13 +947,15 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { if (lc->is_key_frame) { const RATE_CONTROL *lrc = &lc->rc; ambient_qp = VPXMIN(ambient_qp, lrc->last_q[KEY_FRAME]); + active_worst_quality = VPXMIN(rc->worst_quality, (ambient_qp * 9) >> 3); } } - active_worst_quality = VPXMIN(rc->worst_quality, ambient_qp * 5 >> 2); if (rc->buffer_level > rc->optimal_buffer_level) { // Adjust down. - // Maximum limit for down adjustment, ~30%. + // Maximum limit for down adjustment ~30%; make it lower for screen content. 
int max_adjustment_down = active_worst_quality / 3; + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + max_adjustment_down = active_worst_quality >> 3; if (max_adjustment_down) { buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) / max_adjustment_down); @@ -779,7 +1034,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, } else { q = active_worst_quality; } - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); } else { // Use the lower of active_worst_quality and recent/average Q. if (cm->current_video_frame > 1) { @@ -804,21 +1059,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, *top_index = active_worst_quality; *bottom_index = active_best_quality; -#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY - // Limit Q range for the adaptive loop. - if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced && - !(cm->current_video_frame == 0)) { - int qdelta = 0; - vpx_clear_system_state(); - qdelta = vp9_compute_qdelta_by_rate( - &cpi->rc, cm->frame_type, active_worst_quality, 2.0, cm->bit_depth); - *top_index = active_worst_quality + qdelta; - *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; - } -#endif - // Special case code to try and match quality with forced key frames - if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) { + if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { q = rc->last_boosted_qindex; } else { q = vp9_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality, @@ -939,7 +1181,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); // Constrained quality use slightly lower active best. 
active_best_quality = active_best_quality * 15 / 16; @@ -954,7 +1196,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); } } else { if (oxcf->rc_mode == VPX_Q) { @@ -1045,78 +1287,143 @@ int vp9_frame_type_qdelta(const VP9_COMP *cpi, int rf_level, int q) { 1.75, // GF_ARF_STD 2.00, // KF_STD }; - static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] = { - INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME - }; const VP9_COMMON *const cm = &cpi->common; - int qdelta = - vp9_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level], q, - rate_factor_deltas[rf_level], cm->bit_depth); + + int qdelta = vp9_compute_qdelta_by_rate( + &cpi->rc, cm->frame_type, q, rate_factor_deltas[rf_level], cm->bit_depth); return qdelta; } #define STATIC_MOTION_THRESH 95 -static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, - int *top_index) { + +static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index, + int *top_index) { const VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf); int active_best_quality; int active_worst_quality = cpi->twopass.active_worst_quality; - int q; - int *inter_minq; - ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); - if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) { + if (rc->this_key_frame_forced) { // Handle the special case for key frames forced when we have reached // the maximum key frame interval. Here force the Q to a range // based on the ambient Q to reduce the risk of popping. 
- if (rc->this_key_frame_forced) { - double last_boosted_q; - int delta_qindex; - int qindex; - - if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { - qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex); - active_best_quality = qindex; - last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); - delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 1.25, cm->bit_depth); - active_worst_quality = - VPXMIN(qindex + delta_qindex, active_worst_quality); - } else { - qindex = rc->last_boosted_qindex; - last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); - delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 0.75, cm->bit_depth); - active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); - } + double last_boosted_q; + int delta_qindex; + int qindex; + + if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { + qindex = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex); + active_best_quality = qindex; + last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 1.25, cm->bit_depth); + active_worst_quality = + VPXMIN(qindex + delta_qindex, active_worst_quality); } else { - // Not forced keyframe. - double q_adj_factor = 1.0; - double q_val; - // Baseline value derived from cpi->active_worst_quality and kf boost. - active_best_quality = - get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); + qindex = rc->last_boosted_qindex; + last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, + last_boosted_q * 0.75, cm->bit_depth); + active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); + } + } else { + // Not forced keyframe. + double q_adj_factor = 1.0; + double q_val; + // Baseline value derived from cpi->active_worst_quality and kf boost. 
+ active_best_quality = + get_kf_active_quality(rc, active_worst_quality, cm->bit_depth); + if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) { + active_best_quality /= 4; + } - // Allow somewhat lower kf minq with small image formats. - if ((cm->width * cm->height) <= (352 * 288)) { - q_adj_factor -= 0.25; - } + // Dont allow the active min to be lossless (q0) unlesss the max q + // already indicates lossless. + active_best_quality = + VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality)); - // Make a further adjustment based on the kf zero motion measure. - q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); + // Allow somewhat lower kf minq with small image formats. + if ((cm->width * cm->height) <= (352 * 288)) { + q_adj_factor -= 0.25; + } - // Convert the adjustment factor to a qindex delta - // on active_best_quality. - q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); - active_best_quality += - vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + // Make a further adjustment based on the kf zero motion measure. + q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct); + + // Convert the adjustment factor to a qindex delta + // on active_best_quality. 
+ q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); + active_best_quality += + vp9_compute_qdelta(rc, q_val, q_val * q_adj_factor, cm->bit_depth); + } + *top_index = active_worst_quality; + *bottom_index = active_best_quality; +} + +static int rc_constant_q(const VP9_COMP *cpi, int *bottom_index, int *top_index, + int gf_group_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + const int is_intra_frame = frame_is_intra_only(cm); + + const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf); + + int q = cq_level; + int active_best_quality = cq_level; + int active_worst_quality = cq_level; + + // Key frame qp decision + if (is_intra_frame && rc->frames_to_key > 1) + pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality); + + // ARF / GF qp decision + if (!is_intra_frame && !rc->is_src_frame_alt_ref && + cpi->refresh_alt_ref_frame) { + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); + + // Modify best quality for second level arfs. For mode VPX_Q this + // becomes the baseline frame q. + if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) { + const int layer_depth = gf_group->layer_depth[gf_group_index]; + // linearly fit the frame q depending on the layer depth index from + // the base layer ARF. 
+ active_best_quality = ((layer_depth - 1) * cq_level + + active_best_quality + layer_depth / 2) / + layer_depth; } - } else if (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { + } + + q = active_best_quality; + *top_index = active_worst_quality; + *bottom_index = active_best_quality; + return q; +} + +static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, + int *top_index, int gf_group_index) { + const VP9_COMMON *const cm = &cpi->common; + const RATE_CONTROL *const rc = &cpi->rc; + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + const int cq_level = get_active_cq_level_two_pass(&cpi->twopass, rc, oxcf); + int active_best_quality; + int active_worst_quality = cpi->twopass.active_worst_quality; + int q; + int *inter_minq; + const int boost_frame = + !rc->is_src_frame_alt_ref && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame); + + ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); + + if (oxcf->rc_mode == VPX_Q) + return rc_constant_q(cpi, bottom_index, top_index, gf_group_index); + + if (frame_is_intra_only(cm)) { + pick_kf_q_bound_two_pass(cpi, &active_best_quality, &active_worst_quality); + } else if (boost_frame) { // Use the lower of active_worst_quality and recent // average Q as basis for GF/ARF best Q limit unless last frame was // a key frame. @@ -1130,62 +1437,56 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); // Constrained quality use slightly lower active best. 
active_best_quality = active_best_quality * 15 / 16; - } else if (oxcf->rc_mode == VPX_Q) { - if (!cpi->refresh_alt_ref_frame) { - active_best_quality = cq_level; - } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); - - // Modify best quality for second level arfs. For mode VPX_Q this - // becomes the baseline frame q. - if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW) - active_best_quality = (active_best_quality + cq_level + 1) / 2; + // Modify best quality for second level arfs. For mode VPX_Q this + // becomes the baseline frame q. + if (gf_group->rf_level[gf_group_index] == GF_ARF_LOW) { + const int layer_depth = gf_group->layer_depth[gf_group_index]; + // linearly fit the frame q depending on the layer depth index from + // the base layer ARF. + active_best_quality = + ((layer_depth - 1) * q + active_best_quality + layer_depth / 2) / + layer_depth; } } else { - active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); + active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); } } else { - if (oxcf->rc_mode == VPX_Q) { - active_best_quality = cq_level; - } else { - active_best_quality = inter_minq[active_worst_quality]; + active_best_quality = inter_minq[active_worst_quality]; - // For the constrained quality mode we don't want - // q to fall below the cq level. - if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) { - active_best_quality = cq_level; - } + // For the constrained quality mode we don't want + // q to fall below the cq level. + if ((oxcf->rc_mode == VPX_CQ) && (active_best_quality < cq_level)) { + active_best_quality = cq_level; } } // Extension to max or min Q if undershoot or overshoot is outside // the permitted range. 
- if (cpi->oxcf.rc_mode != VPX_Q) { - if (frame_is_intra_only(cm) || - (!rc->is_src_frame_alt_ref && - (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { - active_best_quality -= - (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); - active_worst_quality += (cpi->twopass.extend_maxq / 2); - } else { - active_best_quality -= - (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2; - active_worst_quality += cpi->twopass.extend_maxq; - } + if (frame_is_intra_only(cm) || boost_frame) { + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast); + active_worst_quality += (cpi->twopass.extend_maxq / 2); + } else { + active_best_quality -= + (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2; + active_worst_quality += cpi->twopass.extend_maxq; + + // For normal frames do not allow an active minq lower than the q used for + // the last boosted frame. + active_best_quality = VPXMAX(active_best_quality, rc->last_boosted_qindex); } #if LIMIT_QRANGE_FOR_ALTREF_AND_KEY vpx_clear_system_state(); // Static forced key frames Q restrictions dealt with elsewhere. 
- if (!((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi))) || - !rc->this_key_frame_forced || - (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) { - int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index], + if (!frame_is_intra_only(cm) || !rc->this_key_frame_forced || + cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH) { + int qdelta = vp9_frame_type_qdelta(cpi, gf_group->rf_level[gf_group_index], active_worst_quality); active_worst_quality = VPXMAX(active_worst_quality + qdelta, active_best_quality); @@ -1205,11 +1506,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, active_worst_quality = clamp(active_worst_quality, active_best_quality, rc->worst_quality); - if (oxcf->rc_mode == VPX_Q) { - q = active_best_quality; - // Special case code to try and match quality with forced key frames. - } else if ((frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) && - rc->this_key_frame_forced) { + if (frame_is_intra_only(cm) && rc->this_key_frame_forced) { // If static since last kf use better of last boosted and last kf q. 
if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) { q = VPXMIN(rc->last_kf_qindex, rc->last_boosted_qindex); @@ -1242,13 +1539,15 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, int *top_index) { int q; + const int gf_group_index = cpi->twopass.gf_group.index; if (cpi->oxcf.pass == 0) { if (cpi->oxcf.rc_mode == VPX_CBR) q = rc_pick_q_and_bounds_one_pass_cbr(cpi, bottom_index, top_index); else q = rc_pick_q_and_bounds_one_pass_vbr(cpi, bottom_index, top_index); } else { - q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index); + q = rc_pick_q_and_bounds_two_pass(cpi, bottom_index, top_index, + gf_group_index); } if (cpi->sf.use_nonrd_pick_mode) { if (cpi->sf.force_frame_boost == 1) q -= cpi->sf.max_delta_qindex; @@ -1261,6 +1560,82 @@ int vp9_rc_pick_q_and_bounds(const VP9_COMP *cpi, int *bottom_index, return q; } +void vp9_configure_buffer_updates(VP9_COMP *cpi, int gf_group_index) { + VP9_COMMON *cm = &cpi->common; + TWO_PASS *const twopass = &cpi->twopass; + + cpi->rc.is_src_frame_alt_ref = 0; + cm->show_existing_frame = 0; + switch (twopass->gf_group.update_type[gf_group_index]) { + case KF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 1; + break; + case LF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + break; + case GF_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + break; + case OVERLAY_UPDATE: + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 1; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + break; + case MID_OVERLAY_UPDATE: + cpi->refresh_last_frame = 1; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + break; + case USE_BUF_FRAME: + cpi->refresh_last_frame = 0; 
+ cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 0; + cpi->rc.is_src_frame_alt_ref = 1; + cm->show_existing_frame = 1; + cm->refresh_frame_context = 0; + break; + default: + assert(twopass->gf_group.update_type[gf_group_index] == ARF_UPDATE); + cpi->refresh_last_frame = 0; + cpi->refresh_golden_frame = 0; + cpi->refresh_alt_ref_frame = 1; + break; + } +} + +void vp9_estimate_qp_gop(VP9_COMP *cpi) { + int gop_length = cpi->twopass.gf_group.gf_group_size; + int bottom_index, top_index; + int idx; + const int gf_index = cpi->twopass.gf_group.index; + const int is_src_frame_alt_ref = cpi->rc.is_src_frame_alt_ref; + const int refresh_frame_context = cpi->common.refresh_frame_context; + + for (idx = 1; idx <= gop_length; ++idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[idx]; + int target_rate = cpi->twopass.gf_group.bit_allocation[idx]; + cpi->twopass.gf_group.index = idx; + vp9_rc_set_frame_target(cpi, target_rate); + vp9_configure_buffer_updates(cpi, idx); + tpl_frame->base_qindex = + rc_pick_q_and_bounds_two_pass(cpi, &bottom_index, &top_index, idx); + tpl_frame->base_qindex = VPXMAX(tpl_frame->base_qindex, 1); + } + // Reset the actual index and frame update + cpi->twopass.gf_group.index = gf_index; + cpi->rc.is_src_frame_alt_ref = is_src_frame_alt_ref; + cpi->common.refresh_frame_context = refresh_frame_context; + vp9_configure_buffer_updates(cpi, gf_index); +} + void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi, int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit) { @@ -1367,7 +1742,8 @@ static void compute_frame_low_motion(VP9_COMP *const cpi) { int cnt_zeromv = 0; for (mi_row = 0; mi_row < rows; mi_row++) { for (mi_col = 0; mi_col < cols; mi_col++) { - if (abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16) + if (mi[0]->ref_frame[0] == LAST_FRAME && + abs(mi[0]->mv[0].as_mv.row) < 16 && abs(mi[0]->mv[0].as_mv.col) < 16) cnt_zeromv++; mi++; } @@ -1381,6 +1757,7 @@ void 
vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { const VP9_COMMON *const cm = &cpi->common; const VP9EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; const int qindex = cm->base_qindex; // Update rate control heuristics @@ -1390,7 +1767,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { vp9_rc_update_rate_correction_factors(cpi); // Keep a record of last Q and ambient average Q. - if (cm->frame_type == KEY_FRAME) { + if (frame_is_intra_only(cm)) { rc->last_q[KEY_FRAME] = qindex; rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); @@ -1434,13 +1811,13 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) { rc->last_boosted_qindex = qindex; } - if (cm->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex; + if (frame_is_intra_only(cm)) rc->last_kf_qindex = qindex; - update_buffer_level(cpi, rc->projected_frame_size); + update_buffer_level_postencode(cpi, rc->projected_frame_size); // Rolling monitors of whether we are over or underspending used to help // regulate min and Max Q in two pass. - if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm)) { rc->rolling_target_bits = ROUND_POWER_OF_TWO( rc->rolling_target_bits * 3 + rc->this_frame_target, 2); rc->rolling_actual_bits = ROUND_POWER_OF_TWO( @@ -1457,9 +1834,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits; - if (!cpi->use_svc || is_two_pass_svc(cpi)) { + if (!cpi->use_svc) { if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame && - (cm->frame_type != KEY_FRAME)) + (!frame_is_intra_only(cm))) // Update the alternate reference frame stats as appropriate. 
update_alt_ref_frame_stats(cpi); else @@ -1467,7 +1844,28 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { update_golden_frame_stats(cpi); } - if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0; + // If second (long term) temporal reference is used for SVC, + // update the golden frame counter, only for base temporal layer. + if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && + svc->temporal_layer_id == 0) { + int i = 0; + if (cpi->refresh_golden_frame) + rc->frames_since_golden = 0; + else + rc->frames_since_golden++; + // Decrement count down till next gf + if (rc->frames_till_gf_update_due > 0) rc->frames_till_gf_update_due--; + // Update the frames_since_golden for all upper temporal layers. + for (i = 1; i < svc->number_temporal_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->frames_since_golden = rc->frames_since_golden; + } + } + + if (frame_is_intra_only(cm)) rc->frames_since_key = 0; if (cm->show_frame) { rc->frames_since_key++; rc->frames_to_key--; @@ -1481,24 +1879,51 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } if (oxcf->pass == 0) { - if (cm->frame_type != KEY_FRAME) { + if (!frame_is_intra_only(cm) && + (!cpi->use_svc || + (cpi->use_svc && + !svc->layer_context[svc->temporal_layer_id].is_key_frame && + svc->spatial_layer_id == svc->number_spatial_layers - 1))) { compute_frame_low_motion(cpi); if (cpi->sf.use_altref_onepass) update_altref_usage(cpi); } + // For SVC: set avg_frame_low_motion (only computed on top spatial layer) + // to all lower spatial layers. 
+ if (cpi->use_svc && + svc->spatial_layer_id == svc->number_spatial_layers - 1) { + int i; + for (i = 0; i < svc->number_spatial_layers - 1; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_frame_low_motion = rc->avg_frame_low_motion; + } + } cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref; } - if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0; + if (!frame_is_intra_only(cm)) rc->reset_high_source_sad = 0; rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; + if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1) + svc->lower_layer_qindex = cm->base_qindex; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { - // Update buffer level with zero size, update frame counters, and return. - update_buffer_level(cpi, 0); + cpi->common.current_video_frame++; cpi->rc.frames_since_key++; cpi->rc.frames_to_key--; cpi->rc.rc_2_frame = 0; cpi->rc.rc_1_frame = 0; + cpi->rc.last_avg_frame_bandwidth = cpi->rc.avg_frame_bandwidth; + // For SVC on dropped frame when framedrop_mode != LAYER_DROP: + // in this mode the whole superframe may be dropped if only a single layer + // has buffer underflow (below threshold). Since this can then lead to + // increasing buffer levels/overflow for certain layers even though whole + // superframe is dropped, we cap buffer level if its already stable. 
+ if (cpi->use_svc && cpi->svc.framedrop_mode != LAYER_DROP && + cpi->rc.buffer_level > cpi->rc.optimal_buffer_level) + cpi->rc.buffer_level = cpi->rc.optimal_buffer_level; } static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { @@ -1544,10 +1969,9 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; - // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. if (!cpi->refresh_alt_ref_frame && (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) { + rc->frames_to_key == 0)) { cm->frame_type = KEY_FRAME; rc->this_key_frame_forced = cm->current_video_frame != 0 && rc->frames_to_key == 0; @@ -1582,9 +2006,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { // Adjust boost and af_ratio based on avg_frame_low_motion, which varies // between 0 and 100 (stationary, 100% zero/small motion). rc->gfu_boost = - VPXMAX(500, - DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); + VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / + (rc->avg_frame_low_motion + 100)); rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); @@ -1684,30 +2107,80 @@ static int calc_iframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { return vp9_rc_clamp_iframe_target_size(cpi, target); } +static void set_intra_only_frame(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + // Don't allow intra_only frame for bypass/flexible SVC mode, or if number + // of spatial layers is 1 or if number of spatial or temporal layers > 3. + // Also if intra-only is inserted on very first frame, don't allow if + // if number of temporal layers > 1. 
This is because on intra-only frame + // only 3 reference buffers can be updated, but for temporal layers > 1 + // we generally need to use buffer slots 4 and 5. + if ((cm->current_video_frame == 0 && svc->number_temporal_layers > 1) || + svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS || + svc->number_spatial_layers > 3 || svc->number_temporal_layers > 3 || + svc->number_spatial_layers == 1) + return; + cm->show_frame = 0; + cm->intra_only = 1; + cm->frame_type = INTER_FRAME; + cpi->ext_refresh_frame_flags_pending = 1; + cpi->ext_refresh_last_frame = 1; + cpi->ext_refresh_golden_frame = 1; + cpi->ext_refresh_alt_ref_frame = 1; + if (cm->current_video_frame == 0) { + cpi->lst_fb_idx = 0; + cpi->gld_fb_idx = 1; + cpi->alt_fb_idx = 2; + } else { + int i; + int count = 0; + cpi->lst_fb_idx = -1; + cpi->gld_fb_idx = -1; + cpi->alt_fb_idx = -1; + // For intra-only frame we need to refresh all slots that were + // being used for the base layer (fb_idx_base[i] == 1). + // Start with assigning last first, then golden and then alt. + for (i = 0; i < REF_FRAMES; ++i) { + if (svc->fb_idx_base[i] == 1) count++; + if (count == 1 && cpi->lst_fb_idx == -1) cpi->lst_fb_idx = i; + if (count == 2 && cpi->gld_fb_idx == -1) cpi->gld_fb_idx = i; + if (count == 3 && cpi->alt_fb_idx == -1) cpi->alt_fb_idx = i; + } + // If golden or alt is not being used for base layer, then set them + // to the lst_fb_idx. 
+ if (cpi->gld_fb_idx == -1) cpi->gld_fb_idx = cpi->lst_fb_idx; + if (cpi->alt_fb_idx == -1) cpi->alt_fb_idx = cpi->lst_fb_idx; + } +} + void vp9_rc_get_svc_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + SVC *const svc = &cpi->svc; int target = rc->avg_frame_bandwidth; - int layer = - LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); + int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + if (svc->first_spatial_layer_to_encode) + svc->layer_context[svc->temporal_layer_id].is_key_frame = 0; // Periodic key frames is based on the super-frame counter // (svc.current_superframe), also only base spatial layer is key frame. - if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || + // Key frame is set for any of the following: very first frame, frame flags + // indicates key, superframe counter hits key frequencey, or (non-intra) sync + // flag is set for spatial layer 0. 
+ if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) || + (cpi->frame_flags & FRAMEFLAGS_KEY) || (cpi->oxcf.auto_key && - (cpi->svc.current_superframe % cpi->oxcf.key_freq == 0) && - cpi->svc.spatial_layer_id == 0)) { + (svc->current_superframe % cpi->oxcf.key_freq == 0) && + !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) || + (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; - if (is_two_pass_svc(cpi)) { - cpi->svc.layer_context[layer].is_key_frame = 1; - cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); - } else if (is_one_pass_cbr_svc(cpi)) { - if (cm->current_video_frame > 0) vp9_svc_reset_key_frame(cpi); - layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, - cpi->svc.temporal_layer_id, - cpi->svc.number_temporal_layers); - cpi->svc.layer_context[layer].is_key_frame = 1; + if (is_one_pass_cbr_svc(cpi)) { + if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1); + layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id, + svc->number_temporal_layers); + svc->layer_context[layer].is_key_frame = 1; cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG); // Assumption here is that LAST_FRAME is being updated for a keyframe. // Thus no change in update flags. 
@@ -1715,48 +2188,84 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { } } else { cm->frame_type = INTER_FRAME; - if (is_two_pass_svc(cpi)) { - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - if (cpi->svc.spatial_layer_id == 0) { - lc->is_key_frame = 0; - } else { - lc->is_key_frame = - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame; - if (lc->is_key_frame) cpi->ref_frame_flags &= (~VP9_LAST_FLAG); - } - cpi->ref_frame_flags &= (~VP9_ALT_FLAG); - } else if (is_one_pass_cbr_svc(cpi)) { - LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode) { - lc->is_key_frame = 0; - } else { - lc->is_key_frame = - cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame; - } + if (is_one_pass_cbr_svc(cpi)) { + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + // Add condition current_video_frame > 0 for the case where first frame + // is intra only followed by overlay/copy frame. In this case we don't + // want to reset is_key_frame to 0 on overlay/copy frame. + lc->is_key_frame = + (svc->spatial_layer_id == 0 && cm->current_video_frame > 0) + ? 0 + : svc->layer_context[svc->temporal_layer_id].is_key_frame; target = calc_pframe_target_size_one_pass_cbr(cpi); } } + // Check if superframe contains a sync layer request. + vp9_svc_check_spatial_layer_sync(cpi); + + // If long term temporal feature is enabled, set the period of the update. + // The update/refresh of this reference frame is always on base temporal + // layer frame. + if (svc->use_gf_temporal_ref_current_layer) { + // Only use gf long-term prediction on non-key superframes. + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // Use golden for this reference, which will be used for prediction. 
+ int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->gld_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + // Enable prediction off LAST (last reference) and golden (which will + // generally be further behind/long-term reference). + cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; + } + // Check for update/refresh of reference: only refresh on base temporal + // layer. + if (svc->temporal_layer_id == 0) { + if (svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // On key frame we update the buffer index used for long term reference. + // Use the alt_ref since it is not used or updated on key frames. + int index = svc->spatial_layer_id; + if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->ext_refresh_alt_ref_frame = 1; + } else if (rc->frames_till_gf_update_due == 0) { + // Set period of next update. Make it a multiple of 10, as the cyclic + // refresh is typically ~10%, and we'd like the update to happen after + // a few cycles of the refresh (so it is a better quality frame). Note the + // cyclic refresh for SVC only operates on base temporal layer frames. + // Choose 20 as period for now (2 cycles). + rc->baseline_gf_interval = 20; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + cpi->ext_refresh_golden_frame = 1; + rc->gfu_boost = DEFAULT_GF_BOOST; + } + } + } else if (!svc->use_gf_temporal_ref) { + rc->frames_till_gf_update_due = INT_MAX; + rc->baseline_gf_interval = INT_MAX; + } + if (svc->set_intra_only_frame) { + set_intra_only_frame(cpi); + target = calc_iframe_target_size_one_pass_cbr(cpi); + } // Any update/change of global cyclic refresh parameters (amount/delta-qp) // should be done here, before the frame qp is selected. 
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_update_parameters(cpi); vp9_rc_set_frame_target(cpi, target); - rc->frames_till_gf_update_due = INT_MAX; - rc->baseline_gf_interval = INT_MAX; + if (cm->show_frame) update_buffer_level_svc_preencode(cpi); } void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; - // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic. - if ((cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) { + if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || + rc->frames_to_key == 0) { cm->frame_type = KEY_FRAME; - rc->this_key_frame_forced = - cm->current_video_frame != 0 && rc->frames_to_key == 0; rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; rc->source_alt_ref_active = 0; @@ -1782,12 +2291,15 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_update_parameters(cpi); - if (cm->frame_type == KEY_FRAME) + if (frame_is_intra_only(cm)) target = calc_iframe_target_size_one_pass_cbr(cpi); else target = calc_pframe_target_size_one_pass_cbr(cpi); vp9_rc_set_frame_target(cpi, target); + + if (cm->show_frame) update_buffer_level_preencode(cpi); + if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC) cpi->resize_pending = vp9_resize_one_pass_cbr(cpi); else @@ -1859,13 +2371,8 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, rc->max_gf_interval = vp9_rc_get_default_max_gf_interval( cpi->framerate, rc->min_gf_interval); - // Extended interval for genuinely static scenes - rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2; - - if (is_altref_enabled(cpi)) { - if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1) - rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1; - } + // Extended max interval for genuinely static 
scenes like slide shows. + rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH; if (rc->max_gf_interval > rc->static_scene_max_gf_interval) rc->max_gf_interval = rc->static_scene_max_gf_interval; @@ -1909,12 +2416,12 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS); // A maximum bitrate for a frame is defined. - // The baseline for this aligns with HW implementations that - // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits - // per 16x16 MB (averaged over a frame). However this limit is extended if - // a very high rate is given on the command line or the the rate cannnot - // be acheived because of a user specificed max q (e.g. when the user - // specifies lossless encode. + // However this limit is extended if a very high rate is given on the command + // line or the rate cannot be achieved because of a user specified max q + // (e.g. when the user specifies lossless encode). + // + // If a level is specified that requires a lower maximum rate then the level + // value takes precedence. 
vbr_max_bits = (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) / 100); @@ -2271,30 +2778,56 @@ static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, void vp9_scene_detection_onepass(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; + YV12_BUFFER_CONFIG const *unscaled_src = cpi->un_scaled_source; + YV12_BUFFER_CONFIG const *unscaled_last_src = cpi->unscaled_last_source; + uint8_t *src_y; + int src_ystride; + int src_width; + int src_height; + uint8_t *last_src_y; + int last_src_ystride; + int last_src_width; + int last_src_height; + if (cpi->un_scaled_source == NULL || cpi->unscaled_last_source == NULL || + (cpi->use_svc && cpi->svc.current_superframe == 0)) + return; + src_y = unscaled_src->y_buffer; + src_ystride = unscaled_src->y_stride; + src_width = unscaled_src->y_width; + src_height = unscaled_src->y_height; + last_src_y = unscaled_last_src->y_buffer; + last_src_ystride = unscaled_last_src->y_stride; + last_src_width = unscaled_last_src->y_width; + last_src_height = unscaled_last_src->y_height; #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) return; #endif rc->high_source_sad = 0; - if (cpi->Last_Source != NULL && - cpi->Last_Source->y_width == cpi->Source->y_width && - cpi->Last_Source->y_height == cpi->Source->y_height) { + rc->high_num_blocks_with_motion = 0; + // For SVC: scene detection is only checked on first spatial layer of + // the superframe using the original/unscaled resolutions. 
+ if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode && + src_width == last_src_width && src_height == last_src_height) { YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL }; - uint8_t *src_y = cpi->Source->y_buffer; - int src_ystride = cpi->Source->y_stride; - uint8_t *last_src_y = cpi->Last_Source->y_buffer; - int last_src_ystride = cpi->Last_Source->y_stride; + int num_mi_cols = cm->mi_cols; + int num_mi_rows = cm->mi_rows; int start_frame = 0; int frames_to_buffer = 1; int frame = 0; int scene_cut_force_key_frame = 0; + int num_zero_temp_sad = 0; uint64_t avg_sad_current = 0; - uint32_t min_thresh = 4000; + uint32_t min_thresh = 10000; float thresh = 8.0f; uint32_t thresh_key = 140000; if (cpi->oxcf.speed <= 5) thresh_key = 240000; - if (cpi->oxcf.rc_mode == VPX_VBR) { - min_thresh = 65000; - thresh = 2.1f; + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) min_thresh = 65000; + if (cpi->oxcf.rc_mode == VPX_VBR) thresh = 2.1f; + if (cpi->use_svc && cpi->svc.number_spatial_layers > 1) { + const int aligned_width = ALIGN_POWER_OF_TWO(src_width, MI_SIZE_LOG2); + const int aligned_height = ALIGN_POWER_OF_TWO(src_height, MI_SIZE_LOG2); + num_mi_cols = aligned_width >> MI_SIZE_LOG2; + num_mi_rows = aligned_height >> MI_SIZE_LOG2; } if (cpi->oxcf.lag_in_frames > 0) { frames_to_buffer = (cm->current_video_frame == 1) @@ -2342,14 +2875,15 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { uint64_t avg_sad = 0; uint64_t tmp_sad = 0; int num_samples = 0; - int sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; - int sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; + int sb_cols = (num_mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; + int sb_rows = (num_mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; if (cpi->oxcf.lag_in_frames > 0) { src_y = frames[frame]->y_buffer; src_ystride = frames[frame]->y_stride; last_src_y = frames[frame + 1]->y_buffer; last_src_ystride = frames[frame + 1]->y_stride; } + num_zero_temp_sad = 0; for (sbi_row 
= 0; sbi_row < sb_rows; ++sbi_row) { for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { // Checker-board pattern, ignore boundary. @@ -2361,6 +2895,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { last_src_ystride); avg_sad += tmp_sad; num_samples++; + if (tmp_sad == 0) num_zero_temp_sad++; } src_y += 64; last_src_y += 64; @@ -2377,7 +2912,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { if (avg_sad > VPXMAX(min_thresh, (unsigned int)(rc->avg_source_sad[0] * thresh)) && - rc->frames_since_key > 1) + rc->frames_since_key > 1 + cpi->svc.number_spatial_layers && + num_zero_temp_sad < 3 * (num_samples >> 2)) rc->high_source_sad = 1; else rc->high_source_sad = 0; @@ -2388,6 +2924,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { } else { rc->avg_source_sad[lagframe_idx] = avg_sad; } + if (num_zero_temp_sad < (num_samples >> 1)) + rc->high_num_blocks_with_motion = 1; } } // For CBR non-screen content mode, check if we should reset the rate @@ -2407,6 +2945,19 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad) rc->this_frame_target = rc->avg_frame_bandwidth; } + // For SVC the new (updated) avg_source_sad[0] for the current superframe + // updates the setting for all layers. + if (cpi->use_svc) { + int sl, tl; + SVC *const svc = &cpi->svc; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *const lc = &svc->layer_context[layer]; + RATE_CONTROL *const lrc = &lc->rc; + lrc->avg_source_sad[0] = rc->avg_source_sad[0]; + } + } // For VBR, under scene change/high content change, force golden refresh. 
if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME && rc->high_source_sad && rc->frames_to_key > 3 && @@ -2437,12 +2988,26 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { // Test if encoded frame will significantly overshoot the target bitrate, and // if so, set the QP, reset/adjust some rate control parameters, and return 1. +// frame_size = -1 means frame has not been encoded. int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; - int thresh_qp = 3 * (rc->worst_quality >> 2); - int thresh_rate = rc->avg_frame_bandwidth * 10; - if (cm->base_qindex < thresh_qp && frame_size > thresh_rate) { + SPEED_FEATURES *const sf = &cpi->sf; + int thresh_qp = 7 * (rc->worst_quality >> 3); + int thresh_rate = rc->avg_frame_bandwidth << 3; + // Lower thresh_qp for video (more overshoot at lower Q) to be + // more conservative for video. + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) + thresh_qp = rc->worst_quality >> 1; + // If this decision is not based on an encoded frame size but just on + // scene/slide change detection (i.e., re_encode_overshoot_cbr_rt == + // FAST_DETECTION_MAXQ), for now skip the (frame_size > thresh_rate) + // condition in this case. + // TODO(marpan): Use a better size/rate condition for this case and + // adjust thresholds. + if ((sf->overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ || + frame_size > thresh_rate) && + cm->base_qindex < thresh_qp) { double rate_correction_factor = cpi->rc.rate_correction_factors[INTER_NORMAL]; const int target_size = cpi->rc.avg_frame_bandwidth; @@ -2452,6 +3017,29 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { int enumerator; // Force a re-encode, and for now use max-QP. 
*q = cpi->rc.worst_quality; + cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0; + cpi->rc.re_encode_maxq_scene_change = 1; + // If the frame_size is much larger than the threshold (big content change) + // and the encoded frame used a lot of Intra modes, then force hybrid_intra + // encoding for the re-encode on this scene change. hybrid_intra will + // use rd-based intra mode selection for small blocks. + if (sf->overshoot_detection_cbr_rt == RE_ENCODE_MAXQ && + frame_size > (thresh_rate << 1) && cpi->svc.spatial_layer_id == 0) { + MODE_INFO **mi = cm->mi_grid_visible; + int sum_intra_usage = 0; + int mi_row, mi_col; + int tot = 0; + for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { + if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++; + tot++; + mi++; + } + mi += 8; + } + sum_intra_usage = 100 * sum_intra_usage / (cm->mi_rows * cm->mi_cols); + if (sum_intra_usage > 60) cpi->rc.hybrid_intra_scene_change = 1; + } // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as // these parameters will affect QP selection for subsequent frames. If they // have settled down to a very different (low QP) state, then not adjusting @@ -2479,21 +3067,27 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { cpi->rc.rate_correction_factors[INTER_NORMAL] = rate_correction_factor; } // For temporal layers, reset the rate control parametes across all - // temporal layers. + // temporal layers. If the first_spatial_layer_to_encode > 0, then this + // superframe has skipped lower base layers. So in this case we should also + // reset and force max-q for spatial layers < first_spatial_layer_to_encode. 
if (cpi->use_svc) { - int i = 0; + int tl = 0; + int sl = 0; SVC *svc = &cpi->svc; - for (i = 0; i < svc->number_temporal_layers; ++i) { - const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, - svc->number_temporal_layers); - LAYER_CONTEXT *lc = &svc->layer_context[layer]; - RATE_CONTROL *lrc = &lc->rc; - lrc->avg_frame_qindex[INTER_FRAME] = *q; - lrc->buffer_level = rc->optimal_buffer_level; - lrc->bits_off_target = rc->optimal_buffer_level; - lrc->rc_1_frame = 0; - lrc->rc_2_frame = 0; - lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor; + for (sl = 0; sl < svc->first_spatial_layer_to_encode; ++sl) { + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + const int layer = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->avg_frame_qindex[INTER_FRAME] = *q; + lrc->buffer_level = lrc->optimal_buffer_level; + lrc->bits_off_target = lrc->optimal_buffer_level; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor; + lrc->force_max_q = 1; + } } } return 1; diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h index c1b210677..a5c1f4cf0 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/libvpx/vp9/encoder/vp9_ratectrl.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_RATECTRL_H_ -#define VP9_ENCODER_VP9_RATECTRL_H_ +#ifndef VPX_VP9_ENCODER_VP9_RATECTRL_H_ +#define VPX_VP9_ENCODER_VP9_RATECTRL_H_ #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" @@ -34,6 +34,14 @@ extern "C" { #define FRAME_OVERHEAD_BITS 200 +// Threshold used to define a KF group as static (e.g. a slide show). +// Essentially this means that no frame in the group has more than 1% of MBs +// that are not marked as coded with 0,0 motion in the first pass. 
+#define STATIC_KF_GROUP_THRESH 99 + +// The maximum duration of a GF group that is static (for example a slide show). +#define MAX_STATIC_GF_GROUP_LENGTH 250 + typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, @@ -167,15 +175,28 @@ typedef struct { uint64_t avg_source_sad[MAX_LAG_BUFFERS]; uint64_t prev_avg_source_sad_lag; int high_source_sad_lagindex; + int high_num_blocks_with_motion; int alt_ref_gf_group; int last_frame_is_src_altref; int high_source_sad; int count_last_scene_change; + int hybrid_intra_scene_change; + int re_encode_maxq_scene_change; int avg_frame_low_motion; int af_ratio_onepass_vbr; int force_qpmin; int reset_high_source_sad; double perc_arf_usage; + int force_max_q; + // Last frame was dropped post encode on scene change. + int last_post_encode_dropped_scene_change; + // Enable post encode frame dropping for screen content. Only enabled when + // ext_use_post_encode_drop is enabled by user. + int use_post_encode_drop; + // External flag to enable post encode frame dropping, controlled by user. + int ext_use_post_encode_drop; + + int damped_adjustment[RATE_FACTOR_LEVELS]; } RATE_CONTROL; struct VP9_COMP; @@ -184,7 +205,7 @@ struct VP9EncoderConfig; void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc); -int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs, +int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, double correction_factor, vpx_bit_depth_t bit_depth); double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth); @@ -195,9 +216,9 @@ void vp9_rc_init_minq_luts(void); int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate); // Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to -// be passed in to ensure that the max_gf_interval returned is at least as bis +// be passed in to ensure that the max_gf_interval returned is at least as big // as that. 
-int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate); +int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval); // Generally at the high level, the following flow is expected // to be enforced for rate control: @@ -237,13 +258,16 @@ void vp9_rc_postencode_update_drop_frame(struct VP9_COMP *cpi); // Changes only the rate correction factors in the rate control structure. void vp9_rc_update_rate_correction_factors(struct VP9_COMP *cpi); +// Post encode drop for CBR mode. +int post_encode_drop_cbr(struct VP9_COMP *cpi, size_t *size); + // Decide if we should drop this frame: For 1-pass CBR. // Changes only the decimation count in the rate control structure int vp9_rc_drop_frame(struct VP9_COMP *cpi); // Computes frame size bounds. void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi, - int this_frame_target, + int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit); @@ -294,8 +318,12 @@ void vp9_scene_detection_onepass(struct VP9_COMP *cpi); int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q); +void vp9_configure_buffer_updates(struct VP9_COMP *cpi, int gf_group_index); + +void vp9_estimate_qp_gop(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RATECTRL_H_ +#endif // VPX_VP9_ENCODER_VP9_RATECTRL_H_ diff --git a/libvpx/vp9/encoder/vp9_rd.c b/libvpx/vp9/encoder/vp9_rd.c index 6b2306ce9..894b1497b 100644 --- a/libvpx/vp9/encoder/vp9_rd.c +++ b/libvpx/vp9/encoder/vp9_rd.c @@ -69,10 +69,12 @@ static void fill_mode_costs(VP9_COMP *cpi) { const FRAME_CONTEXT *const fc = cpi->common.fc; int i, j; - for (i = 0; i < INTRA_MODES; ++i) - for (j = 0; j < INTRA_MODES; ++j) + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES; ++j) { vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], vp9_intra_mode_tree); + } + } vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree); for (i = 0; i < 
INTRA_MODES; ++i) { @@ -82,9 +84,28 @@ static void fill_mode_costs(VP9_COMP *cpi) { fc->uv_mode_prob[i], vp9_intra_mode_tree); } - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { vp9_cost_tokens(cpi->switchable_interp_costs[i], fc->switchable_interp_prob[i], vp9_switchable_interp_tree); + } + + for (i = TX_8X8; i < TX_SIZES; ++i) { + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) { + const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs); + int k; + for (k = 0; k <= i; ++k) { + int cost = 0; + int m; + for (m = 0; m <= k - (k == i); ++m) { + if (m == k) + cost += vp9_cost_zero(tx_probs[m]); + else + cost += vp9_cost_one(tx_probs[m]); + } + cpi->tx_size_cost[i - 1][j][k] = cost; + } + } + } } static void fill_token_costs(vp9_coeff_cost *c, @@ -143,40 +164,74 @@ void vp9_init_me_luts(void) { static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, 8, 8, 4, 4, 2, 2, 1, 0 }; -static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, - 128, 144 }; -int64_t vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { - const int64_t q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); +// Note that the element below for frame type "USE_BUF_FRAME", which indicates +// that the show frame flag is set, should not be used as no real frame +// is encoded so we should not reach here. However, a dummy value +// is inserted here to make sure the data structure has the right number +// of values assigned. 
+static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128, + 128, 144, 144 }; + +int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { + // largest dc_quant is 21387, therefore rdmult should always fit in int32_t + const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); + uint32_t rdmult = q * q; + + if (cpi->common.frame_type != KEY_FRAME) { + if (qindex < 128) + rdmult = rdmult * 4; + else if (qindex < 190) + rdmult = rdmult * 4 + rdmult / 2; + else + rdmult = rdmult * 3; + } else { + if (qindex < 64) + rdmult = rdmult * 4; + else if (qindex <= 128) + rdmult = rdmult * 3 + rdmult / 2; + else if (qindex < 190) + rdmult = rdmult * 4 + rdmult / 2; + else + rdmult = rdmult * 7 + rdmult / 2; + } #if CONFIG_VP9_HIGHBITDEPTH - int64_t rdmult = 0; switch (cpi->common.bit_depth) { - case VPX_BITS_8: rdmult = 88 * q * q / 24; break; - case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break; - case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break; - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break; + case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break; + default: break; } -#else - int64_t rdmult = 88 * q * q / 24; #endif // CONFIG_VP9_HIGHBITDEPTH - return rdmult; + return rdmult > 0 ? rdmult : 1; } -int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { - int64_t rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); - +static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) { + int64_t rdmult_64 = rdmult; if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; - const int boost_index = VPXMIN(15, (cpi->rc.gfu_boost / 100)); + const int gfu_boost = cpi->multi_layer_arf + ? 
gf_group->gfu_boost[gf_group->index] + : cpi->rc.gfu_boost; + const int boost_index = VPXMIN(15, (gfu_boost / 100)); - rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7; - rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7); + rdmult_64 = (rdmult_64 * rd_frame_type_factor[frame_type]) >> 7; + rdmult_64 += ((rdmult_64 * rd_boost_factor[boost_index]) >> 7); } - if (rdmult < 1) rdmult = 1; - return (int)rdmult; + return (int)rdmult_64; +} + +int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { + int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex); + return modulate_rdmult(cpi, rdmult); +} + +int vp9_get_adaptive_rdmult(const VP9_COMP *cpi, double beta) { + int rdmult = + vp9_compute_rd_mult_based_on_qindex(cpi, cpi->common.base_qindex); + rdmult = (int)((double)rdmult / beta); + rdmult = rdmult > 0 ? rdmult : 1; + return modulate_rdmult(cpi, rdmult); } static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { @@ -185,10 +240,10 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break; case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break; - case VPX_BITS_12: q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; break; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; + break; } #else (void)bit_depth; @@ -209,12 +264,11 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) { x->sadperbit16 = sad_per_bit16lut_10[qindex]; x->sadperbit4 = sad_per_bit4lut_10[qindex]; break; - case VPX_BITS_12: + default: + assert(cpi->common.bit_depth == VPX_BITS_12); x->sadperbit16 = sad_per_bit16lut_12[qindex]; x->sadperbit4 = sad_per_bit4lut_12[qindex]; break; - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); } #else 
(void)cpi; @@ -471,13 +525,13 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, for (i = 0; i < num_4x4_h; i += 4) t_left[i] = !!*(const uint32_t *)&left[i]; break; - case TX_32X32: + default: + assert(tx_size == TX_32X32); for (i = 0; i < num_4x4_w; i += 8) t_above[i] = !!*(const uint64_t *)&above[i]; for (i = 0; i < num_4x4_h; i += 8) t_left[i] = !!*(const uint64_t *)&left[i]; break; - default: assert(0 && "Invalid transform size."); break; } } @@ -493,8 +547,7 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, uint8_t *src_y_ptr = x->plane[0].src.buf; uint8_t *ref_y_ptr; const int num_mv_refs = - MAX_MV_REF_CANDIDATES + - (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size); + MAX_MV_REF_CANDIDATES + (block_size < x->max_partition_size); MV pred_mv[3]; pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv; @@ -504,11 +557,12 @@ void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer, near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int == x->mbmi_ext->ref_mvs[ref_frame][1].as_int; + // Get the sad for each candidate reference mv. for (i = 0; i < num_mv_refs; ++i) { const MV *this_mv = &pred_mv[i]; int fp_row, fp_col; - + if (this_mv->row == INT16_MAX || this_mv->col == INT16_MAX) continue; if (i == 1 && near_same_nearest) continue; fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3; fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3; @@ -573,6 +627,7 @@ YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi, const VP9_COMMON *const cm = &cpi->common; const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1]; const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame); + assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME); return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX) ? 
&cm->buffer_pool->frame_bufs[scaled_idx].buf : NULL; diff --git a/libvpx/vp9/encoder/vp9_rd.h b/libvpx/vp9/encoder/vp9_rd.h index 59022c106..fa85f2176 100644 --- a/libvpx/vp9/encoder/vp9_rd.h +++ b/libvpx/vp9/encoder/vp9_rd.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_RD_H_ -#define VP9_ENCODER_VP9_RD_H_ +#ifndef VPX_VP9_ENCODER_VP9_RD_H_ +#define VPX_VP9_ENCODER_VP9_RD_H_ #include <limits.h> @@ -108,9 +108,14 @@ typedef struct RD_OPT { int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; +#if CONFIG_CONSISTENT_RECODE + int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; + int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; +#endif int RDMULT; int RDDIV; + double r0; } RD_OPT; typedef struct RD_COST { @@ -129,16 +134,17 @@ struct TileDataEnc; struct VP9_COMP; struct macroblock; -int64_t vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, - int qindex); +int vp9_compute_rd_mult_based_on_qindex(const struct VP9_COMP *cpi, int qindex); int vp9_compute_rd_mult(const struct VP9_COMP *cpi, int qindex); +int vp9_get_adaptive_rdmult(const struct VP9_COMP *cpi, double beta); + void vp9_initialize_rd_consts(struct VP9_COMP *cpi); void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); -void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, +void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist); void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], @@ -169,8 +175,8 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi); void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi); -void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize, - int best_mode_index); +void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int 
rd_thresh, + int bsize, int best_mode_index); static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, const int *const thresh_fact) { @@ -212,4 +218,4 @@ unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RD_H_ +#endif // VPX_VP9_ENCODER_VP9_RD_H_ diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index 2ba6378c5..debe88f9d 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -59,7 +59,9 @@ typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } MODE_DEFINITION; -typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION; +typedef struct { + MV_REFERENCE_FRAME ref_frame[2]; +} REF_DEFINITION; struct rdcost_block_args { const VP9_COMP *cpi; @@ -541,8 +543,9 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = p->eobs[block]; - if (x->block_tx_domain) { + if (x->block_tx_domain && eob) { const int ss_txfrm_size = tx_size << 1; int64_t this_sse; const int shift = tx_size == TX_32X32 ? 
0 : 2; @@ -582,14 +585,13 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, const uint8_t *src = &p->src.buf[src_idx]; const uint8_t *dst = &pd->dst.buf[dst_idx]; const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const uint16_t *eob = &p->eobs[block]; unsigned int tmp; tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, blk_col, plane_bsize, tx_bsize); *out_sse = (int64_t)tmp * 16; - if (*eob) { + if (eob) { #if CONFIG_VP9_HIGHBITDEPTH DECLARE_ALIGNED(16, uint16_t, recon16[1024]); uint8_t *recon = (uint8_t *)recon16; @@ -602,22 +604,22 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, eob, xd->bd); break; - case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd); + default: + assert(tx_size == TX_32X32); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, eob, xd->bd); break; - default: assert(0 && "Invalid transform size"); } } recon = CONVERT_TO_BYTEPTR(recon16); @@ -625,16 +627,16 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #endif // CONFIG_VP9_HIGHBITDEPTH vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); switch (tx_size) { - case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break; - case TX_16X16: 
vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break; - case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, *eob); break; - case TX_4X4: + case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, eob); break; + case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, eob); break; + case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, eob); break; + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around // eob<=1, which is significant (not just an optimization) for // the lossless case. - x->inv_txfm_add(dqcoeff, recon, 32, *eob); + x->inv_txfm_add(dqcoeff, recon, 32, eob); break; - default: assert(0 && "Invalid transform size"); break; } #if CONFIG_VP9_HIGHBITDEPTH } @@ -699,17 +701,18 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, blk_row, blk_col, plane_bsize, tx_bsize); dist = (int64_t)tmp * 16; } - } else if (max_txsize_lookup[plane_bsize] == tx_size) { - if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == - SKIP_TXFM_NONE) { + } else { + int skip_txfm_flag = SKIP_TXFM_NONE; + if (max_txsize_lookup[plane_bsize] == tx_size) + skip_txfm_flag = x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))]; + if (skip_txfm_flag == SKIP_TXFM_NONE) { // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); if (x->block_qcoeff_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size, &dist, &sse); - } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == - SKIP_TXFM_AC_ONLY) { + } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) { // compute DC coefficient tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); @@ -736,13 +739,6 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; dist = 
sse; } - } else { - // full forward transform and quantization - vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); - if (x->block_qcoeff_opt) - vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); - dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse); } rd = RDCOST(x->rdmult, x->rddiv, 0, dist); @@ -761,7 +757,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, rd = VPXMIN(rd1, rd2); if (plane == 0) { x->zcoeff_blk[tx_size][block] = - !x->plane[plane].eobs[block] || (rd1 > rd2 && !xd->lossless); + !x->plane[plane].eobs[block] || + (x->sharpness == 0 && rd1 > rd2 && !xd->lossless); x->sum_y_eobs[tx_size] += x->plane[plane].eobs[block]; } @@ -781,7 +778,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skippable, int64_t *sse, int64_t ref_best_rd, int plane, BLOCK_SIZE bsize, - TX_SIZE tx_size, int use_fast_coef_casting) { + TX_SIZE tx_size, int use_fast_coef_costing) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; struct rdcost_block_args args; @@ -789,7 +786,7 @@ static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, args.cpi = cpi; args.x = x; args.best_rd = ref_best_rd; - args.use_fast_coef_costing = use_fast_coef_casting; + args.use_fast_coef_costing = use_fast_coef_costing; args.skippable = 1; if (plane == 0) xd->mi[0]->tx_size = tx_size; @@ -843,20 +840,20 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX } }; - int n, m; + int n; int s0, s1; - int64_t best_rd = INT64_MAX; + int64_t best_rd = ref_best_rd; TX_SIZE best_tx = max_tx_size; int start_tx, end_tx; - - const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs); + const int tx_size_ctx = 
get_tx_size_context(xd); assert(skip_prob > 0); s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); if (cm->tx_mode == TX_MODE_SELECT) { start_tx = max_tx_size; - end_tx = 0; + end_tx = VPXMAX(start_tx - cpi->sf.tx_size_search_depth, 0); + if (bs > BLOCK_32X32) end_tx = VPXMIN(end_tx + 1, start_tx); } else { TX_SIZE chosen_tx_size = VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]); @@ -865,15 +862,9 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, } for (n = start_tx; n >= end_tx; n--) { - int r_tx_size = 0; - for (m = 0; m <= n - (n == (int)max_tx_size); m++) { - if (m == n) - r_tx_size += vp9_cost_zero(tx_probs[m]); - else - r_tx_size += vp9_cost_one(tx_probs[m]); - } - txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0, - bs, n, cpi->sf.use_fast_coef_costing); + const int r_tx_size = cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n]; + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], best_rd, 0, bs, n, + cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; if (r[n][0] < INT_MAX) { r[n][1] += r_tx_size; @@ -1466,11 +1457,11 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, if (is_compound) this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int; break; - case ZEROMV: + default: + assert(mode == ZEROMV); this_mv[0].as_int = 0; if (is_compound) this_mv[1].as_int = 0; break; - default: break; } mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int; @@ -1829,8 +1820,8 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, bestsme = cpi->find_fractional_mv_step( x, &tmp_mv, &ref_mv[id].as_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], 0, - cpi->sf.mv.subpel_iters_per_step, NULL, x->nmvjointcost, x->mvcost, - &dis, &sse, second_pred, pw, ph); + cpi->sf.mv.subpel_search_level, NULL, x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, pw, ph, cpi->sf.use_accurate_subpel_search); } // Restore the 
pointer to the first (possibly scaled) prediction buffer. @@ -1884,6 +1875,8 @@ static int64_t rd_pick_best_sub8x8_mode( const BLOCK_SIZE bsize = mi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int pw = num_4x4_blocks_wide << 2; + const int ph = num_4x4_blocks_high << 2; ENTROPY_CONTEXT t_above[2], t_left[2]; int subpelmv = 1, have_ref = 0; SPEED_FEATURES *const sf = &cpi->sf; @@ -1992,8 +1985,11 @@ static int64_t rd_pick_best_sub8x8_mode( mvp_full.col = bsi->mvp.as_mv.col >> 3; if (sf->adaptive_motion_search) { - mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3; - mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3; + if (x->pred_mv[mi->ref_frame[0]].row != INT16_MAX && + x->pred_mv[mi->ref_frame[0]].col != INT16_MAX) { + mvp_full.row = x->pred_mv[mi->ref_frame[0]].row >> 3; + mvp_full.col = x->pred_mv[mi->ref_frame[0]].col >> 3; + } step_param = VPXMAX(step_param, 8); } @@ -2015,16 +2011,16 @@ static int64_t rd_pick_best_sub8x8_mode( cpi->find_fractional_mv_step( x, new_mv, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], sf->mv.subpel_force_stop, - sf->mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), + sf->mv.subpel_search_level, cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost, &distortion, - &x->pred_sse[mi->ref_frame[0]], NULL, 0, 0); + &x->pred_sse[mi->ref_frame[0]], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); // save motion search result for use in compound prediction seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv; } - if (sf->adaptive_motion_search) - x->pred_mv[mi->ref_frame[0]] = *new_mv; + x->pred_mv[mi->ref_frame[0]] = *new_mv; // restore src pointers mi_buf_restore(x, orig_src, orig_pre); @@ -2319,6 +2315,61 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, block_size); } +#if CONFIG_NON_GREEDY_MV +#define MAX_PREV_NB_FULL_MV_NUM 8 +static int 
find_prev_nb_full_mvs(const VP9_COMMON *cm, const MACROBLOCKD *xd, + int ref_frame, BLOCK_SIZE bsize, int mi_row, + int mi_col, int_mv *nb_full_mvs) { + int i; + const TileInfo *tile = &xd->tile; + int full_mv_num = 0; + assert(bsize >= BLOCK_8X8); + for (i = 0; i < MVREF_NEIGHBOURS; ++i) { + const POSITION *mv_ref = &mv_ref_blocks[bsize][i]; + if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) { + const MODE_INFO *nb_mi = + xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride]; + if (nb_mi->sb_type >= BLOCK_8X8) { + if (nb_mi->ref_frame[0] == ref_frame) { + nb_full_mvs[full_mv_num].as_mv = get_full_mv(&nb_mi->mv[0].as_mv); + ++full_mv_num; + if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) { + return full_mv_num; + } + } else if (nb_mi->ref_frame[1] == ref_frame) { + nb_full_mvs[full_mv_num].as_mv = get_full_mv(&nb_mi->mv[1].as_mv); + ++full_mv_num; + if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) { + return full_mv_num; + } + } + } else { + int j; + for (j = 0; j < 4; ++j) { + // TODO(angiebird): avoid using duplicated mvs + if (nb_mi->ref_frame[0] == ref_frame) { + nb_full_mvs[full_mv_num].as_mv = + get_full_mv(&nb_mi->bmi[j].as_mv[0].as_mv); + ++full_mv_num; + if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) { + return full_mv_num; + } + } else if (nb_mi->ref_frame[1] == ref_frame) { + nb_full_mvs[full_mv_num].as_mv = + get_full_mv(&nb_mi->bmi[j].as_mv[1].as_mv); + ++full_mv_num; + if (full_mv_num >= MAX_PREV_NB_FULL_MV_NUM) { + return full_mv_num; + } + } + } + } + } + } + return full_mv_num; +} +#endif // CONFIG_NON_GREEDY_MV + static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv) { @@ -2326,19 +2377,33 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const VP9_COMMON *cm = &cpi->common; MODE_INFO *mi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; - int bestsme = INT_MAX; int step_param; - int sadpb = x->sadperbit16; MV mvp_full; int 
ref = mi->ref_frame[0]; MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv; const MvLimits tmp_mv_limits = x->mv_limits; int cost_list[5]; - + const int best_predmv_idx = x->mv_best_ref_index[ref]; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); - + const int pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; MV pred_mv[3]; + +#if CONFIG_NON_GREEDY_MV + double mv_dist = 0; + double mv_cost = 0; + double lambda = (pw * ph) / 4; + double bestsme; + int_mv nb_full_mvs[MAX_PREV_NB_FULL_MV_NUM]; + + const int nb_full_mv_num = + find_prev_nb_full_mvs(cm, xd, ref, bsize, mi_row, mi_col, nb_full_mvs); +#else // CONFIG_NON_GREEDY_MV + int bestsme = INT_MAX; + int sadpb = x->sadperbit16; +#endif // CONFIG_NON_GREEDY_MV + pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv; pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv; pred_mv[2] = x->pred_mv[ref]; @@ -2367,7 +2432,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) { - int boffset = + const int boffset = 2 * (b_width_log2_lookup[BLOCK_64X64] - VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize])); step_param = VPXMAX(step_param, boffset); @@ -2385,8 +2450,8 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int i; for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) { if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) { - x->pred_mv[ref].row = 0; - x->pred_mv[ref].col = 0; + x->pred_mv[ref].row = INT16_MAX; + x->pred_mv[ref].col = INT16_MAX; tmp_mv->as_int = INVALID_MV; if (scaled_ref_frame) { @@ -2404,14 +2469,69 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // after full-pixel motion search. 
vp9_set_mv_search_range(&x->mv_limits, &ref_mv); - mvp_full = pred_mv[x->mv_best_ref_index[ref]]; - + mvp_full = pred_mv[best_predmv_idx]; mvp_full.col >>= 3; mvp_full.row >>= 3; +#if CONFIG_NON_GREEDY_MV + bestsme = vp9_full_pixel_diamond_new( + cpi, x, &mvp_full, step_param, lambda, 1, &cpi->fn_ptr[bsize], + nb_full_mvs, nb_full_mv_num, &tmp_mv->as_mv, &mv_dist, &mv_cost); +#else // CONFIG_NON_GREEDY_MV bestsme = vp9_full_pixel_search( cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, cond_cost_list(cpi, cost_list), &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); +#endif // CONFIG_NON_GREEDY_MV + + if (cpi->sf.enhanced_full_pixel_motion_search) { + int i; + for (i = 0; i < 3; ++i) { +#if CONFIG_NON_GREEDY_MV + double this_me; +#else // CONFIG_NON_GREEDY_MV + int this_me; +#endif // CONFIG_NON_GREEDY_MV + MV this_mv; + int diff_row; + int diff_col; + int step; + + if (pred_mv[i].row == INT16_MAX || pred_mv[i].col == INT16_MAX) continue; + if (i == best_predmv_idx) continue; + + diff_row = ((int)pred_mv[i].row - + pred_mv[i > 0 ? (i - 1) : best_predmv_idx].row) >> + 3; + diff_col = ((int)pred_mv[i].col - + pred_mv[i > 0 ? 
(i - 1) : best_predmv_idx].col) >> + 3; + if (diff_row == 0 && diff_col == 0) continue; + if (diff_row < 0) diff_row = -diff_row; + if (diff_col < 0) diff_col = -diff_col; + step = get_msb((diff_row + diff_col + 1) >> 1); + if (step <= 0) continue; + + mvp_full = pred_mv[i]; + mvp_full.col >>= 3; + mvp_full.row >>= 3; +#if CONFIG_NON_GREEDY_MV + this_me = vp9_full_pixel_diamond_new( + cpi, x, &mvp_full, VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), + lambda, 1, &cpi->fn_ptr[bsize], nb_full_mvs, nb_full_mv_num, &this_mv, + &mv_dist, &mv_cost); +#else // CONFIG_NON_GREEDY_MV + this_me = vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, + VPXMAX(step_param, MAX_MVSEARCH_STEPS - step), + cpi->sf.mv.search_method, sadpb, cond_cost_list(cpi, cost_list), + &ref_mv, &this_mv, INT_MAX, 1); +#endif // CONFIG_NON_GREEDY_MV + if (this_me < bestsme) { + tmp_mv->as_mv = this_mv; + bestsme = this_me; + } + } + } x->mv_limits = tmp_mv_limits; @@ -2420,13 +2540,14 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop, - cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list), - x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, 0, 0); + cpi->sf.mv.subpel_search_level, cond_cost_list(cpi, cost_list), + x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, pw, ph, + cpi->sf.use_accurate_subpel_search); } *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv, x->nmvjointcost, x->mvcost, MV_COST_WEIGHT); - if (cpi->sf.adaptive_motion_search) x->pred_mv[ref] = tmp_mv->as_mv; + x->pred_mv[ref] = tmp_mv->as_mv; if (scaled_ref_frame) { int i; @@ -2771,7 +2892,7 @@ static int64_t handle_inter_mode( memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm)); memcpy(x->bsse, bsse, sizeof(bsse)); - if (!skip_txfm_sb) { + if (!skip_txfm_sb || xd->lossless) { int skippable_y, skippable_uv; int64_t sseuv 
= INT64_MAX; int64_t rdcosty = INT64_MAX; @@ -2898,7 +3019,7 @@ static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - if (source_variance > 0) { + if (source_variance > 100) { rec_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize, xd->bd); src_variance = source_variance; @@ -2909,7 +3030,7 @@ static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd); } } else { - if (source_variance > 0) { + if (source_variance > 100) { rec_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); src_variance = source_variance; @@ -2919,7 +3040,7 @@ static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, } } #else - if (source_variance > 0) { + if (source_variance > 100) { rec_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); src_variance = source_variance; } else { @@ -3066,17 +3187,19 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, const int intra_cost_penalty = vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int best_skip2 = 0; - uint8_t ref_frame_skip_mask[2] = { 0 }; + uint8_t ref_frame_skip_mask[2] = { 0, 1 }; uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 }; int mode_skip_start = sf->mode_skip_start + 1; const int *const rd_threshes = rd_opt->threshes[segment_id][bsize]; const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize]; int64_t mode_threshold[MAX_MODES]; - int *tile_mode_map = tile_data->mode_map[bsize]; - int mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid - // lock mechanism involved with reads from - // tile_mode_map + int8_t *tile_mode_map = tile_data->mode_map[bsize]; + int8_t mode_map[MAX_MODES]; // Maintain mode_map information locally to avoid + // lock mechanism involved with reads from + // tile_mode_map const int 
mode_search_skip_flags = sf->mode_search_skip_flags; + const int is_rect_partition = + num_4x4_blocks_wide_lookup[bsize] != num_4x4_blocks_high_lookup[bsize]; int64_t mask_filter = 0; int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS]; @@ -3105,7 +3228,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; - if (cpi->ref_frame_flags & flag_list[ref_frame]) { + if ((cpi->ref_frame_flags & flag_list[ref_frame]) && + !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) { assert(get_ref_frame_buffer(cpi, ref_frame) != NULL); setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); @@ -3228,18 +3352,21 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_zero(x->sum_y_eobs); + if (is_rect_partition) { + if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue; + if (second_ref_frame > 0 && + (ctx->skip_ref_frame_mask & (1 << second_ref_frame))) + continue; + } + // Look at the reference frame of the best mode so far and set the // skip mask to look at a subset of the remaining modes. 
if (midx == mode_skip_start && best_mode_index >= 0) { switch (best_mbmode.ref_frame[0]) { case INTRA_FRAME: break; - case LAST_FRAME: - ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; - break; + case LAST_FRAME: ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK; break; case GOLDEN_FRAME: ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; - ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK; break; case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break; case NONE: @@ -3313,6 +3440,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; + if (cm->ref_frame_sign_bias[ref_frame] == + cm->ref_frame_sign_bias[second_ref_frame]) + continue; + // Skip compound inter modes if ARF is not available. if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; @@ -3616,9 +3747,13 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { - // If adaptive interp filter is enabled, then the current leaf node of 8x8 - // data is needed for sub8x8. Hence preserve the context. +// If adaptive interp filter is enabled, then the current leaf node of 8x8 +// data is needed for sub8x8. Hence preserve the context. +#if CONFIG_CONSISTENT_RECODE + if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; +#else if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; +#endif rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; @@ -3894,7 +4029,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, #if CONFIG_BETTER_HW_COMPATIBILITY // forbid 8X4 and 4X8 partitions if any reference frame is scaled. 
if (bsize == BLOCK_8X4 || bsize == BLOCK_4X8) { - int ref_scaled = vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf); + int ref_scaled = ref_frame > INTRA_FRAME && + vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf); if (second_ref_frame > INTRA_FRAME) ref_scaled += vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf); if (ref_scaled) continue; @@ -3940,6 +4076,11 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; + + if (cm->ref_frame_sign_bias[ref_frame] == + cm->ref_frame_sign_bias[second_ref_frame]) + continue; + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; // Do not allow compound prediction if the segment level reference frame // feature is in use as in this case there can only be one reference. diff --git a/libvpx/vp9/encoder/vp9_rdopt.h b/libvpx/vp9/encoder/vp9_rdopt.h index 795c91aef..8b810bc47 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.h +++ b/libvpx/vp9/encoder/vp9_rdopt.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_RDOPT_H_ -#define VP9_ENCODER_VP9_RDOPT_H_ +#ifndef VPX_VP9_ENCODER_VP9_RDOPT_H_ +#define VPX_VP9_ENCODER_VP9_RDOPT_H_ #include "vp9/common/vp9_blockd.h" @@ -56,4 +56,4 @@ void vp9_rd_pick_inter_mode_sub8x8(struct VP9_COMP *cpi, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RDOPT_H_ +#endif // VPX_VP9_ENCODER_VP9_RDOPT_H_ diff --git a/libvpx/vp9/encoder/vp9_resize.c b/libvpx/vp9/encoder/vp9_resize.c index f6c4aad4d..23a320ae5 100644 --- a/libvpx/vp9/encoder/vp9_resize.c +++ b/libvpx/vp9/encoder/vp9_resize.c @@ -424,11 +424,11 @@ void vp9_resize_plane(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { int i; - uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height); + uint8_t *intbuf = (uint8_t *)calloc(width2 * height, sizeof(*intbuf)); uint8_t *tmpbuf = - (uint8_t *)malloc(sizeof(uint8_t) * (width < height ? height : width)); - uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * height); - uint8_t *arrbuf2 = (uint8_t *)malloc(sizeof(uint8_t) * height2); + (uint8_t *)calloc(width < height ? 
height : width, sizeof(*tmpbuf)); + uint8_t *arrbuf = (uint8_t *)calloc(height, sizeof(*arrbuf)); + uint8_t *arrbuf2 = (uint8_t *)calloc(height2, sizeof(*arrbuf2)); if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; assert(width > 0); @@ -720,6 +720,10 @@ void vp9_highbd_resize_plane(const uint8_t *const input, int height, int width, uint16_t *arrbuf2 = (uint16_t *)malloc(sizeof(uint16_t) * height2); if (intbuf == NULL || tmpbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) goto Error; + assert(width > 0); + assert(height > 0); + assert(width2 > 0); + assert(height2 > 0); for (i = 0; i < height; ++i) { highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * i), width, intbuf + width2 * i, width2, tmpbuf, bd); diff --git a/libvpx/vp9/encoder/vp9_resize.h b/libvpx/vp9/encoder/vp9_resize.h index d3282ee19..5d4ce97eb 100644 --- a/libvpx/vp9/encoder/vp9_resize.h +++ b/libvpx/vp9/encoder/vp9_resize.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_RESIZE_H_ -#define VP9_ENCODER_VP9_RESIZE_H_ +#ifndef VPX_VP9_ENCODER_VP9_RESIZE_H_ +#define VPX_VP9_ENCODER_VP9_RESIZE_H_ #include <stdio.h> #include "vpx/vpx_integer.h" @@ -65,4 +65,4 @@ void vp9_highbd_resize_frame444(const uint8_t *const y, int y_stride, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_RESIZE_H_ +#endif // VPX_VP9_ENCODER_VP9_RESIZE_H_ diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c index 4a5a68e07..812d3fccd 100644 --- a/libvpx/vp9/encoder/vp9_segmentation.c +++ b/libvpx/vp9/encoder/vp9_segmentation.c @@ -46,6 +46,19 @@ void vp9_clear_segdata(struct segmentation *seg, int segment_id, seg->feature_data[segment_id][feature_id] = 0; } +void vp9_psnr_aq_mode_setup(struct segmentation *seg) { + int i; + + vp9_enable_segmentation(seg); + vp9_clearall_segfeatures(seg); + seg->abs_delta = SEGMENT_DELTADATA; + + for (i = 0; i < MAX_SEGMENTS; ++i) { + vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, 2 * (i - (MAX_SEGMENTS / 2))); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); + } +} + // Based on set of segment counts calculate a probability tree static void calc_segtree_probs(int *segcounts, vpx_prob *segment_tree_probs) { // Work out probabilities of each segment diff --git a/libvpx/vp9/encoder/vp9_segmentation.h b/libvpx/vp9/encoder/vp9_segmentation.h index 562805543..aa34dc88b 100644 --- a/libvpx/vp9/encoder/vp9_segmentation.h +++ b/libvpx/vp9/encoder/vp9_segmentation.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_SEGMENTATION_H_ -#define VP9_ENCODER_VP9_SEGMENTATION_H_ +#ifndef VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ +#define VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ #include "vp9/common/vp9_blockd.h" #include "vp9/encoder/vp9_encoder.h" @@ -26,6 +26,8 @@ void vp9_disable_segfeature(struct segmentation *seg, int segment_id, void vp9_clear_segdata(struct segmentation *seg, int segment_id, SEG_LVL_FEATURES feature_id); +void vp9_psnr_aq_mode_setup(struct segmentation *seg); + // The values given for each segment can be either deltas (from the default // value chosen for the frame) or absolute values. // @@ -47,4 +49,4 @@ void vp9_reset_segment_features(struct segmentation *seg); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SEGMENTATION_H_ +#endif // VPX_VP9_ENCODER_VP9_SEGMENTATION_H_ diff --git a/libvpx/vp9/encoder/vp9_skin_detection.h b/libvpx/vp9/encoder/vp9_skin_detection.h index 8880bff46..46a722af9 100644 --- a/libvpx/vp9/encoder/vp9_skin_detection.h +++ b/libvpx/vp9/encoder/vp9_skin_detection.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_ENCODER_VP9_SKIN_MAP_H_ -#define VP9_ENCODER_VP9_SKIN_MAP_H_ +#ifndef VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ +#define VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ #include "vp9/common/vp9_blockd.h" #include "vpx_dsp/skin_detection.h" @@ -37,4 +37,4 @@ void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SKIN_MAP_H_ +#endif // VPX_VP9_ENCODER_VP9_SKIN_DETECTION_H_ diff --git a/libvpx/vp9/encoder/vp9_speed_features.c b/libvpx/vp9/encoder/vp9_speed_features.c index a05db60c6..5aede927b 100644 --- a/libvpx/vp9/encoder/vp9_speed_features.c +++ b/libvpx/vp9/encoder/vp9_speed_features.c @@ -32,7 +32,7 @@ static MESH_PATTERN // Intra only frames, golden frames (except alt ref overlays) and // alt ref frames tend to be coded at a higher than ambient quality static int frame_is_boosted(const VP9_COMP *cpi) { - return frame_is_kf_gf_arf(cpi) || vp9_is_upper_layer_key_frame(cpi); + return frame_is_kf_gf_arf(cpi); } // Sets a partition size down to which the auto partition code will always @@ -61,46 +61,92 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed) { VP9_COMMON *const cm = &cpi->common; + const int min_frame_size = VPXMIN(cm->width, cm->height); + const int is_480p_or_larger = min_frame_size >= 480; + const int is_720p_or_larger = min_frame_size >= 720; + const int is_1080p_or_larger = min_frame_size >= 1080; + const int is_2160p_or_larger = min_frame_size >= 2160; // speed 0 features sf->partition_search_breakout_thr.dist = (1 << 20); sf->partition_search_breakout_thr.rate = 80; + sf->use_square_only_thresh_high = BLOCK_SIZES; + sf->use_square_only_thresh_low = BLOCK_4X4; - // Currently, the machine-learning based partition search early termination - // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. 
- if (VPXMIN(cm->width, cm->height) >= 480) { + if (is_480p_or_larger) { + // Currently, the machine-learning based partition search early termination + // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. sf->ml_partition_search_early_termination = 1; + } else { + sf->use_square_only_thresh_high = BLOCK_32X32; + } + + if (!is_1080p_or_larger) { + sf->use_ml_partition_search_breakout = 1; + if (is_720p_or_larger) { + sf->ml_partition_search_breakout_thresh[0] = 0.0f; + sf->ml_partition_search_breakout_thresh[1] = 0.0f; + sf->ml_partition_search_breakout_thresh[2] = 0.0f; + } else { + sf->ml_partition_search_breakout_thresh[0] = 2.5f; + sf->ml_partition_search_breakout_thresh[1] = 1.5f; + sf->ml_partition_search_breakout_thresh[2] = 1.5f; + } } if (speed >= 1) { sf->ml_partition_search_early_termination = 0; - - if (VPXMIN(cm->width, cm->height) >= 720) { + sf->use_ml_partition_search_breakout = 1; + if (is_480p_or_larger) + sf->use_square_only_thresh_high = BLOCK_64X64; + else + sf->use_square_only_thresh_high = BLOCK_32X32; + sf->use_square_only_thresh_low = BLOCK_16X16; + if (is_720p_or_larger) { sf->disable_split_mask = cm->show_frame ? 
DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; - sf->partition_search_breakout_thr.dist = (1 << 23); + sf->partition_search_breakout_thr.dist = (1 << 22); + sf->ml_partition_search_breakout_thresh[0] = -5.0f; + sf->ml_partition_search_breakout_thresh[1] = -5.0f; + sf->ml_partition_search_breakout_thresh[2] = -9.0f; } else { sf->disable_split_mask = DISABLE_COMPOUND_SPLIT; sf->partition_search_breakout_thr.dist = (1 << 21); + sf->ml_partition_search_breakout_thresh[0] = -1.0f; + sf->ml_partition_search_breakout_thresh[1] = -1.0f; + sf->ml_partition_search_breakout_thresh[2] = -1.0f; + } +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) { + sf->ml_partition_search_breakout_thresh[0] -= 1.0f; + sf->ml_partition_search_breakout_thresh[1] -= 1.0f; + sf->ml_partition_search_breakout_thresh[2] -= 1.0f; } +#endif // CONFIG_VP9_HIGHBITDEPTH } if (speed >= 2) { - if (VPXMIN(cm->width, cm->height) >= 720) { + sf->use_square_only_thresh_high = BLOCK_4X4; + sf->use_square_only_thresh_low = BLOCK_SIZES; + if (is_720p_or_larger) { sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT; sf->adaptive_pred_interp_filter = 0; sf->partition_search_breakout_thr.dist = (1 << 24); sf->partition_search_breakout_thr.rate = 120; + sf->use_ml_partition_search_breakout = 0; } else { sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY; sf->partition_search_breakout_thr.dist = (1 << 22); sf->partition_search_breakout_thr.rate = 100; + sf->ml_partition_search_breakout_thresh[0] = 0.0f; + sf->ml_partition_search_breakout_thresh[1] = -1.0f; + sf->ml_partition_search_breakout_thresh[2] = -4.0f; } sf->rd_auto_partition_min_limit = set_partition_min_limit(cm); // Use a set of speed features for 4k videos. 
- if (VPXMIN(cm->width, cm->height) >= 2160) { + if (is_2160p_or_larger) { sf->use_square_partition_only = 1; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; @@ -112,7 +158,8 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, } if (speed >= 3) { - if (VPXMIN(cm->width, cm->height) >= 720) { + sf->use_ml_partition_search_breakout = 0; + if (is_720p_or_larger) { sf->disable_split_mask = DISABLE_ALL_SPLIT; sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0; sf->partition_search_breakout_thr.dist = (1 << 25); @@ -137,7 +184,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, if (speed >= 4) { sf->partition_search_breakout_thr.rate = 300; - if (VPXMIN(cm->width, cm->height) >= 720) { + if (is_720p_or_larger) { sf->partition_search_breakout_thr.dist = (1 << 26); } else { sf->partition_search_breakout_thr.dist = (1 << 24); @@ -166,28 +213,40 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; - sf->use_square_partition_only = !frame_is_boosted(cpi); - sf->use_square_only_threshold = BLOCK_16X16; + sf->use_square_partition_only = !boosted; + sf->prune_ref_frame_for_rect_partitions = 1; + sf->ml_var_partition_pruning = 1; + + sf->ml_prune_rect_partition_threhold[0] = -1; + sf->ml_prune_rect_partition_threhold[1] = 350; + sf->ml_prune_rect_partition_threhold[2] = 325; + sf->ml_prune_rect_partition_threhold[3] = 250; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { sf->exhaustive_searches_thresh = (1 << 22); - for (i = 0; i < MAX_MESH_STEP; ++i) { - int mesh_density_level = 0; - sf->mesh_patterns[i].range = - good_quality_mesh_patterns[mesh_density_level][i].range; - sf->mesh_patterns[i].interval = - good_quality_mesh_patterns[mesh_density_level][i].interval; - } } else { sf->exhaustive_searches_thresh = INT_MAX; } + for (i = 0; i < MAX_MESH_STEP; ++i) { + 
const int mesh_density_level = 0; + sf->mesh_patterns[i].range = + good_quality_mesh_patterns[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + good_quality_mesh_patterns[mesh_density_level][i].interval; + } + if (speed >= 1) { + sf->ml_var_partition_pruning = !boosted; + sf->ml_prune_rect_partition_threhold[1] = 200; + sf->ml_prune_rect_partition_threhold[2] = 200; + sf->ml_prune_rect_partition_threhold[3] = 200; + if (oxcf->pass == 2) { TWO_PASS *const twopass = &cpi->twopass; if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) || vp9_internal_image_edge(cpi)) { - sf->use_square_partition_only = !frame_is_boosted(cpi); + sf->use_square_partition_only = !boosted; } else { sf->use_square_partition_only = !frame_is_intra_only(cm); } @@ -199,15 +258,12 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5]; sf->allow_quant_coeff_opt = sf->optimize_coefficients; sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? speed : 5]; - - sf->use_square_only_threshold = BLOCK_4X4; sf->less_rectangular_check = 1; - sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; - sf->mv.subpel_iters_per_step = 1; + sf->mv.subpel_search_level = 1; sf->mode_skip_start = 10; sf->adaptive_pred_interp_filter = 1; sf->allow_acl = 0; @@ -223,9 +279,11 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 
(1 << 23) : INT_MAX; + sf->use_accurate_subpel_search = USE_4_TAPS; } if (speed >= 2) { + sf->ml_var_partition_pruning = 0; if (oxcf->vbr_corpus_complexity) sf->recode_loop = ALLOW_RECODE_FIRST; else @@ -247,6 +305,12 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; + sf->enhanced_full_pixel_motion_search = 0; + sf->prune_ref_frame_for_rect_partitions = 0; + sf->ml_prune_rect_partition_threhold[1] = -1; + sf->ml_prune_rect_partition_threhold[2] = -1; + sf->ml_prune_rect_partition_threhold[3] = -1; + sf->mv.subpel_search_level = 0; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { @@ -257,6 +321,8 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, good_quality_mesh_patterns[mesh_density_level][i].interval; } } + + sf->use_accurate_subpel_search = USE_2_TAPS; } if (speed >= 3) { @@ -358,6 +424,7 @@ static void set_rt_speed_feature_framesize_dependent(VP9_COMP *cpi, static void set_rt_speed_feature_framesize_independent( VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, vp9e_tune_content content) { VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; const int is_keyframe = cm->frame_type == KEY_FRAME; const int frames_since_key = is_keyframe ? 
0 : cpi->rc.frames_since_key; sf->static_segmentation = 0; @@ -374,6 +441,12 @@ static void set_rt_speed_feature_framesize_independent( sf->use_compound_nonrd_pickmode = 0; sf->nonrd_keyframe = 0; sf->svc_use_lowres_part = 0; + sf->overshoot_detection_cbr_rt = NO_DETECTION; + sf->disable_16x16part_nonkey = 0; + sf->disable_golden_ref = 0; + sf->enable_tpl_model = 0; + sf->enhanced_full_pixel_motion_search = 0; + sf->use_accurate_subpel_search = USE_2_TAPS; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -407,7 +480,7 @@ static void set_rt_speed_feature_framesize_independent( // Reference masking only enabled for 1 spatial layer, and if none of the // references have been scaled. The latter condition needs to be checked // for external or internal dynamic resize. - sf->reference_masking = (cpi->svc.number_spatial_layers == 1); + sf->reference_masking = (svc->number_spatial_layers == 1); if (sf->reference_masking == 1 && (cpi->external_resize == 1 || cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) { @@ -440,7 +513,7 @@ static void set_rt_speed_feature_framesize_independent( sf->disable_filter_search_var_thresh = 100; sf->use_uv_intra_rd_estimate = 1; sf->skip_encode_sb = 1; - sf->mv.subpel_iters_per_step = 1; + sf->mv.subpel_search_level = 0; sf->adaptive_rd_thresh = 4; sf->mode_skip_start = 6; sf->allow_skip_recode = 0; @@ -460,7 +533,7 @@ static void set_rt_speed_feature_framesize_independent( sf->adjust_partitioning_from_last_frame = cm->last_frame_type != cm->frame_type || (0 == (frames_since_key + 1) % sf->last_partitioning_redo_frequency); - sf->mv.subpel_force_stop = 1; + sf->mv.subpel_force_stop = QUARTER_PEL; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_DC_H_V; sf->intra_uv_mode_mask[i] = INTRA_DC; @@ -489,6 +562,16 @@ static void set_rt_speed_feature_framesize_independent( (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); sf->max_delta_qindex = is_keyframe ? 
20 : 15; sf->partition_search_type = REFERENCE_PARTITION; +#if CONFIG_ML_VAR_PARTITION + if (!frame_is_intra_only(cm) && cm->width >= 360 && cm->height >= 360) + sf->partition_search_type = ML_BASED_PARTITION; + else + sf->partition_search_type = REFERENCE_PARTITION; +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) + sf->partition_search_type = REFERENCE_PARTITION; +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // CONFIG_ML_VAR_PARTITION if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && cpi->rc.is_src_frame_alt_ref) { sf->partition_search_type = VAR_BASED_PARTITION; @@ -531,6 +614,17 @@ static void set_rt_speed_feature_framesize_independent( sf->limit_newmv_early_exit = 1; if (!cpi->use_svc) sf->bias_golden = 1; } + // Keep nonrd_keyframe = 1 for non-base spatial layers to prevent + // increase in encoding time. + if (cpi->use_svc && svc->spatial_layer_id > 0) sf->nonrd_keyframe = 1; + if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG && + cpi->oxcf.rc_mode == VPX_CBR) + sf->overshoot_detection_cbr_rt = FAST_DETECTION_MAXQ; + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cm->width <= 1280 && cm->height <= 720) { + sf->use_altref_onepass = 1; + sf->use_compound_nonrd_pickmode = 1; + } } if (speed >= 6) { @@ -538,9 +632,16 @@ static void set_rt_speed_feature_framesize_independent( sf->use_altref_onepass = 1; sf->use_compound_nonrd_pickmode = 1; } +#if CONFIG_ML_VAR_PARTITION + if (frame_is_intra_only(cm) || cm->width < 360 || cm->height < 360) + sf->partition_search_type = VAR_BASED_PARTITION; +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->Source->flags & YV12_FLAG_HIGHBITDEPTH) + sf->partition_search_type = VAR_BASED_PARTITION; +#endif // CONFIG_VP9_HIGHBITDEPTH +#else sf->partition_search_type = VAR_BASED_PARTITION; - // Turn on this to use non-RD key frame coding mode. 
- sf->use_nonrd_pick_mode = 1; +#endif // CONFIG_ML_VAR_PARTITION sf->mv.search_method = NSTEP; sf->mv.reduce_first_step_size = 1; sf->skip_encode_sb = 0; @@ -553,7 +654,7 @@ static void set_rt_speed_feature_framesize_independent( (cm->width * cm->height <= 640 * 360) ? 40000 : 60000; if (cpi->content_state_sb_fd == NULL && (!cpi->use_svc || - cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + svc->spatial_layer_id == svc->number_spatial_layers - 1)) { cpi->content_state_sb_fd = (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t)); } @@ -562,7 +663,7 @@ static void set_rt_speed_feature_framesize_independent( // Enable short circuit for low temporal variance. sf->short_circuit_low_temp_var = 1; } - if (cpi->svc.temporal_layer_id > 0) { + if (svc->temporal_layer_id > 0) { sf->adaptive_rd_thresh = 4; sf->limit_newmv_early_exit = 0; sf->base_mv_aggressive = 1; @@ -576,16 +677,15 @@ static void set_rt_speed_feature_framesize_independent( sf->mv.fullpel_search_step_param = 10; // For SVC: use better mv search on base temporal layer, and only // on base spatial layer if highest resolution is above 640x360. 
- if (cpi->svc.number_temporal_layers > 2 && - cpi->svc.temporal_layer_id == 0 && - (cpi->svc.spatial_layer_id == 0 || + if (svc->number_temporal_layers > 2 && svc->temporal_layer_id == 0 && + (svc->spatial_layer_id == 0 || cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) { sf->mv.search_method = NSTEP; sf->mv.fullpel_search_step_param = 6; } - if (cpi->svc.temporal_layer_id > 0 || cpi->svc.spatial_layer_id > 1) { + if (svc->temporal_layer_id > 0 || svc->spatial_layer_id > 1) { sf->use_simple_block_yrd = 1; - if (cpi->svc.non_reference_frame) + if (svc->non_reference_frame) sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE; } if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1) @@ -596,22 +696,29 @@ static void set_rt_speed_feature_framesize_independent( if (!cpi->last_frame_dropped && cpi->resize_state == ORIG && !cpi->external_resize && (!cpi->use_svc || - cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { + (svc->spatial_layer_id == svc->number_spatial_layers - 1 && + !svc->last_layer_dropped[svc->number_spatial_layers - 1]))) { sf->copy_partition_flag = 1; cpi->max_copied_frame = 2; // The top temporal enhancement layer (for number of temporal layers > 1) // are non-reference frames, so use large/max value for max_copied_frame. - if (cpi->svc.number_temporal_layers > 1 && - cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1) + if (svc->number_temporal_layers > 1 && + svc->temporal_layer_id == svc->number_temporal_layers - 1) cpi->max_copied_frame = 255; } // For SVC: enable use of lower resolution partition for higher resolution, // only for 3 spatial layers and when config/top resolution is above VGA. // Enable only for non-base temporal layer frames. 
- if (cpi->use_svc && cpi->svc.number_spatial_layers == 3 && - cpi->svc.temporal_layer_id > 0 && + if (cpi->use_svc && svc->use_partition_reuse && + svc->number_spatial_layers == 3 && svc->temporal_layer_id > 0 && cpi->oxcf.width * cpi->oxcf.height > 640 * 480) sf->svc_use_lowres_part = 1; + // For SVC when golden is used as second temporal reference: to avoid + // encode time increase only use this feature on base temporal layer. + // (i.e remove golden flag from frame_flags for temporal_layer_id > 0). + if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && + svc->temporal_layer_id > 0) + cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); } if (speed >= 8) { @@ -622,7 +729,7 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->row_mt && cpi->oxcf.max_threads > 1) sf->adaptive_rd_thresh_row_mt = 1; - if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = 3; + if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = FULL_PEL; if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; // Only keep INTRA_DC mode for speed 8. if (!is_keyframe) { @@ -652,6 +759,22 @@ static void set_rt_speed_feature_framesize_independent( sf->limit_newmv_early_exit = 0; sf->use_simple_block_yrd = 1; } + + if (speed >= 9) { + sf->mv.enable_adaptive_subpel_force_stop = 1; + sf->mv.adapt_subpel_force_stop.mv_thresh = 2; + if (cpi->rc.avg_frame_low_motion < 40) + sf->mv.adapt_subpel_force_stop.mv_thresh = 1; + sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL; + sf->mv.adapt_subpel_force_stop.force_stop_above = HALF_PEL; + // Disable partition blocks below 16x16, except for low-resolutions. + if (cm->frame_type != KEY_FRAME && cm->width >= 320 && cm->height >= 240) + sf->disable_16x16part_nonkey = 1; + // Allow for disabling GOLDEN reference, for CBR mode. 
+ if (cpi->oxcf.rc_mode == VPX_CBR) sf->disable_golden_ref = 1; + if (cpi->rc.avg_frame_low_motion < 65) sf->default_interp_filter = BILINEAR; + } + if (sf->use_altref_onepass) { if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) { sf->partition_search_type = FIXED_PARTITION; @@ -666,6 +789,19 @@ static void set_rt_speed_feature_framesize_independent( (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->count_lastgolden_frame_usage)); } + if (svc->previous_frame_is_intra_only) { + sf->partition_search_type = FIXED_PARTITION; + sf->always_this_block_size = BLOCK_64X64; + } + // Special case for screen content: increase motion search on base spatial + // layer when high motion is detected or previous SL0 frame was dropped. + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && cpi->oxcf.speed >= 5 && + (svc->high_num_blocks_with_motion || svc->last_layer_dropped[0])) { + sf->mv.search_method = NSTEP; + // TODO(marpan/jianj): Tune this setting for screensharing. For now use + // small step_param for all spatial layers. + sf->mv.fullpel_search_step_param = 2; + } } void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { @@ -679,6 +815,7 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { sf->partition_search_breakout_thr.dist = (1 << 19); sf->partition_search_breakout_thr.rate = 80; sf->ml_partition_search_early_termination = 0; + sf->use_ml_partition_search_breakout = 0; if (oxcf->mode == REALTIME) { set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed); @@ -710,12 +847,6 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && oxcf->max_threads > 1) sf->adaptive_rd_thresh = 0; - - // This is only used in motion vector unit test. 
- if (cpi->oxcf.motion_vector_unit_test == 1) - cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; - else if (cpi->oxcf.motion_vector_unit_test == 2) - cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; } void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { @@ -730,8 +861,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->mv.search_method = NSTEP; sf->recode_loop = ALLOW_RECODE_FIRST; sf->mv.subpel_search_method = SUBPEL_TREE; - sf->mv.subpel_iters_per_step = 2; - sf->mv.subpel_force_stop = 0; + sf->mv.subpel_search_level = 2; + sf->mv.subpel_force_stop = EIGHTH_PEL; sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf); sf->mv.reduce_first_step_size = 0; sf->coeff_prob_appx_step = 1; @@ -741,6 +872,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; + sf->enhanced_full_pixel_motion_search = 1; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; sf->cb_pred_filter_search = 0; @@ -752,7 +884,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->partition_search_type = SEARCH_PARTITION; sf->less_rectangular_check = 0; sf->use_square_partition_only = 0; - sf->use_square_only_threshold = BLOCK_SIZES; + sf->use_square_only_thresh_high = BLOCK_SIZES; + sf->use_square_only_thresh_low = BLOCK_4X4; sf->auto_min_max_partition_size = NOT_IN_USE; sf->rd_auto_partition_min_limit = BLOCK_4X4; sf->default_max_partition_size = BLOCK_64X64; @@ -771,6 +904,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->allow_quant_coeff_opt = sf->optimize_coefficients; sf->quant_opt_thresh = 99.0; sf->allow_acl = 1; + sf->enable_tpl_model = oxcf->enable_tpl_model; + sf->prune_ref_frame_for_rect_partitions = 0; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; @@ -804,10 +939,17 @@ void 
vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->limit_newmv_early_exit = 0; sf->bias_golden = 0; sf->base_mv_aggressive = 0; + sf->ml_prune_rect_partition_threhold[0] = -1; + sf->ml_prune_rect_partition_threhold[1] = -1; + sf->ml_prune_rect_partition_threhold[2] = -1; + sf->ml_prune_rect_partition_threhold[3] = -1; + sf->ml_var_partition_pruning = 0; + sf->use_accurate_subpel_search = USE_8_TAPS; // Some speed-up features even for best quality as minimal impact on quality. sf->adaptive_rd_thresh = 1; sf->tx_size_search_breakout = 1; + sf->tx_size_search_depth = 2; sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) @@ -837,7 +979,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->optimize_coefficients = 0; } - if (sf->mv.subpel_force_stop == 3) { + if (sf->mv.subpel_force_stop == FULL_PEL) { // Whole pel only cpi->find_fractional_mv_step = vp9_skip_sub_pixel_tree; } else if (sf->mv.subpel_search_method == SUBPEL_TREE) { @@ -850,6 +992,12 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { cpi->find_fractional_mv_step = vp9_find_best_sub_pixel_tree_pruned_evenmore; } + // This is only used in motion vector unit test. + if (cpi->oxcf.motion_vector_unit_test == 1) + cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; + else if (cpi->oxcf.motion_vector_unit_test == 2) + cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; + x->optimize = sf->optimize_coefficients == 1 && oxcf->pass != 1; x->min_partition_size = sf->default_min_partition_size; @@ -867,10 +1015,4 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && oxcf->max_threads > 1) sf->adaptive_rd_thresh = 0; - - // This is only used in motion vector unit test. 
- if (cpi->oxcf.motion_vector_unit_test == 1) - cpi->find_fractional_mv_step = vp9_return_max_sub_pixel_mv; - else if (cpi->oxcf.motion_vector_unit_test == 2) - cpi->find_fractional_mv_step = vp9_return_min_sub_pixel_mv; } diff --git a/libvpx/vp9/encoder/vp9_speed_features.h b/libvpx/vp9/encoder/vp9_speed_features.h index 50d52bc23..9b09ec474 100644 --- a/libvpx/vp9/encoder/vp9_speed_features.h +++ b/libvpx/vp9/encoder/vp9_speed_features.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SPEED_FEATURES_H_ -#define VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#ifndef VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#define VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ #include "vp9/common/vp9_enums.h" @@ -57,7 +57,8 @@ typedef enum { BIGDIA = 3, SQUARE = 4, FAST_HEX = 5, - FAST_DIAMOND = 6 + FAST_DIAMOND = 6, + MESH = 7 } SEARCH_METHODS; typedef enum { @@ -135,20 +136,25 @@ typedef enum { } INTERP_FILTER_MASK; typedef enum { - // Search partitions using RD/NONRD criterion + // Search partitions using RD/NONRD criterion. SEARCH_PARTITION, - // Always use a fixed size partition + // Always use a fixed size partition. FIXED_PARTITION, REFERENCE_PARTITION, // Use an arbitrary partitioning scheme based on source variance within - // a 64X64 SB + // a 64X64 SB. VAR_BASED_PARTITION, - // Use non-fixed partitions based on source variance - SOURCE_VAR_BASED_PARTITION + // Use non-fixed partitions based on source variance. + SOURCE_VAR_BASED_PARTITION, + +#if CONFIG_ML_VAR_PARTITION + // Make partition decisions with machine learning models. 
+ ML_BASED_PARTITION +#endif // CONFIG_ML_VAR_PARTITION } PARTITION_SEARCH_TYPE; typedef enum { @@ -161,6 +167,19 @@ typedef enum { ONE_LOOP_REDUCED = 1 } FAST_COEFF_UPDATE; +typedef enum { EIGHTH_PEL, QUARTER_PEL, HALF_PEL, FULL_PEL } SUBPEL_FORCE_STOP; + +typedef struct ADAPT_SUBPEL_FORCE_STOP { + // Threshold for full pixel motion vector; + int mv_thresh; + + // subpel_force_stop if full pixel MV is below the threshold. + SUBPEL_FORCE_STOP force_stop_below; + + // subpel_force_stop if full pixel MV is equal to or above the threshold. + SUBPEL_FORCE_STOP force_stop_above; +} ADAPT_SUBPEL_FORCE_STOP; + typedef struct MV_SPEED_FEATURES { // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc). SEARCH_METHODS search_method; @@ -179,15 +198,17 @@ typedef struct MV_SPEED_FEATURES { // the same process. Along the way it skips many diagonals. SUBPEL_SEARCH_METHODS subpel_search_method; - // Maximum number of steps in logarithmic subpel search before giving up. - int subpel_iters_per_step; + // Subpel MV search level. Can take values 0 - 2. Higher values mean more + // extensive subpel search. + int subpel_search_level; + + // When to stop subpel motion search. + SUBPEL_FORCE_STOP subpel_force_stop; + + // If it's enabled, different subpel_force_stop will be used for different MV. + int enable_adaptive_subpel_force_stop; - // Control when to stop subpel search: - // 0: Full subpel search. - // 1: Stop at quarter pixel. - // 2: Stop at half pixel. - // 3: Stop at full pixel. - int subpel_force_stop; + ADAPT_SUBPEL_FORCE_STOP adapt_subpel_force_stop; // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; @@ -205,6 +226,28 @@ typedef struct MESH_PATTERN { int interval; } MESH_PATTERN; +typedef enum { + // No reaction to rate control on a detected slide/scene change. + NO_DETECTION = 0, + + // Set to larger Q (max_q set by user) based only on the + // detected slide/scene change and current/past Q. 
+ FAST_DETECTION_MAXQ = 1, + + // Based on (first pass) encoded frame, if large frame size is detected + // then set to higher Q for the second re-encode. This involves 2 pass + // encoding on slide change, so slower than 1, but more accurate for + // detecting overshoot. + RE_ENCODE_MAXQ = 2 +} OVERSHOOT_DETECTION_CBR_RT; + +typedef enum { + USE_2_TAPS = 0, + USE_4_TAPS, + USE_8_TAPS, + USE_8_TAPS_SHARP, +} SUBPEL_SEARCH_TYPE; + typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -258,6 +301,9 @@ typedef struct SPEED_FEATURES { // alternate reference frames. int allow_acl; + // Temporal dependency model based encoding mode optimization + int enable_tpl_model; + // Use transform domain distortion. Use pixel domain distortion in speed 0 // and certain situations in higher speed to improve the RD model precision. int allow_txfm_domain_distortion; @@ -272,6 +318,9 @@ typedef struct SPEED_FEATURES { // for intra and model coefs for the rest. TX_SIZE_SEARCH_METHOD tx_size_search_method; + // How many levels of tx size to search, starting from the largest. + int tx_size_search_depth; + // Low precision 32x32 fdct keeps everything in 16 bits and thus is less // precise but significantly faster than the non lp version. int use_lp32x32fdct; @@ -293,9 +342,20 @@ typedef struct SPEED_FEATURES { // rd than partition type split. int less_rectangular_check; - // Disable testing non square partitions. (eg 16x32) + // Disable testing non square partitions(eg 16x32) for block sizes larger than + // use_square_only_thresh_high or smaller than use_square_only_thresh_low. int use_square_partition_only; - BLOCK_SIZE use_square_only_threshold; + BLOCK_SIZE use_square_only_thresh_high; + BLOCK_SIZE use_square_only_thresh_low; + + // Prune reference frames for rectangular partitions. + int prune_ref_frame_for_rect_partitions; + + // Threshold values used for ML based rectangular partition search pruning. + // If < 0, the feature is turned off. 
+ // Higher values mean more aggressiveness to skip rectangular partition + // search that results in better encoding speed but worse coding performance. + int ml_prune_rect_partition_threhold[4]; // Sets min and max partition sizes for this 64x64 region based on the // same 64x64 in last encoded frame, and the left and above neighbor. @@ -327,6 +387,9 @@ typedef struct SPEED_FEATURES { // point for this motion search and limits the search range around it. int adaptive_motion_search; + // Do extra full pixel motion search to obtain better motion vector. + int enhanced_full_pixel_motion_search; + // Threshold for allowing exhaistive motion search. int exhaustive_searches_thresh; @@ -448,9 +511,19 @@ typedef struct SPEED_FEATURES { // Partition search early breakout thresholds. PARTITION_SEARCH_BREAKOUT_THR partition_search_breakout_thr; + // Use ML-based partition search early breakout. + int use_ml_partition_search_breakout; + // Higher values mean more aggressiveness for partition search breakout that + // results in better encoding speed but worse compression performance. + float ml_partition_search_breakout_thresh[3]; + // Machine-learning based partition search early termination int ml_partition_search_early_termination; + // Machine-learning based partition search pruning using prediction residue + // variance. + int ml_var_partition_pruning; + // Allow skipping partition search for still image frame int allow_partition_search_skip; @@ -508,6 +581,20 @@ typedef struct SPEED_FEATURES { // For SVC: enables use of partition from lower spatial resolution. int svc_use_lowres_part; + + // Flag to indicate process for handling overshoot on slide/scene change, + // for real-time CBR mode. + OVERSHOOT_DETECTION_CBR_RT overshoot_detection_cbr_rt; + + // Disable partitioning of 16x16 blocks. + int disable_16x16part_nonkey; + + // Allow for disabling golden reference. 
+ int disable_golden_ref; + + // Allow sub-pixel search to use interpolation filters with different taps in + // order to achieve accurate motion search result. + SUBPEL_SEARCH_TYPE use_accurate_subpel_search; } SPEED_FEATURES; struct VP9_COMP; @@ -519,4 +606,4 @@ void vp9_set_speed_features_framesize_dependent(struct VP9_COMP *cpi); } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SPEED_FEATURES_H_ +#endif // VPX_VP9_ENCODER_VP9_SPEED_FEATURES_H_ diff --git a/libvpx/vp9/encoder/vp9_subexp.h b/libvpx/vp9/encoder/vp9_subexp.h index 26c89e2ea..f0d544b52 100644 --- a/libvpx/vp9/encoder/vp9_subexp.h +++ b/libvpx/vp9/encoder/vp9_subexp.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SUBEXP_H_ -#define VP9_ENCODER_VP9_SUBEXP_H_ +#ifndef VPX_VP9_ENCODER_VP9_SUBEXP_H_ +#define VPX_VP9_ENCODER_VP9_SUBEXP_H_ #ifdef __cplusplus extern "C" { @@ -37,4 +37,4 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SUBEXP_H_ +#endif // VPX_VP9_ENCODER_VP9_SUBEXP_H_ diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libvpx/vp9/encoder/vp9_svc_layercontext.c index 2636bd9a5..3223f714b 100644 --- a/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -19,6 +19,14 @@ #define SMALL_FRAME_WIDTH 32 #define SMALL_FRAME_HEIGHT 16 +static void swap_ptr(void *a, void *b) { + void **a_p = (void **)a; + void **b_p = (void **)b; + void *c = *a_p; + *a_p = *b_p; + *b_p = c; +} + void vp9_init_layer_context(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; @@ -29,24 +37,49 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->spatial_layer_id = 0; svc->temporal_layer_id = 0; - svc->first_spatial_layer_to_encode = 0; - svc->rc_drop_superframe = 0; svc->force_zero_mode_spatial_ref = 0; svc->use_base_mv = 0; + svc->use_partition_reuse = 0; + svc->use_gf_temporal_ref = 1; + 
svc->use_gf_temporal_ref_current_layer = 0; svc->scaled_temp_is_alloc = 0; svc->scaled_one_half = 0; svc->current_superframe = 0; svc->non_reference_frame = 0; - - for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; + svc->skip_enhancement_layer = 0; + svc->disable_inter_layer_pred = INTER_LAYER_PRED_ON; + svc->framedrop_mode = CONSTRAINED_LAYER_DROP; + svc->set_intra_only_frame = 0; + svc->previous_frame_is_intra_only = 0; + svc->superframe_has_layer_sync = 0; + svc->use_set_ref_frame_config = 0; + svc->num_encoded_top_layer = 0; + + for (i = 0; i < REF_FRAMES; ++i) { + svc->fb_idx_spatial_layer_id[i] = -1; + svc->fb_idx_temporal_layer_id[i] = -1; + svc->fb_idx_base[i] = 0; + } for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { + svc->last_layer_dropped[sl] = 0; + svc->drop_spatial_layer[sl] = 0; svc->ext_frame_flags[sl] = 0; - svc->ext_lst_fb_idx[sl] = 0; - svc->ext_gld_fb_idx[sl] = 1; - svc->ext_alt_fb_idx[sl] = 2; - svc->downsample_filter_type[sl] = EIGHTTAP; - svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter. + svc->lst_fb_idx[sl] = 0; + svc->gld_fb_idx[sl] = 1; + svc->alt_fb_idx[sl] = 2; + svc->downsample_filter_type[sl] = BILINEAR; + svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. 
+ svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; + svc->fb_idx_upd_tl0[sl] = -1; + svc->drop_count[sl] = 0; + svc->spatial_layer_sync[sl] = 0; } + svc->max_consec_drop = INT_MAX; + + svc->buffer_gf_temporal_ref[1].idx = 7; + svc->buffer_gf_temporal_ref[0].idx = 6; + svc->buffer_gf_temporal_ref[1].is_used = 0; + svc->buffer_gf_temporal_ref[0].is_used = 0; if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) { if (vpx_realloc_frame_buffer(&cpi->svc.empty_frame.img, SMALL_FRAME_WIDTH, @@ -84,6 +117,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lrc->ni_frames = 0; lrc->decimation_count = 0; lrc->decimation_factor = 0; + lrc->worst_quality = oxcf->worst_allowed_q; + lrc->best_quality = oxcf->best_allowed_q; for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { lrc->rate_correction_factors[i] = 1.0; @@ -122,6 +157,9 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { size_t consec_zero_mv_size; VP9_COMMON *const cm = &cpi->common; lc->sb_index = 0; + lc->actual_num_seg1_blocks = 0; + lc->actual_num_seg2_blocks = 0; + lc->counter_encode_maxq_scene_change = 0; CHECK_MEM_ERROR(cm, lc->map, vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map))); memset(lc->map, 0, mi_rows * mi_cols); @@ -154,6 +192,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, int sl, tl, layer = 0, spatial_layer_target; float bitrate_alloc = 1.0; + cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode; + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { @@ -290,6 +330,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) { LAYER_CONTEXT *const lc = get_layer_context(cpi); const int old_frame_since_key = cpi->rc.frames_since_key; const int old_frame_to_key = cpi->rc.frames_to_key; + const int old_ext_use_post_encode_drop = cpi->rc.ext_use_post_encode_drop; cpi->rc = lc->rc; cpi->twopass = lc->twopass; @@ -303,26 +344,23 @@ void 
vp9_restore_layer_context(VP9_COMP *const cpi) { // Reset the frames_since_key and frames_to_key counters to their values // before the layer restore. Keep these defined for the stream (not layer). if (cpi->svc.number_temporal_layers > 1 || - (cpi->svc.number_spatial_layers > 1 && !is_two_pass_svc(cpi))) { + cpi->svc.number_spatial_layers > 1) { cpi->rc.frames_since_key = old_frame_since_key; cpi->rc.frames_to_key = old_frame_to_key; } - + cpi->rc.ext_use_post_encode_drop = old_ext_use_post_encode_drop; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->svc.number_spatial_layers > 1 && cpi->svc.temporal_layer_id == 0) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; - signed char *temp = cr->map; - uint8_t *temp2 = cr->last_coded_q_map; - uint8_t *temp3 = cpi->consec_zero_mv; - cr->map = lc->map; - lc->map = temp; - cr->last_coded_q_map = lc->last_coded_q_map; - lc->last_coded_q_map = temp2; - cpi->consec_zero_mv = lc->consec_zero_mv; - lc->consec_zero_mv = temp3; + swap_ptr(&cr->map, &lc->map); + swap_ptr(&cr->last_coded_q_map, &lc->last_coded_q_map); + swap_ptr(&cpi->consec_zero_mv, &lc->consec_zero_mv); cr->sb_index = lc->sb_index; + cr->actual_num_seg1_blocks = lc->actual_num_seg1_blocks; + cr->actual_num_seg2_blocks = lc->actual_num_seg2_blocks; + cr->counter_encode_maxq_scene_change = lc->counter_encode_maxq_scene_change; } } @@ -350,6 +388,9 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->consec_zero_mv = cpi->consec_zero_mv; cpi->consec_zero_mv = temp3; lc->sb_index = cr->sb_index; + lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; + lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; + lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; } } @@ -381,15 +422,6 @@ void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { ++cpi->svc.current_superframe; } -int vp9_is_upper_layer_key_frame(const VP9_COMP *const 
cpi) { - return is_two_pass_svc(cpi) && cpi->svc.spatial_layer_id > 0 && - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id] - .is_key_frame; -} - void get_layer_resolution(const int width_org, const int height_org, const int num, const int den, int *width_out, int *height_out) { @@ -408,6 +440,40 @@ void get_layer_resolution(const int width_org, const int height_org, *height_out = h; } +static void reset_fb_idx_unused(VP9_COMP *const cpi) { + // If a reference frame is not referenced or refreshed, then set the + // fb_idx for that reference to the first one used/referenced. + // This is to avoid setting fb_idx for a reference to a slot that is not + // used/needed (i.e., since that reference is not referenced or refreshed). + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME first_ref = 0; + int first_fb_idx = 0; + int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + first_ref = ref_frame; + first_fb_idx = fb_idx[ref_frame - 1]; + break; + } + } + if (first_ref > 0) { + if (first_ref != LAST_FRAME && + !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) && + !cpi->ext_refresh_last_frame) + cpi->lst_fb_idx = first_fb_idx; + else if (first_ref != GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + !cpi->ext_refresh_golden_frame) + cpi->gld_fb_idx = first_fb_idx; + else if (first_ref != ALTREF_FRAME && + !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) && + !cpi->ext_refresh_alt_ref_frame) + cpi->alt_fb_idx = first_fb_idx; + } +} + // The function sets proper ref_frame_flags, buffer indices, and buffer update // variables for temporal layering mode 3 - that does 0-2-1-2 temporal layering // scheme. 
@@ -511,6 +577,8 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -546,6 +614,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { if (!spatial_id) { cpi->ref_frame_flags = VP9_LAST_FLAG; } else { + if (spatial_id == cpi->svc.number_spatial_layers - 1) + cpi->ext_refresh_alt_ref_frame = 0; cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG; } } @@ -568,6 +638,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -600,54 +672,161 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering( } else { cpi->gld_fb_idx = 0; } + + reset_fb_idx_unused(cpi); +} + +static void set_flags_and_fb_idx_bypass_via_set_ref_frame_config( + VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + int sl = svc->spatial_layer_id = svc->spatial_layer_to_encode; + cpi->svc.temporal_layer_id = cpi->svc.temporal_layer_id_per_spatial[sl]; + cpi->ext_refresh_frame_flags_pending = 1; + cpi->lst_fb_idx = svc->lst_fb_idx[sl]; + cpi->gld_fb_idx = svc->gld_fb_idx[sl]; + cpi->alt_fb_idx = svc->alt_fb_idx[sl]; + cpi->ext_refresh_last_frame = 0; + cpi->ext_refresh_golden_frame = 0; + cpi->ext_refresh_alt_ref_frame = 0; + cpi->ref_frame_flags = 0; + if (svc->reference_last[sl]) cpi->ref_frame_flags |= VP9_LAST_FLAG; + if (svc->reference_golden[sl]) cpi->ref_frame_flags |= VP9_GOLD_FLAG; + if (svc->reference_altref[sl]) cpi->ref_frame_flags |= VP9_ALT_FLAG; +} + +void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { + SVC *const 
svc = &cpi->svc; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int sl = svc->spatial_layer_id; + svc->lst_fb_idx[sl] = cpi->lst_fb_idx; + svc->gld_fb_idx[sl] = cpi->gld_fb_idx; + svc->alt_fb_idx[sl] = cpi->alt_fb_idx; + // For the fixed SVC mode: pass the refresh_lst/gld/alt_frame flags to the + // update_buffer_slot, this is needed for the GET_SVC_REF_FRAME_CONFIG api. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + int ref; + for (ref = 0; ref < REF_FRAMES; ++ref) { + svc->update_buffer_slot[sl] &= ~(1 << ref); + if ((ref == svc->lst_fb_idx[sl] && cpi->refresh_last_frame) || + (ref == svc->gld_fb_idx[sl] && cpi->refresh_golden_frame) || + (ref == svc->alt_fb_idx[sl] && cpi->refresh_alt_ref_frame)) + svc->update_buffer_slot[sl] |= (1 << ref); + } + } + // TODO(jianj): Remove these 3, deprecated. + svc->update_last[sl] = (uint8_t)cpi->refresh_last_frame; + svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; + svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; + + svc->reference_last[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]); + svc->reference_golden[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]); + svc->reference_altref[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); } int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { int width = 0, height = 0; + SVC *const svc = &cpi->svc; LAYER_CONTEXT *lc = NULL; - if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1; - cpi->svc.force_zero_mode_spatial_ref = 1; - cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride; + svc->skip_enhancement_layer = 0; + if (svc->number_spatial_layers > 1) { + svc->use_base_mv = 1; + svc->use_partition_reuse = 1; + } + svc->force_zero_mode_spatial_ref = 1; + svc->mi_stride[svc->spatial_layer_id] = cpi->common.mi_stride; + svc->mi_rows[svc->spatial_layer_id] = cpi->common.mi_rows; + 
svc->mi_cols[svc->spatial_layer_id] = cpi->common.mi_cols; - if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { set_flags_and_fb_idx_for_temporal_mode3(cpi); - } else if (cpi->svc.temporal_layering_mode == + } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) { set_flags_and_fb_idx_for_temporal_mode_noLayering(cpi); - } else if (cpi->svc.temporal_layering_mode == - VP9E_TEMPORAL_LAYERING_MODE_0101) { + } else if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0101) { set_flags_and_fb_idx_for_temporal_mode2(cpi); - } else if (cpi->svc.temporal_layering_mode == - VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { - // In the BYPASS/flexible mode, the encoder is relying on the application - // to specify, for each spatial layer, the flags and buffer indices for the - // layering. - // Note that the check (cpi->ext_refresh_frame_flags_pending == 0) is - // needed to support the case where the frame flags may be passed in via - // vpx_codec_encode(), which can be used for the temporal-only svc case. - // TODO(marpan): Consider adding an enc_config parameter to better handle - // this case. 
- if (cpi->ext_refresh_frame_flags_pending == 0) { - int sl; - cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; - sl = cpi->svc.spatial_layer_id; - vp9_apply_encoding_flags(cpi, cpi->svc.ext_frame_flags[sl]); - cpi->lst_fb_idx = cpi->svc.ext_lst_fb_idx[sl]; - cpi->gld_fb_idx = cpi->svc.ext_gld_fb_idx[sl]; - cpi->alt_fb_idx = cpi->svc.ext_alt_fb_idx[sl]; - } - } - - if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode) - cpi->svc.rc_drop_superframe = 0; - - lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id]; + } else if (svc->temporal_layering_mode == + VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + set_flags_and_fb_idx_bypass_via_set_ref_frame_config(cpi); + } + + if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[0].idx || + cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[0].idx || + cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[0].idx) + svc->buffer_gf_temporal_ref[0].is_used = 1; + if (cpi->lst_fb_idx == svc->buffer_gf_temporal_ref[1].idx || + cpi->gld_fb_idx == svc->buffer_gf_temporal_ref[1].idx || + cpi->alt_fb_idx == svc->buffer_gf_temporal_ref[1].idx) + svc->buffer_gf_temporal_ref[1].is_used = 1; + + // For the fixed (non-flexible/bypass) SVC mode: + // If long term temporal reference is enabled at the sequence level + // (use_gf_temporal_ref == 1), and inter_layer is disabled (on inter-frames), + // we can use golden as a second temporal reference + // (since the spatial/inter-layer reference is disabled). + // We check that the fb_idx for this reference (buffer_gf_temporal_ref.idx) is + // unused (slot 7 and 6 should be available for 3-3 layer system). + // For now usage of this second temporal reference will only be used for + // highest and next to highest spatial layer (i.e., top and middle layer for + // 3 spatial layers). 
+ svc->use_gf_temporal_ref_current_layer = 0; + if (svc->use_gf_temporal_ref && !svc->buffer_gf_temporal_ref[0].is_used && + !svc->buffer_gf_temporal_ref[1].is_used && + svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred != INTER_LAYER_PRED_ON && + svc->number_spatial_layers <= 3 && svc->number_temporal_layers <= 3 && + svc->spatial_layer_id >= svc->number_spatial_layers - 2) { + // Enable the second (long-term) temporal reference at the frame-level. + svc->use_gf_temporal_ref_current_layer = 1; + } + + // Check if current superframe has any layer sync, only check once on + // base layer. + if (svc->spatial_layer_id == 0) { + int sl = 0; + // Default is no sync. + svc->superframe_has_layer_sync = 0; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + if (cpi->svc.spatial_layer_sync[sl]) svc->superframe_has_layer_sync = 1; + } + } + + // Reset the drop flags for all spatial layers, on the base layer. + if (svc->spatial_layer_id == 0) { + vp9_zero(svc->drop_spatial_layer); + // TODO(jianj/marpan): Investigate why setting svc->lst/gld/alt_fb_idx + // causes an issue with frame dropping and temporal layers, when the frame + // flags are passed via the encode call (bypass mode). Issue is that we're + // resetting ext_refresh_frame_flags_pending to 0 on frame drops. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + memset(&svc->lst_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->gld_fb_idx, -1, sizeof(svc->lst_fb_idx)); + memset(&svc->alt_fb_idx, -1, sizeof(svc->lst_fb_idx)); + // These are set by API before the superframe is encoded and they are + // passed to encoder layer by layer. Don't reset them on layer 0 in bypass + // mode. + vp9_zero(svc->update_buffer_slot); + vp9_zero(svc->reference_last); + vp9_zero(svc->reference_golden); + vp9_zero(svc->reference_altref); + // TODO(jianj): Remove these 3, deprecated. 
+ vp9_zero(svc->update_last); + vp9_zero(svc->update_golden); + vp9_zero(svc->update_altref); + } + } + + lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id]; // Setting the worst/best_quality via the encoder control: SET_SVC_PARAMETERS, // only for non-BYPASS mode for now. - if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS || + svc->use_set_ref_frame_config) { RATE_CONTROL *const lrc = &lc->rc; lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q); lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q); @@ -657,157 +836,76 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); - // For resolutions <= VGA: set phase of the filter = 8 (for symmetric - // averaging filter), use bilinear for now. - if (width * height <= 640 * 480) { - cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR; - cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8; - } - - // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use - // of base motion vectors if spatial scale factors for any layers are not 2, + // Use Eightap_smooth for low resolutions. + if (width * height <= 320 * 240) + svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH; + // For scale factors > 0.75, set the phase to 0 (aligns decimated pixel + // to source pixel). + lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + + svc->temporal_layer_id]; + if (lc->scaling_factor_num > (3 * lc->scaling_factor_den) >> 2) + svc->downsample_filter_phase[svc->spatial_layer_id] = 0; + + // The usage of use_base_mv or partition_reuse assumes down-scale of 2x2. 
+ // For now, turn off use of base motion vectors and partition reuse if the + // spatial scale factors for any layers are not 2, // keep the case of 3 spatial layers with scale factor of 4x4 for base layer. // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2. - if (cpi->svc.number_spatial_layers > 1) { + if (svc->number_spatial_layers > 1) { int sl; - for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) { - lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id]; + for (sl = 0; sl < svc->number_spatial_layers - 1; ++sl) { + lc = &svc->layer_context[sl * svc->number_temporal_layers + + svc->temporal_layer_id]; if ((lc->scaling_factor_num != lc->scaling_factor_den >> 1) && !(lc->scaling_factor_num == lc->scaling_factor_den >> 2 && sl == 0 && - cpi->svc.number_spatial_layers == 3)) { - cpi->svc.use_base_mv = 0; + svc->number_spatial_layers == 3)) { + svc->use_base_mv = 0; + svc->use_partition_reuse = 0; break; } } + // For non-zero spatial layers: if the previous spatial layer was dropped + // disable the base_mv and partition_reuse features. + if (svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) { + svc->use_base_mv = 0; + svc->use_partition_reuse = 0; + } } - cpi->svc.non_reference_frame = 0; + svc->non_reference_frame = 0; if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame && - !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) { - cpi->svc.non_reference_frame = 1; + !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) + svc->non_reference_frame = 1; + // For non-flexible mode, where update_buffer_slot is used, need to check if + // all buffer slots are not refreshed. 
+ if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + if (svc->update_buffer_slot[svc->spatial_layer_id] != 0) + svc->non_reference_frame = 0; } - if (vp9_set_size_literal(cpi, width, height) != 0) - return VPX_CODEC_INVALID_PARAM; - - return 0; -} - -#if CONFIG_SPATIAL_SVC -#define SMALL_FRAME_FB_IDX 7 - -int vp9_svc_start_frame(VP9_COMP *const cpi) { - int width = 0, height = 0; - LAYER_CONTEXT *lc; - struct lookahead_entry *buf; - int count = 1 << (cpi->svc.number_temporal_layers - 1); - - cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; - lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id]; - - cpi->svc.temporal_layer_id = 0; - while ((lc->current_video_frame_in_layer % count) != 0) { - ++cpi->svc.temporal_layer_id; - count >>= 1; - } - - cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG; - - cpi->lst_fb_idx = cpi->svc.spatial_layer_id; - - if (cpi->svc.spatial_layer_id == 0) - cpi->gld_fb_idx = - (lc->gold_ref_idx >= 0) ? lc->gold_ref_idx : cpi->lst_fb_idx; - else - cpi->gld_fb_idx = cpi->svc.spatial_layer_id - 1; - - if (lc->current_video_frame_in_layer == 0) { - if (cpi->svc.spatial_layer_id >= 2) { - cpi->alt_fb_idx = cpi->svc.spatial_layer_id - 2; - } else { - cpi->alt_fb_idx = cpi->lst_fb_idx; - cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_ALT_FLAG); - } - } else { - if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id]) { - cpi->alt_fb_idx = lc->alt_ref_idx; - if (!lc->has_alt_frame) cpi->ref_frame_flags &= (~VP9_ALT_FLAG); - } else { - // Find a proper alt_fb_idx for layers that don't have alt ref frame - if (cpi->svc.spatial_layer_id == 0) { - cpi->alt_fb_idx = cpi->lst_fb_idx; - } else { - LAYER_CONTEXT *lc_lower = - &cpi->svc.layer_context[cpi->svc.spatial_layer_id - 1]; - - if (cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id - 1] && - lc_lower->alt_ref_source != NULL) - cpi->alt_fb_idx = lc_lower->alt_ref_idx; - else if (cpi->svc.spatial_layer_id >= 2) - cpi->alt_fb_idx = 
cpi->svc.spatial_layer_id - 2; - else - cpi->alt_fb_idx = cpi->lst_fb_idx; - } - } + if (svc->spatial_layer_id == 0) { + svc->high_source_sad_superframe = 0; + svc->high_num_blocks_with_motion = 0; } - get_layer_resolution(cpi->oxcf.width, cpi->oxcf.height, - lc->scaling_factor_num, lc->scaling_factor_den, &width, - &height); - - // Workaround for multiple frame contexts. In some frames we can't use prev_mi - // since its previous frame could be changed during decoding time. The idea is - // we put a empty invisible frame in front of them, then we will not use - // prev_mi when encoding these frames. - - buf = vp9_lookahead_peek(cpi->lookahead, 0); - if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2 && - cpi->svc.encode_empty_frame_state == NEED_TO_ENCODE && - lc->rc.frames_to_key != 0 && - !(buf != NULL && (buf->flags & VPX_EFLAG_FORCE_KF))) { - if ((cpi->svc.number_temporal_layers > 1 && - cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1) || - (cpi->svc.number_spatial_layers > 1 && - cpi->svc.spatial_layer_id == 0)) { - struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, 0); - - if (buf != NULL) { - cpi->svc.empty_frame.ts_start = buf->ts_start; - cpi->svc.empty_frame.ts_end = buf->ts_end; - cpi->svc.encode_empty_frame_state = ENCODING; - cpi->common.show_frame = 0; - cpi->ref_frame_flags = 0; - cpi->common.frame_type = INTER_FRAME; - cpi->lst_fb_idx = cpi->gld_fb_idx = cpi->alt_fb_idx = - SMALL_FRAME_FB_IDX; - - if (cpi->svc.encode_intra_empty_frame != 0) cpi->common.intra_only = 1; - - width = SMALL_FRAME_WIDTH; - height = SMALL_FRAME_HEIGHT; - } - } + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->last_layer_dropped[svc->spatial_layer_id] && + svc->fb_idx_upd_tl0[svc->spatial_layer_id] != -1 && + !svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // For fixed/non-flexible mode, if the previous frame (same spatial layer + // from previous superframe) was dropped, make sure the 
lst_fb_idx + // for this frame corresponds to the buffer index updated on (last) encoded + // TL0 frame (with same spatial layer). + cpi->lst_fb_idx = svc->fb_idx_upd_tl0[svc->spatial_layer_id]; } - cpi->oxcf.worst_allowed_q = vp9_quantizer_to_qindex(lc->max_q); - cpi->oxcf.best_allowed_q = vp9_quantizer_to_qindex(lc->min_q); - - vp9_change_config(cpi, &cpi->oxcf); - if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; - vp9_set_high_precision_mv(cpi, 1); - - cpi->alt_ref_source = get_layer_context(cpi)->alt_ref_source; - return 0; } -#undef SMALL_FRAME_FB_IDX -#endif // CONFIG_SPATIAL_SVC - struct lookahead_entry *vp9_svc_lookahead_pop(VP9_COMP *const cpi, struct lookahead_ctx *ctx, int drain) { @@ -840,7 +938,7 @@ void vp9_free_svc_cyclic_refresh(VP9_COMP *const cpi) { } // Reset on key frame: reset counters, references and buffer updates. -void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { +void vp9_svc_reset_temporal_layers(VP9_COMP *const cpi, int is_key) { int sl, tl; SVC *const svc = &cpi->svc; LAYER_CONTEXT *lc = NULL; @@ -848,7 +946,7 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { for (tl = 0; tl < svc->number_temporal_layers; ++tl) { lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl]; lc->current_video_frame_in_layer = 0; - lc->frames_from_key_frame = 0; + if (is_key) lc->frames_from_key_frame = 0; } } if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { @@ -887,3 +985,245 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { } } } + +void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + // Check for disabling inter-layer (spatial) prediction, if + // svc.disable_inter_layer_pred is set. If the previous spatial layer was + // dropped then disable the prediction from this (scaled) reference. 
+ // For INTER_LAYER_PRED_OFF_NONKEY: inter-layer prediction is disabled + // on key frames or if any spatial layer is a sync layer. + if ((svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY && + !svc->layer_context[svc->temporal_layer_id].is_key_frame && + !svc->superframe_has_layer_sync) || + svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF || + svc->drop_spatial_layer[svc->spatial_layer_id - 1]) { + MV_REFERENCE_FRAME ref_frame; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + const struct scale_factors *const scale_fac = + &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) + cpi->ref_frame_flags &= (~flag_list[ref_frame]); + } + } + } + // For fixed/non-flexible SVC: check for disabling inter-layer prediction. + // If the reference for inter-layer prediction (the reference that is scaled) + // is not the previous spatial layer from the same superframe, then we disable + // inter-layer prediction. Only need to check when inter_layer prediction is + // not set to OFF mode. + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred != INTER_LAYER_PRED_OFF) { + // We only use LAST and GOLDEN for prediction in real-time mode, so we + // check both here. + MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ref_frame++) { + struct scale_factors *scale_fac = &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) { + // If this reference was updated on the previous spatial layer of the + // current superframe, then we keep this reference (don't disable). + // Otherwise we disable the inter-layer prediction. 
+ // This condition is verified by checking if the current frame buffer + // index is equal to any of the slots for the previous spatial layer, + // and if so, check if that slot was updated/refreshed. If that is the + // case, then this reference is valid for inter-layer prediction under + // the mode INTER_LAYER_PRED_ON_CONSTRAINED. + int fb_idx = + ref_frame == LAST_FRAME ? cpi->lst_fb_idx : cpi->gld_fb_idx; + int ref_flag = ref_frame == LAST_FRAME ? VP9_LAST_FLAG : VP9_GOLD_FLAG; + int sl = svc->spatial_layer_id; + int disable = 1; + if (fb_idx < 0) continue; + if ((fb_idx == svc->lst_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) || + (fb_idx == svc->gld_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx))) || + (fb_idx == svc->alt_fb_idx[sl - 1] && + (svc->update_buffer_slot[sl - 1] & (1 << fb_idx)))) + disable = 0; + if (disable) cpi->ref_frame_flags &= (~ref_flag); + } + } + } +} + +void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // For fixed/non-flexible mode, the following constraint are expected, + // when inter-layer prediciton is on (default). + if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON && + svc->framedrop_mode != LAYER_DROP) { + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // On non-key frames: LAST is always temporal reference, GOLDEN is + // spatial reference. + if (svc->temporal_layer_id == 0) + // Base temporal only predicts from base temporal. + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == 0); + else + // Non-base temporal only predicts from lower temporal layer. 
+ assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] < + svc->temporal_layer_id); + if (svc->spatial_layer_id > 0 && cpi->ref_frame_flags & VP9_GOLD_FLAG && + svc->spatial_layer_id > svc->first_spatial_layer_to_encode) { + // Non-base spatial only predicts from lower spatial layer with same + // temporal_id. + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } else if (svc->spatial_layer_id > 0 && + svc->spatial_layer_id > svc->first_spatial_layer_to_encode) { + // Only 1 reference for frame whose base is key; reference may be LAST + // or GOLDEN, so we check both. + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] == + svc->temporal_layer_id); + } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + assert(svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] == + svc->spatial_layer_id - 1); + assert(svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] == + svc->temporal_layer_id); + } + } + } else if (svc->use_gf_temporal_ref_current_layer && + !svc->layer_context[svc->temporal_layer_id].is_key_frame) { + // For the usage of golden as second long term reference: the + // temporal_layer_id of that reference must be base temporal layer 0, and + // spatial_layer_id of that reference must be same as current + // spatial_layer_id. If not, disable feature. + // TODO(marpan): Investigate when this can happen, and maybe put this check + // and reset in a different place. 
+ if (svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] != + svc->spatial_layer_id || + svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] != 0) + svc->use_gf_temporal_ref_current_layer = 0; + } +} + +#if CONFIG_VP9_TEMPORAL_DENOISING +int vp9_denoise_svc_non_key(VP9_COMP *const cpi) { + int layer = + LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + return denoise_svc(cpi) && !lc->is_key_frame; +} +#endif + +void vp9_svc_check_spatial_layer_sync(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // Only for superframes whose base is not key, as those are + // already sync frames. + if (!svc->layer_context[svc->temporal_layer_id].is_key_frame) { + if (svc->spatial_layer_id == 0) { + // On base spatial layer: if the current superframe has a layer sync then + // reset the pattern counters and reset to base temporal layer. + if (svc->superframe_has_layer_sync) + vp9_svc_reset_temporal_layers(cpi, cpi->common.frame_type == KEY_FRAME); + } + // If the layer sync is set for this current spatial layer then + // disable the temporal reference. + if (svc->spatial_layer_id > 0 && + svc->spatial_layer_sync[svc->spatial_layer_id]) { + cpi->ref_frame_flags &= (~VP9_LAST_FLAG); + if (svc->use_gf_temporal_ref_current_layer) { + int index = svc->spatial_layer_id; + // If golden is used as second reference: need to remove it from + // prediction, reset refresh period to 0, and update the reference. + svc->use_gf_temporal_ref_current_layer = 0; + cpi->rc.baseline_gf_interval = 0; + cpi->rc.frames_till_gf_update_due = 0; + // On layer sync frame we must update the buffer index used for long + // term reference. Use the alt_ref since it is not used or updated on + // sync frames. 
+ if (svc->number_spatial_layers == 3) index = svc->spatial_layer_id - 1; + assert(index >= 0); + cpi->alt_fb_idx = svc->buffer_gf_temporal_ref[index].idx; + cpi->ext_refresh_alt_ref_frame = 1; + } + } + } +} + +void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + // Update the usage of frame buffer index for base spatial layers. + if (svc->spatial_layer_id == 0) { + if ((cpi->ref_frame_flags & VP9_LAST_FLAG) || cpi->refresh_last_frame) + svc->fb_idx_base[cpi->lst_fb_idx] = 1; + if ((cpi->ref_frame_flags & VP9_GOLD_FLAG) || cpi->refresh_golden_frame) + svc->fb_idx_base[cpi->gld_fb_idx] = 1; + if ((cpi->ref_frame_flags & VP9_ALT_FLAG) || cpi->refresh_alt_ref_frame) + svc->fb_idx_base[cpi->alt_fb_idx] = 1; + } +} + +static void vp9_svc_update_ref_frame_bypass_mode(VP9_COMP *const cpi) { + // For non-flexible/bypass SVC mode: check for refreshing other buffer + // slots. + SVC *const svc = &cpi->svc; + VP9_COMMON *const cm = &cpi->common; + BufferPool *const pool = cm->buffer_pool; + int i; + for (i = 0; i < REF_FRAMES; i++) { + if (cm->frame_type == KEY_FRAME || + svc->update_buffer_slot[svc->spatial_layer_id] & (1 << i)) { + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); + svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id; + } + } +} + +void vp9_svc_update_ref_frame(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + BufferPool *const pool = cm->buffer_pool; + + if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + svc->use_set_ref_frame_config) { + vp9_svc_update_ref_frame_bypass_mode(cpi); + } else if (cm->frame_type == KEY_FRAME) { + // Keep track of frame index for each reference frame. + int i; + // On key frame update all reference frame slots. 
+ for (i = 0; i < REF_FRAMES; i++) { + svc->fb_idx_spatial_layer_id[i] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[i] = svc->temporal_layer_id; + // LAST/GOLDEN/ALTREF is already updated above. + if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx && i != cpi->alt_fb_idx) + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); + } + } else { + if (cpi->refresh_last_frame) { + svc->fb_idx_spatial_layer_id[cpi->lst_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->lst_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_golden_frame) { + svc->fb_idx_spatial_layer_id[cpi->gld_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->gld_fb_idx] = svc->temporal_layer_id; + } + if (cpi->refresh_alt_ref_frame) { + svc->fb_idx_spatial_layer_id[cpi->alt_fb_idx] = svc->spatial_layer_id; + svc->fb_idx_temporal_layer_id[cpi->alt_fb_idx] = svc->temporal_layer_id; + } + } + // Copy flags from encoder to SVC struct. + vp9_copy_flags_ref_update_idx(cpi); + vp9_svc_update_ref_frame_buffer_idx(cpi); +} + +void vp9_svc_adjust_frame_rate(VP9_COMP *const cpi) { + int64_t this_duration = + cpi->svc.timebase_fac * cpi->svc.duration[cpi->svc.spatial_layer_id]; + vp9_new_framerate(cpi, 10000000.0 / this_duration); +} diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libvpx/vp9/encoder/vp9_svc_layercontext.h index b7cdfd962..c25644617 100644 --- a/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ -#define VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ +#ifndef VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ +#define VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ #include "vpx/vpx_encoder.h" @@ -19,6 +19,24 @@ extern "C" { #endif +typedef enum { + // Inter-layer prediction is on on all frames. + INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. 
+ INTER_LAYER_PRED_OFF, + // Inter-layer prediction is off on non-key frames and non-sync frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. + INTER_LAYER_PRED_ON_CONSTRAINED +} INTER_LAYER_PRED; + +typedef struct BUFFER_LONGTERM_REF { + int idx; + int is_used; +} BUFFER_LONGTERM_REF; + typedef struct { RATE_CONTROL rc; int target_bandwidth; @@ -42,10 +60,14 @@ typedef struct { size_t layer_size; struct vpx_psnr_pkt psnr_pkt; // Cyclic refresh parameters (aq-mode=3), that need to be updated per-frame. + // TODO(jianj/marpan): Is it better to use the full cyclic refresh struct. int sb_index; signed char *map; uint8_t *last_coded_q_map; uint8_t *consec_zero_mv; + int actual_num_seg1_blocks; + int actual_num_seg2_blocks; + int counter_encode_maxq_scene_change; uint8_t speed; } LAYER_CONTEXT; @@ -56,8 +78,6 @@ typedef struct SVC { int number_temporal_layers; int spatial_layer_to_encode; - int first_spatial_layer_to_encode; - int rc_drop_superframe; // Workaround for multiple frame contexts enum { ENCODED = 0, ENCODING, NEED_TO_ENCODE } encode_empty_frame_state; @@ -81,14 +101,20 @@ typedef struct SVC { // Frame flags and buffer indexes for each spatial layer, set by the // application (external settings). int ext_frame_flags[VPX_MAX_LAYERS]; - int ext_lst_fb_idx[VPX_MAX_LAYERS]; - int ext_gld_fb_idx[VPX_MAX_LAYERS]; - int ext_alt_fb_idx[VPX_MAX_LAYERS]; - int ref_frame_index[REF_FRAMES]; + int lst_fb_idx[VPX_MAX_LAYERS]; + int gld_fb_idx[VPX_MAX_LAYERS]; + int alt_fb_idx[VPX_MAX_LAYERS]; int force_zero_mode_spatial_ref; + // Sequence level flag to enable second (long term) temporal reference. + int use_gf_temporal_ref; + // Frame level flag to enable second (long term) temporal reference. + int use_gf_temporal_ref_current_layer; + // Allow second reference for at most 2 top highest resolution layers. 
+ BUFFER_LONGTERM_REF buffer_gf_temporal_ref[2]; int current_superframe; int non_reference_frame; int use_base_mv; + int use_partition_reuse; // Used to control the downscaling filter for source scaling, for 1 pass CBR. // downsample_filter_phase: = 0 will do sub-sampling (no weighted average), // = 8 will center the target pixel and get a symmetric averaging filter. @@ -99,8 +125,70 @@ typedef struct SVC { BLOCK_SIZE *prev_partition_svc; int mi_stride[VPX_MAX_LAYERS]; + int mi_rows[VPX_MAX_LAYERS]; + int mi_cols[VPX_MAX_LAYERS]; int first_layer_denoise; + + int skip_enhancement_layer; + + int lower_layer_qindex; + + int last_layer_dropped[VPX_MAX_LAYERS]; + int drop_spatial_layer[VPX_MAX_LAYERS]; + int framedrop_thresh[VPX_MAX_LAYERS]; + int drop_count[VPX_MAX_LAYERS]; + int max_consec_drop; + SVC_LAYER_DROP_MODE framedrop_mode; + + INTER_LAYER_PRED disable_inter_layer_pred; + + // Flag to indicate scene change and high num of motion blocks at current + // superframe, scene detection is currently checked for each superframe prior + // to encoding, on the full resolution source. + int high_source_sad_superframe; + int high_num_blocks_with_motion; + + // Flags used to get SVC pattern info. + int update_buffer_slot[VPX_SS_MAX_LAYERS]; + uint8_t reference_last[VPX_SS_MAX_LAYERS]; + uint8_t reference_golden[VPX_SS_MAX_LAYERS]; + uint8_t reference_altref[VPX_SS_MAX_LAYERS]; + // TODO(jianj): Remove these last 3, deprecated. + uint8_t update_last[VPX_SS_MAX_LAYERS]; + uint8_t update_golden[VPX_SS_MAX_LAYERS]; + uint8_t update_altref[VPX_SS_MAX_LAYERS]; + + // Keep track of the frame buffer index updated/refreshed on the base + // temporal superframe. + int fb_idx_upd_tl0[VPX_SS_MAX_LAYERS]; + + // Keep track of the spatial and temporal layer id of the frame that last + // updated the frame buffer index. 
+ uint8_t fb_idx_spatial_layer_id[REF_FRAMES]; + uint8_t fb_idx_temporal_layer_id[REF_FRAMES]; + + int spatial_layer_sync[VPX_SS_MAX_LAYERS]; + uint8_t set_intra_only_frame; + uint8_t previous_frame_is_intra_only; + uint8_t superframe_has_layer_sync; + + uint8_t fb_idx_base[REF_FRAMES]; + + int use_set_ref_frame_config; + + int temporal_layer_id_per_spatial[VPX_SS_MAX_LAYERS]; + + int first_spatial_layer_to_encode; + + // Parameters for allowing framerate per spatial layer, and buffer + // update based on timestamps. + int64_t duration[VPX_SS_MAX_LAYERS]; + int64_t timebase_fac; + int64_t time_stamp_superframe; + int64_t time_stamp_prev[VPX_SS_MAX_LAYERS]; + + int num_encoded_top_layer; } SVC; struct VP9_COMP; @@ -148,16 +236,34 @@ struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi, // Start a frame and initialize svc parameters int vp9_svc_start_frame(struct VP9_COMP *const cpi); +#if CONFIG_VP9_TEMPORAL_DENOISING +int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi); +#endif + +void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi); + int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi); void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); -void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi); +void vp9_svc_reset_temporal_layers(struct VP9_COMP *const cpi, int is_key); void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); +void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi); + +void vp9_svc_assert_constraints_pattern(struct VP9_COMP *const cpi); + +void vp9_svc_check_spatial_layer_sync(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame_buffer_idx(struct VP9_COMP *const cpi); + +void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi); + +void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif -#endif // VP9_ENCODER_VP9_SVC_LAYERCONTEXT_ +#endif // VPX_VP9_ENCODER_VP9_SVC_LAYERCONTEXT_H_ diff --git 
a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c index 2758c42ae..cd340c394 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -38,53 +38,141 @@ static int fixed_divide[512]; static void temporal_filter_predictors_mb_c( MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr, int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col, - uint8_t *pred, struct scale_factors *scale, int x, int y) { + uint8_t *pred, struct scale_factors *scale, int x, int y, MV *blk_mvs, + int use_32x32) { const int which_mv = 0; - const MV mv = { mv_row, mv_col }; const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP_SHARP]; + int i, j, k = 0, ys = (BH >> 1), xs = (BW >> 1); enum mv_precision mv_precision_uv; int uv_stride; - if (uv_block_width == 8) { + if (uv_block_width == (BW >> 1)) { uv_stride = (stride + 1) >> 1; mv_precision_uv = MV_PRECISION_Q4; } else { uv_stride = stride; mv_precision_uv = MV_PRECISION_Q3; } +#if !CONFIG_VP9_HIGHBITDEPTH + (void)xd; +#endif + if (use_32x32) { + const MV mv = { mv_row, mv_col }; #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), stride, - CONVERT_TO_SHORTPTR(&pred[0]), 16, &mv, - scale, 16, 16, which_mv, kernel, - MV_PRECISION_Q3, x, y, xd->bd); - - vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride, - CONVERT_TO_SHORTPTR(&pred[256]), - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, kernel, - mv_precision_uv, x, y, xd->bd); - - vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride, - CONVERT_TO_SHORTPTR(&pred[512]), - uv_block_width, &mv, scale, uv_block_width, - uv_block_height, which_mv, kernel, - mv_precision_uv, x, y, xd->bd); + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor(CONVERT_TO_SHORTPTR(y_mb_ptr), 
stride, + CONVERT_TO_SHORTPTR(&pred[0]), BW, &mv, + scale, BW, BH, which_mv, kernel, + MV_PRECISION_Q3, x, y, xd->bd); + + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(u_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[BLK_PELS]), uv_block_width, &mv, scale, + uv_block_width, uv_block_height, which_mv, kernel, mv_precision_uv, x, + y, xd->bd); + + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(v_mb_ptr), uv_stride, + CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1)]), uv_block_width, &mv, + scale, uv_block_width, uv_block_height, which_mv, kernel, + mv_precision_uv, x, y, xd->bd); + return; + } +#endif // CONFIG_VP9_HIGHBITDEPTH + vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], BW, &mv, scale, BW, + BH, which_mv, kernel, MV_PRECISION_Q3, x, y); + + vp9_build_inter_predictor(u_mb_ptr, uv_stride, &pred[BLK_PELS], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr, uv_stride, &pred[(BLK_PELS << 1)], + uv_block_width, &mv, scale, uv_block_width, + uv_block_height, which_mv, kernel, + mv_precision_uv, x, y); return; } + + // While use_32x32 = 0, construct the 32x32 predictor using 4 16x16 + // predictors. 
+ // Y predictor + for (i = 0; i < BH; i += ys) { + for (j = 0; j < BW; j += xs) { + const MV mv = blk_mvs[k]; + const int y_offset = i * stride + j; + const int p_offset = i * BW + j; + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(y_mb_ptr + y_offset), stride, + CONVERT_TO_SHORTPTR(&pred[p_offset]), BW, &mv, scale, xs, ys, + which_mv, kernel, MV_PRECISION_Q3, x, y, xd->bd); + } else { + vp9_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset], + BW, &mv, scale, xs, ys, which_mv, kernel, + MV_PRECISION_Q3, x, y); + } +#else + vp9_build_inter_predictor(y_mb_ptr + y_offset, stride, &pred[p_offset], + BW, &mv, scale, xs, ys, which_mv, kernel, + MV_PRECISION_Q3, x, y); #endif // CONFIG_VP9_HIGHBITDEPTH - (void)xd; - vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16, - which_mv, kernel, MV_PRECISION_Q3, x, y); + k++; + } + } + + // U and V predictors + ys = (uv_block_height >> 1); + xs = (uv_block_width >> 1); + k = 0; - vp9_build_inter_predictor(u_mb_ptr, uv_stride, &pred[256], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - which_mv, kernel, mv_precision_uv, x, y); + for (i = 0; i < uv_block_height; i += ys) { + for (j = 0; j < uv_block_width; j += xs) { + const MV mv = blk_mvs[k]; + const int uv_offset = i * uv_stride + j; + const int p_offset = i * uv_block_width + j; - vp9_build_inter_predictor(v_mb_ptr, uv_stride, &pred[512], uv_block_width, - &mv, scale, uv_block_width, uv_block_height, - which_mv, kernel, mv_precision_uv, x, y); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(u_mb_ptr + uv_offset), uv_stride, + CONVERT_TO_SHORTPTR(&pred[BLK_PELS + p_offset]), uv_block_width, + &mv, scale, xs, ys, which_mv, kernel, mv_precision_uv, x, y, + xd->bd); + + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(v_mb_ptr + 
uv_offset), uv_stride, + CONVERT_TO_SHORTPTR(&pred[(BLK_PELS << 1) + p_offset]), + uv_block_width, &mv, scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y, xd->bd); + } else { + vp9_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride, + &pred[BLK_PELS + p_offset], uv_block_width, + &mv, scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr + uv_offset, uv_stride, + &pred[(BLK_PELS << 1) + p_offset], + uv_block_width, &mv, scale, xs, ys, which_mv, + kernel, mv_precision_uv, x, y); + } +#else + vp9_build_inter_predictor(u_mb_ptr + uv_offset, uv_stride, + &pred[BLK_PELS + p_offset], uv_block_width, &mv, + scale, xs, ys, which_mv, kernel, + mv_precision_uv, x, y); + + vp9_build_inter_predictor(v_mb_ptr + uv_offset, uv_stride, + &pred[(BLK_PELS << 1) + p_offset], + uv_block_width, &mv, scale, xs, ys, which_mv, + kernel, mv_precision_uv, x, y); +#endif // CONFIG_VP9_HIGHBITDEPTH + k++; + } + } } void vp9_temporal_filter_init(void) { @@ -94,6 +182,186 @@ void vp9_temporal_filter_init(void) { for (i = 1; i < 512; ++i) fixed_divide[i] = 0x80000 / i; } +static INLINE int mod_index(int sum_dist, int index, int rounding, int strength, + int filter_weight) { + int mod = (sum_dist * 3) / index; + mod += rounding; + mod >>= strength; + + mod = VPXMIN(16, mod); + + mod = 16 - mod; + mod *= filter_weight; + + return mod; +} + +static INLINE int get_filter_weight(unsigned int i, unsigned int j, + unsigned int block_height, + unsigned int block_width, int *blk_fw, + int use_32x32) { + int filter_weight = 0; + + if (use_32x32) + // blk_fw[0] ~ blk_fw[3] are the same. 
+ return blk_fw[0]; + + if (i < block_height / 2) { + if (j < block_width / 2) + filter_weight = blk_fw[0]; + else + filter_weight = blk_fw[1]; + } else { + if (j < block_width / 2) + filter_weight = blk_fw[2]; + else + filter_weight = blk_fw[3]; + } + return filter_weight; +} + +static void apply_temporal_filter( + const uint8_t *y_frame1, int y_stride, const uint8_t *y_pred, + int y_buf_stride, const uint8_t *u_frame1, const uint8_t *v_frame1, + int uv_stride, const uint8_t *u_pred, const uint8_t *v_pred, + int uv_buf_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, int *blk_fw, int use_32x32, + uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, + uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count) { + unsigned int i, j, k, m; + int modifier; + const int rounding = (1 << strength) >> 1; + const unsigned int uv_block_width = block_width >> ss_x; + const unsigned int uv_block_height = block_height >> ss_y; + DECLARE_ALIGNED(16, uint16_t, y_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, u_diff_sse[BLK_PELS]); + DECLARE_ALIGNED(16, uint16_t, v_diff_sse[BLK_PELS]); + + int idx = 0, idy; + + assert(strength >= 0); + assert(strength <= 6); + + memset(y_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(u_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + memset(v_diff_sse, 0, BLK_PELS * sizeof(uint16_t)); + + // Calculate diff^2 for each pixel of the 16x16 block. + // TODO(yunqing): the following code needs to be optimized. 
+ for (i = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int16_t diff = + y_frame1[i * (int)y_stride + j] - y_pred[i * (int)block_width + j]; + y_diff_sse[idx++] = diff * diff; + } + } + idx = 0; + for (i = 0; i < uv_block_height; i++) { + for (j = 0; j < uv_block_width; j++) { + const int16_t diffu = + u_frame1[i * uv_stride + j] - u_pred[i * uv_buf_stride + j]; + const int16_t diffv = + v_frame1[i * uv_stride + j] - v_pred[i * uv_buf_stride + j]; + u_diff_sse[idx] = diffu * diffu; + v_diff_sse[idx] = diffv * diffv; + idx++; + } + } + + for (i = 0, k = 0, m = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int pixel_value = y_pred[i * y_buf_stride + j]; + int filter_weight = + get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); + + // non-local mean approach + int y_index = 0; + + const int uv_r = i >> ss_y; + const int uv_c = j >> ss_x; + modifier = 0; + + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + const int row = (int)i + idy; + const int col = (int)j + idx; + + if (row >= 0 && row < (int)block_height && col >= 0 && + col < (int)block_width) { + modifier += y_diff_sse[row * (int)block_width + col]; + ++y_index; + } + } + } + + assert(y_index > 0); + + modifier += u_diff_sse[uv_r * uv_block_width + uv_c]; + modifier += v_diff_sse[uv_r * uv_block_width + uv_c]; + + y_index += 2; + + modifier = + mod_index(modifier, y_index, rounding, strength, filter_weight); + + y_count[k] += modifier; + y_accumulator[k] += modifier * pixel_value; + + ++k; + + // Process chroma component + if (!(i & ss_y) && !(j & ss_x)) { + const int u_pixel_value = u_pred[uv_r * uv_buf_stride + uv_c]; + const int v_pixel_value = v_pred[uv_r * uv_buf_stride + uv_c]; + + // non-local mean approach + int cr_index = 0; + int u_mod = 0, v_mod = 0; + int y_diff = 0; + + for (idy = -1; idy <= 1; ++idy) { + for (idx = -1; idx <= 1; ++idx) { + const int row = uv_r + idy; + const int col = uv_c + idx; + 
+ if (row >= 0 && row < (int)uv_block_height && col >= 0 && + col < (int)uv_block_width) { + u_mod += u_diff_sse[row * uv_block_width + col]; + v_mod += v_diff_sse[row * uv_block_width + col]; + ++cr_index; + } + } + } + + assert(cr_index > 0); + + for (idy = 0; idy < 1 + ss_y; ++idy) { + for (idx = 0; idx < 1 + ss_x; ++idx) { + const int row = (uv_r << ss_y) + idy; + const int col = (uv_c << ss_x) + idx; + y_diff += y_diff_sse[row * (int)block_width + col]; + ++cr_index; + } + } + + u_mod += y_diff; + v_mod += y_diff; + + u_mod = mod_index(u_mod, cr_index, rounding, strength, filter_weight); + v_mod = mod_index(v_mod, cr_index, rounding, strength, filter_weight); + + u_count[m] += u_mod; + u_accumulator[m] += u_mod * u_pixel_value; + v_count[m] += v_mod; + v_accumulator[m] += v_mod * v_pixel_value; + + ++m; + } // Complete YUV pixel + } + } +} + +// TODO(any): This function is not used anymore. Should be removed. void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, @@ -103,7 +371,7 @@ void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, unsigned int i, j, k; int modifier; int byte = 0; - const int rounding = strength > 0 ? 1 << (strength - 1) : 0; + const int rounding = (1 << strength) >> 1; assert(strength >= 0); assert(strength <= 6); @@ -166,18 +434,31 @@ void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, void vp9_highbd_temporal_filter_apply_c( const uint8_t *frame1_8, unsigned int stride, const uint8_t *frame2_8, unsigned int block_width, unsigned int block_height, int strength, - int filter_weight, uint32_t *accumulator, uint16_t *count) { + int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count) { const uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8); const uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8); unsigned int i, j, k; int modifier; - int byte = 0; const int rounding = strength > 0 ? 
1 << (strength - 1) : 0; + int diff_sse[BLK_PELS] = { 0 }; + int this_idx = 0; + + for (i = 0; i < block_height; i++) { + for (j = 0; j < block_width; j++) { + const int diff = + frame1[i * (int)stride + j] - frame2[i * (int)block_width + j]; + diff_sse[this_idx++] = diff * diff; + } + } + + modifier = 0; for (i = 0, k = 0; i < block_height; i++) { for (j = 0; j < block_width; j++, k++) { - int pixel_value = *frame2; - int diff_sse[9] = { 0 }; + int pixel_value = frame2[i * (int)block_width + j]; + int filter_weight = + get_filter_weight(i, j, block_height, block_width, blk_fw, use_32x32); + int idx, idy, index = 0; for (idy = -1; idy <= 1; ++idy) { @@ -187,22 +468,16 @@ void vp9_highbd_temporal_filter_apply_c( if (row >= 0 && row < (int)block_height && col >= 0 && col < (int)block_width) { - int diff = frame1[byte + idy * (int)stride + idx] - - frame2[idy * (int)block_width + idx]; - diff_sse[index] = diff * diff; + modifier += diff_sse[row * (int)block_width + col]; ++index; } } } assert(index > 0); - modifier = 0; - for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx]; - modifier *= 3; modifier /= index; - ++frame2; modifier += rounding; modifier >>= strength; @@ -213,24 +488,19 @@ void vp9_highbd_temporal_filter_apply_c( count[k] += modifier; accumulator[k] += modifier * pixel_value; - - byte++; } - - byte += stride - block_width; } } #endif // CONFIG_VP9_HIGHBITDEPTH -static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, - ThreadData *td, - uint8_t *arf_frame_buf, - uint8_t *frame_ptr_buf, - int stride, MV *ref_mv) { +static uint32_t temporal_filter_find_matching_mb_c( + VP9_COMP *cpi, ThreadData *td, uint8_t *arf_frame_buf, + uint8_t *frame_ptr_buf, int stride, MV *ref_mv, MV *blk_mvs, + int *blk_bestsme) { MACROBLOCK *const x = &td->mb; MACROBLOCKD *const xd = &x->e_mbd; MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS search_method = HEX; + const SEARCH_METHODS search_method = MESH; int step_param; int sadpb = 
x->sadperbit16; uint32_t bestsme = UINT_MAX; @@ -245,6 +515,7 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, // Save input state struct buf_2d src = x->plane[0].src; struct buf_2d pre = xd->plane[0].pre[0]; + int i, j, k = 0; best_ref_mv1_full.col = best_ref_mv1.col >> 3; best_ref_mv1_full.row = best_ref_mv1.row >> 3; @@ -260,19 +531,52 @@ static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - vp9_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param, + vp9_full_pixel_search(cpi, x, TF_BLOCK, &best_ref_mv1_full, step_param, search_method, sadpb, cond_cost_list(cpi, cost_list), &best_ref_mv1, ref_mv, 0, 0); /* restore UMV window */ x->mv_limits = tmp_mv_limits; - // Ignore mv costing by sending NULL pointer instead of cost array + // find_fractional_mv_step parameters: best_ref_mv1 is for mv rate cost + // calculation. The start full mv and the search result are stored in + // ref_mv. bestsme = cpi->find_fractional_mv_step( x, ref_mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, - x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], 0, - mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL, NULL, - &distortion, &sse, NULL, 0, 0); + x->errorperbit, &cpi->fn_ptr[TF_BLOCK], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, BW, + BH, USE_8_TAPS_SHARP); + + // DO motion search on 4 16x16 sub_blocks. 
+ best_ref_mv1.row = ref_mv->row; + best_ref_mv1.col = ref_mv->col; + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + for (i = 0; i < BH; i += SUB_BH) { + for (j = 0; j < BW; j += SUB_BW) { + // Setup frame pointers + x->plane[0].src.buf = arf_frame_buf + i * stride + j; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = frame_ptr_buf + i * stride + j; + xd->plane[0].pre[0].stride = stride; + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + vp9_full_pixel_search(cpi, x, TF_SUB_BLOCK, &best_ref_mv1_full, + step_param, search_method, sadpb, + cond_cost_list(cpi, cost_list), &best_ref_mv1, + &blk_mvs[k], 0, 0); + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + blk_bestsme[k] = cpi->find_fractional_mv_step( + x, &blk_mvs[k], &best_ref_mv1, cpi->common.allow_high_precision_mv, + x->errorperbit, &cpi->fn_ptr[TF_SUB_BLOCK], 0, + mv_sf->subpel_search_level, cond_cost_list(cpi, cost_list), NULL, + NULL, &distortion, &sse, NULL, SUB_BW, SUB_BH, USE_8_TAPS_SHARP); + k++; + } + } // Restore input state x->plane[0].src = src; @@ -293,25 +597,24 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, int byte; int frame; int mb_col; - unsigned int filter_weight; - int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4; - int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4; - DECLARE_ALIGNED(16, uint32_t, accumulator[16 * 16 * 3]); - DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]); + int mb_cols = (frames[alt_ref_index]->y_crop_width + BW - 1) >> BW_LOG2; + int mb_rows = (frames[alt_ref_index]->y_crop_height + BH - 1) >> BH_LOG2; + DECLARE_ALIGNED(16, uint32_t, accumulator[BLK_PELS * 3]); + DECLARE_ALIGNED(16, uint16_t, count[BLK_PELS * 3]); MACROBLOCKD *mbd = &td->mb.e_mbd; YV12_BUFFER_CONFIG *f = frames[alt_ref_index]; uint8_t *dst1, *dst2; #if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, predictor16[16 * 16 * 3]); - DECLARE_ALIGNED(16, uint8_t, 
predictor8[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint16_t, predictor16[BLK_PELS * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[BLK_PELS * 3]); uint8_t *predictor; #else - DECLARE_ALIGNED(16, uint8_t, predictor[16 * 16 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor[BLK_PELS * 3]); #endif - const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y; - const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x; + const int mb_uv_height = BH >> mbd->plane[1].subsampling_y; + const int mb_uv_width = BW >> mbd->plane[1].subsampling_x; // Addition of the tile col level offsets - int mb_y_offset = mb_row * 16 * (f->y_stride) + 16 * mb_col_start; + int mb_y_offset = mb_row * BH * (f->y_stride) + BW * mb_col_start; int mb_uv_offset = mb_row * mb_uv_height * f->uv_stride + mb_uv_width * mb_col_start; @@ -334,21 +637,21 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // 8 - VP9_INTERP_EXTEND. // To keep the mv in play for both Y and UV planes the max that it // can be on a border is therefore 16 - (2*VP9_INTERP_EXTEND+1). 
- td->mb.mv_limits.row_min = -((mb_row * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.row_min = -((mb_row * BH) + (17 - 2 * VP9_INTERP_EXTEND)); td->mb.mv_limits.row_max = - ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + ((mb_rows - 1 - mb_row) * BH) + (17 - 2 * VP9_INTERP_EXTEND); for (mb_col = mb_col_start; mb_col < mb_col_end; mb_col++) { int i, j, k; int stride; MV ref_mv; - vp9_zero_array(accumulator, 16 * 16 * 3); - vp9_zero_array(count, 16 * 16 * 3); + vp9_zero_array(accumulator, BLK_PELS * 3); + vp9_zero_array(count, BLK_PELS * 3); - td->mb.mv_limits.col_min = -((mb_col * 16) + (17 - 2 * VP9_INTERP_EXTEND)); + td->mb.mv_limits.col_min = -((mb_col * BW) + (17 - 2 * VP9_INTERP_EXTEND)); td->mb.mv_limits.col_max = - ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + ((mb_cols - 1 - mb_col) * BW) + (17 - 2 * VP9_INTERP_EXTEND); if (cpi->oxcf.content == VP9E_CONTENT_FILM) { unsigned int src_variance; @@ -360,92 +663,129 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { src_variance = - vp9_high_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16, mbd->bd); + vp9_high_get_sby_perpixel_variance(cpi, &src, TF_BLOCK, mbd->bd); } else { - src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK); } #else - src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, TF_BLOCK); #endif // CONFIG_VP9_HIGHBITDEPTH if (src_variance <= 2) strength = VPXMAX(0, (int)strength - 2); } for (frame = 0; frame < frame_count; frame++) { - const uint32_t thresh_low = 10000; - const uint32_t thresh_high = 20000; + // MVs for 4 16x16 sub blocks. + MV blk_mvs[4]; + // Filter weights for 4 16x16 sub blocks. 
+ int blk_fw[4] = { 0, 0, 0, 0 }; + int use_32x32 = 0; if (frames[frame] == NULL) continue; ref_mv.row = 0; ref_mv.col = 0; + blk_mvs[0] = kZeroMv; + blk_mvs[1] = kZeroMv; + blk_mvs[2] = kZeroMv; + blk_mvs[3] = kZeroMv; if (frame == alt_ref_index) { - filter_weight = 2; + blk_fw[0] = blk_fw[1] = blk_fw[2] = blk_fw[3] = 2; + use_32x32 = 1; } else { + const int thresh_low = 10000; + const int thresh_high = 20000; + int blk_bestsme[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX }; + // Find best match in this frame by MC - uint32_t err = temporal_filter_find_matching_mb_c( + int err = temporal_filter_find_matching_mb_c( cpi, td, frames[alt_ref_index]->y_buffer + mb_y_offset, frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride, - &ref_mv); + &ref_mv, blk_mvs, blk_bestsme); + + int err16 = + blk_bestsme[0] + blk_bestsme[1] + blk_bestsme[2] + blk_bestsme[3]; + int max_err = INT_MIN, min_err = INT_MAX; + for (k = 0; k < 4; k++) { + if (min_err > blk_bestsme[k]) min_err = blk_bestsme[k]; + if (max_err < blk_bestsme[k]) max_err = blk_bestsme[k]; + } + + if (((err * 15 < (err16 << 4)) && max_err - min_err < 10000) || + ((err * 14 < (err16 << 4)) && max_err - min_err < 5000)) { + use_32x32 = 1; + // Assign higher weight to matching MB if it's error + // score is lower. If not applying MC default behavior + // is to weight all MBs equal. + blk_fw[0] = err < (thresh_low << THR_SHIFT) + ? 2 + : err < (thresh_high << THR_SHIFT) ? 1 : 0; + blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0]; + } else { + use_32x32 = 0; + for (k = 0; k < 4; k++) + blk_fw[k] = blk_bestsme[k] < thresh_low + ? 2 + : blk_bestsme[k] < thresh_high ? 1 : 0; + } - // Assign higher weight to matching MB if its error - // score is lower. If not applying MC default behavior - // is to weight all MBs equal. - filter_weight = err < thresh_low ? 2 : err < thresh_high ? 
1 : 0; + for (k = 0; k < 4; k++) { + switch (abs(frame - alt_ref_index)) { + case 1: blk_fw[k] = VPXMIN(blk_fw[k], 2); break; + case 2: + case 3: blk_fw[k] = VPXMIN(blk_fw[k], 1); break; + default: break; + } + } } - if (filter_weight != 0) { + if (blk_fw[0] || blk_fw[1] || blk_fw[2] || blk_fw[3]) { // Construct the predictors temporal_filter_predictors_mb_c( mbd, frames[frame]->y_buffer + mb_y_offset, frames[frame]->u_buffer + mb_uv_offset, frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride, mb_uv_width, mb_uv_height, ref_mv.row, ref_mv.col, predictor, scale, - mb_col * 16, mb_row * 16); + mb_col * BW, mb_row * BH, blk_mvs, use_32x32); #if CONFIG_VP9_HIGHBITDEPTH if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { int adj_strength = strength + 2 * (mbd->bd - 8); // Apply the filter (YUV) vp9_highbd_temporal_filter_apply( - f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16, - adj_strength, filter_weight, accumulator, count); + f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, BH, + adj_strength, blk_fw, use_32x32, accumulator, count); vp9_highbd_temporal_filter_apply( - f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 256, count + 256); + f->u_buffer + mb_uv_offset, f->uv_stride, predictor + BLK_PELS, + mb_uv_width, mb_uv_height, adj_strength, blk_fw, use_32x32, + accumulator + BLK_PELS, count + BLK_PELS); vp9_highbd_temporal_filter_apply( - f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512, - mb_uv_width, mb_uv_height, adj_strength, filter_weight, - accumulator + 512, count + 512); + f->v_buffer + mb_uv_offset, f->uv_stride, + predictor + (BLK_PELS << 1), mb_uv_width, mb_uv_height, + adj_strength, blk_fw, use_32x32, accumulator + (BLK_PELS << 1), + count + (BLK_PELS << 1)); } else { // Apply the filter (YUV) - vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, filter_weight, - accumulator, count); - 
vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 256, - count + 256); - vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 512, - count + 512); + apply_temporal_filter( + f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32, + accumulator, count, accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); } #else // Apply the filter (YUV) - vp9_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride, - predictor, 16, 16, strength, filter_weight, - accumulator, count); - vp9_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride, - predictor + 256, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 256, - count + 256); - vp9_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride, - predictor + 512, mb_uv_width, mb_uv_height, - strength, filter_weight, accumulator + 512, - count + 512); + apply_temporal_filter( + f->y_buffer + mb_y_offset, f->y_stride, predictor, BW, + f->u_buffer + mb_uv_offset, f->v_buffer + mb_uv_offset, + f->uv_stride, predictor + BLK_PELS, predictor + (BLK_PELS << 1), + mb_uv_width, BW, BH, mbd->plane[1].subsampling_x, + mbd->plane[1].subsampling_y, strength, blk_fw, use_32x32, + accumulator, count, accumulator + BLK_PELS, count + BLK_PELS, + accumulator + (BLK_PELS << 1), count + (BLK_PELS << 1)); #endif // CONFIG_VP9_HIGHBITDEPTH } } @@ -459,8 +799,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst1_16 = CONVERT_TO_SHORTPTR(dst1); stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; - for (i = 
0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); pval *= fixed_divide[count[k]]; pval >>= 19; @@ -471,7 +811,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, byte++; } - byte += stride - 16; + byte += stride - BW; } dst1 = cpi->alt_ref_buffer.u_buffer; @@ -480,9 +820,9 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst2_16 = CONVERT_TO_SHORTPTR(dst2); stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; + int m = k + BLK_PELS; // U unsigned int pval = accumulator[k] + (count[k] >> 1); @@ -507,8 +847,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst1 = cpi->alt_ref_buffer.y_buffer; stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); pval *= fixed_divide[count[k]]; pval >>= 19; @@ -518,16 +858,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // move to next pixel byte++; } - byte += stride - 16; + byte += stride - BW; } dst1 = cpi->alt_ref_buffer.u_buffer; dst2 = cpi->alt_ref_buffer.v_buffer; stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; + int m = k + BLK_PELS; // U unsigned int pval = accumulator[k] + (count[k] >> 1); @@ -552,8 +892,8 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, dst1 = cpi->alt_ref_buffer.y_buffer; stride = cpi->alt_ref_buffer.y_stride; byte = 
mb_y_offset; - for (i = 0, k = 0; i < 16; i++) { - for (j = 0; j < 16; j++, k++) { + for (i = 0, k = 0; i < BH; i++) { + for (j = 0; j < BW; j++, k++) { unsigned int pval = accumulator[k] + (count[k] >> 1); pval *= fixed_divide[count[k]]; pval >>= 19; @@ -563,16 +903,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, // move to next pixel byte++; } - byte += stride - 16; + byte += stride - BW; } dst1 = cpi->alt_ref_buffer.u_buffer; dst2 = cpi->alt_ref_buffer.v_buffer; stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; - for (i = 0, k = 256; i < mb_uv_height; i++) { + for (i = 0, k = BLK_PELS; i < mb_uv_height; i++) { for (j = 0; j < mb_uv_width; j++, k++) { - int m = k + 256; + int m = k + BLK_PELS; // U unsigned int pval = accumulator[k] + (count[k] >> 1); @@ -592,7 +932,7 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, byte += stride - mb_uv_width; } #endif // CONFIG_VP9_HIGHBITDEPTH - mb_y_offset += 16; + mb_y_offset += BW; mb_uv_offset += mb_uv_width; } } @@ -603,10 +943,10 @@ static void temporal_filter_iterate_tile_c(VP9_COMP *cpi, int tile_row, const int tile_cols = 1 << cm->log2_tile_cols; TileInfo *tile_info = &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; - const int mb_row_start = (tile_info->mi_row_start) >> 1; - const int mb_row_end = (tile_info->mi_row_end + 1) >> 1; - const int mb_col_start = (tile_info->mi_col_start) >> 1; - const int mb_col_end = (tile_info->mi_col_end + 1) >> 1; + const int mb_row_start = (tile_info->mi_row_start) >> TF_SHIFT; + const int mb_row_end = (tile_info->mi_row_end + TF_ROUND) >> TF_SHIFT; + const int mb_col_start = (tile_info->mi_col_start) >> TF_SHIFT; + const int mb_col_end = (tile_info->mi_col_end + TF_ROUND) >> TF_SHIFT; int mb_row; for (mb_row = mb_row_start; mb_row < mb_row_end; mb_row++) { @@ -620,13 +960,6 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi) { const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << 
cm->log2_tile_rows; int tile_row, tile_col; - MACROBLOCKD *mbd = &cpi->td.mb.e_mbd; - // Save input state - uint8_t *input_buffer[MAX_MB_PLANE]; - int i; - - for (i = 0; i < MAX_MB_PLANE; i++) input_buffer[i] = mbd->plane[i].pre[0].buf; - vp9_init_tile_data(cpi); for (tile_row = 0; tile_row < tile_rows; ++tile_row) { @@ -634,15 +967,13 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi) { temporal_filter_iterate_tile_c(cpi, tile_row, tile_col); } } - - // Restore input state - for (i = 0; i < MAX_MB_PLANE; i++) mbd->plane[i].pre[0].buf = input_buffer[i]; } // Apply buffer limits and context specific adjustments to arnr filter. static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost, int *arnr_frames, int *arnr_strength) { const VP9EncoderConfig *const oxcf = &cpi->oxcf; + const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const int frames_after_arf = vp9_lookahead_depth(cpi->lookahead) - distance - 1; int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; @@ -696,12 +1027,17 @@ static void adjust_arnr_filter(VP9_COMP *cpi, int distance, int group_boost, } // Adjustments for second level arf in multi arf case. - if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) { - const GF_GROUP *const gf_group = &cpi->twopass.gf_group; - if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) { - strength >>= 1; - } - } + // Leave commented out place holder for possible filtering adjustment with + // new multi-layer arf code. + // if (cpi->oxcf.pass == 2 && cpi->multi_arf_allowed) + // if (gf_group->rf_level[gf_group->index] != GF_ARF_STD) strength >>= 1; + + // TODO(jingning): Skip temporal filtering for intermediate frames that will + // be used as show_existing_frame. Need to further explore the possibility to + // apply certain filter. 
+ if (gf_group->arf_src_offset[gf_group->index] < + cpi->rc.baseline_gf_interval - 1) + frames = 1; *arnr_frames = frames; *arnr_strength = strength; @@ -800,8 +1136,7 @@ void vp9_temporal_filter(VP9_COMP *cpi, int distance) { } // Initialize errorperbit and sabperbit. - rdmult = (int)vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX); - if (rdmult < 1) rdmult = 1; + rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, ARNR_FILT_QINDEX); set_error_per_bit(&cpi->td.mb, rdmult); vp9_initialize_me_consts(cpi, &cpi->td.mb, ARNR_FILT_QINDEX); diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.h b/libvpx/vp9/encoder/vp9_temporal_filter.h index 775e49cc5..f5fa194d1 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.h +++ b/libvpx/vp9/encoder/vp9_temporal_filter.h @@ -8,14 +8,29 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ -#define VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#ifndef VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#define VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ #ifdef __cplusplus extern "C" { #endif #define ARNR_FILT_QINDEX 128 +static const MV kZeroMv = { 0, 0 }; + +// Block size used in temporal filtering +#define TF_BLOCK BLOCK_32X32 +#define BH 32 +#define BH_LOG2 5 +#define BW 32 +#define BW_LOG2 5 +#define BLK_PELS 1024 // Pixels in the block +#define TF_SHIFT 2 +#define TF_ROUND 3 +#define THR_SHIFT 2 +#define TF_SUB_BLOCK BLOCK_16X16 +#define SUB_BH 16 +#define SUB_BW 16 void vp9_temporal_filter_init(void); void vp9_temporal_filter(VP9_COMP *cpi, int distance); @@ -28,4 +43,4 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ +#endif // VPX_VP9_ENCODER_VP9_TEMPORAL_FILTER_H_ diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h index b2f63ffef..6407ff923 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.h +++ b/libvpx/vp9/encoder/vp9_tokenize.h @@ -8,8 +8,8 @@ * be found in the 
AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TOKENIZE_H_ -#define VP9_ENCODER_VP9_TOKENIZE_H_ +#ifndef VPX_VP9_ENCODER_VP9_TOKENIZE_H_ +#define VPX_VP9_ENCODER_VP9_TOKENIZE_H_ #include "vp9/common/vp9_entropy.h" @@ -127,4 +127,4 @@ static INLINE int vp9_get_token_cost(int v, int16_t *token, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TOKENIZE_H_ +#endif // VPX_VP9_ENCODER_VP9_TOKENIZE_H_ diff --git a/libvpx/vp9/encoder/vp9_treewriter.h b/libvpx/vp9/encoder/vp9_treewriter.h index a8b9c2cd3..86c5fa224 100644 --- a/libvpx/vp9/encoder/vp9_treewriter.h +++ b/libvpx/vp9/encoder/vp9_treewriter.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_ENCODER_VP9_TREEWRITER_H_ -#define VP9_ENCODER_VP9_TREEWRITER_H_ +#ifndef VPX_VP9_ENCODER_VP9_TREEWRITER_H_ +#define VPX_VP9_ENCODER_VP9_TREEWRITER_H_ #include "vpx_dsp/bitwriter.h" @@ -48,4 +48,4 @@ static INLINE void vp9_write_token(vpx_writer *w, const vpx_tree_index *tree, } // extern "C" #endif -#endif // VP9_ENCODER_VP9_TREEWRITER_H_ +#endif // VPX_VP9_ENCODER_VP9_TREEWRITER_H_ diff --git a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c index 460dab659..e5860d39c 100644 --- a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c +++ b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c @@ -241,7 +241,7 @@ void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride, int weight, uint32_t *accumulator, uint16_t *count) { unsigned int h; - const int rounding = strength > 0 ? 
1 << (strength - 1) : 0; + const int rounding = (1 << strength) >> 1; assert(strength >= 0); assert(strength <= 6); diff --git a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c index dbd243ac1..8426b9475 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -14,6 +14,7 @@ #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/fwd_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" @@ -170,23 +171,23 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride, fadst4_sse2(in); write_buffer_4x4(output, in); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_4x4(input, in, stride); fadst4_sse2(in); fadst4_sse2(in); write_buffer_4x4(output, in); break; - default: assert(0); break; } } void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, - int16_t *coeff_ptr, intptr_t n_coeffs, + tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, - const int16_t *quant_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { __m128i zero; int pass; @@ -215,7 +216,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, __m128i *in[8]; int index = 0; - (void)scan_ptr; + (void)scan; (void)coeff_ptr; // Pre-condition input (shift by two) @@ -449,7 +450,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, in7 = _mm_srai_epi16(in7, 1); } - iscan_ptr += n_coeffs; + iscan += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; @@ -497,15 +498,15 
@@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -518,8 +519,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -562,14 +563,14 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - 
_mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -582,8 +583,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -609,10 +610,10 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, } } else { do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + store_tran_low(zero, qcoeff_ptr + n_coeffs); + store_tran_low(zero, qcoeff_ptr + n_coeffs + 8); + store_tran_low(zero, dqcoeff_ptr + n_coeffs); + store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; @@ -1097,14 +1098,14 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - case ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_8x8(input, in, stride); fadst8_sse2(in); fadst8_sse2(in); right_shift_8x8(in, 1); write_buffer_8x8(output, in, 8); break; - default: assert(0); break; } } @@ -1963,13 +1964,13 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - case 
ADST_ADST: + default: + assert(tx_type == ADST_ADST); load_buffer_16x16(input, in0, in1, stride); fadst16_sse2(in0, in1); right_shift_16x16(in0, in1); fadst16_sse2(in0, in1); write_buffer_16x16(output, in0, in1, 16); break; - default: assert(0); break; } } diff --git a/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c index bf874a09e..99c193894 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c @@ -18,11 +18,13 @@ #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -void vp9_fdct8x8_quant_ssse3( - const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, + tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { __m128i zero; int pass; @@ -52,7 +54,7 @@ void vp9_fdct8x8_quant_ssse3( __m128i *in[8]; int index = 0; - (void)scan_ptr; + (void)scan; (void)coeff_ptr; // Pre-condition input (shift by two) @@ -280,7 +282,7 @@ void vp9_fdct8x8_quant_ssse3( in7 = _mm_srai_epi16(in7, 1); } - iscan_ptr += n_coeffs; + iscan += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; @@ -350,8 +352,8 @@ void vp9_fdct8x8_quant_ssse3( zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const 
__m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -427,8 +429,8 @@ void vp9_fdct8x8_quant_ssse3( zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); diff --git a/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c b/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c index 91f627c34..af04b6c52 100644 --- a/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c @@ -11,24 +11,25 @@ #include <emmintrin.h> #include <stdio.h> +#include "./vp9_rtcd.h" #include "vp9/common/vp9_common.h" -int64_t vp9_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff, - intptr_t block_size, int64_t *ssz, - int bps) { +int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { int i, j, test; uint32_t temp[4]; __m128i max, min, cmp0, cmp1, cmp2, cmp3; int64_t error = 0, sqcoeff = 0; - const int shift = 2 * (bps - 8); + const int shift = 2 * (bd - 8); const int rounding = shift > 0 ? 
1 << (shift - 1) : 0; for (i = 0; i < block_size; i += 8) { // Load the data into xmm registers - __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i)); - __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4)); - __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i)); - __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4)); + __m128i mm_coeff = _mm_load_si128((const __m128i *)(coeff + i)); + __m128i mm_coeff2 = _mm_load_si128((const __m128i *)(coeff + i + 4)); + __m128i mm_dqcoeff = _mm_load_si128((const __m128i *)(dqcoeff + i)); + __m128i mm_dqcoeff2 = _mm_load_si128((const __m128i *)(dqcoeff + i + 4)); // Check if any values require more than 15 bit max = _mm_set1_epi32(0x3fff); min = _mm_set1_epi32(0xffffc000); diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c new file mode 100644 index 000000000..8dfdbd50f --- /dev/null +++ b/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> // AVX2 + +#include "./vp9_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_dsp/x86/quantize_sse2.h" + +// Zero fill 8 positions in the output buffer. 
+static INLINE void store_zero_tran_low(tran_low_t *a) { + const __m256i zero = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_storeu_si256((__m256i *)(a), zero); + _mm256_storeu_si256((__m256i *)(a + 8), zero); +#else + _mm256_storeu_si256((__m256i *)(a), zero); +#endif +} + +static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr, + __m256i *coeff256) { + const __m256i iscan = _mm256_loadu_si256(iscan_ptr); + const __m256i zero256 = _mm256_setzero_si256(); +#if CONFIG_VP9_HIGHBITDEPTH + // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as + // B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly. + const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8); + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256); +#else + const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256); +#endif + const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256); + // Add one to convert from indices to counts + const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0); + return _mm256_and_si256(iscan_plus_one, nzero_coeff0); +} + +void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + __m128i eob; + __m256i round256, quant256, dequant256; + __m256i eob256, thr256; + + (void)scan; + (void)skip_block; + assert(!skip_block); + + coeff_ptr += n_coeffs; + iscan += n_coeffs; + qcoeff_ptr += n_coeffs; + dqcoeff_ptr += n_coeffs; + n_coeffs = -n_coeffs; + + { + __m256i coeff256; + + // Setup global values + { + const __m128i round = _mm_load_si128((const __m128i *)round_ptr); + const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr); + const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr); + round256 = 
_mm256_castsi128_si256(round); + round256 = _mm256_permute4x64_epi64(round256, 0x54); + + quant256 = _mm256_castsi128_si256(quant); + quant256 = _mm256_permute4x64_epi64(quant256, 0x54); + + dequant256 = _mm256_castsi128_si256(dequant); + dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54); + } + + { + __m256i qcoeff256; + __m256i qtmp256; + coeff256 = load_tran_low(coeff_ptr + n_coeffs); + qcoeff256 = _mm256_abs_epi16(coeff256); + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); + } + + eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256); + n_coeffs += 8 * 2; + } + + // remove dc constants + dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31); + quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); + round256 = _mm256_permute2x128_si256(round256, round256, 0x31); + + thr256 = _mm256_srai_epi16(dequant256, 1); + + // AC only loop + while (n_coeffs < 0) { + __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs); + __m256i qcoeff256 = _mm256_abs_epi16(coeff256); + int32_t nzflag = + _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256)); + + if (nzflag) { + __m256i qtmp256; + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); + eob256 = _mm256_max_epi16( + eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256)); + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + } + n_coeffs += 8 * 2; + } + + eob = 
_mm_max_epi16(_mm256_castsi256_si128(eob256), + _mm256_extracti128_si256(eob256, 1)); + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c index ca0ad4407..885220a71 100644 --- a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c @@ -21,20 +21,20 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { __m128i zero; __m128i thr; int16_t nzflag; __m128i eob; __m128i round, quant, dequant; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; + iscan += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; @@ -100,8 +100,8 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -175,8 +175,8 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - 
iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk index 5bfc0d359..7ca4004b0 100644 --- a/libvpx/vp9/vp9_common.mk +++ b/libvpx/vp9/vp9_common.mk @@ -63,30 +63,33 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c -ifeq ($(CONFIG_VP9_POSTPROC),yes) -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm -endif - -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c -endif -# common (msa) -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c +VP9_COMMON_SRCS-$(HAVE_VSX) += common/ppc/vp9_idct_vsx.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht16x16_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht_neon.h ifeq ($(CONFIG_VP9_POSTPROC),yes) 
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c +VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_mfqe_sse2.asm endif -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c - ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c +VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c +else +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht4x4_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht8x8_add_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht16x16_add_neon.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c +VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c endif $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c index 881caae78..85f83a662 100644 --- a/libvpx/vp9/vp9_cx_iface.c +++ b/libvpx/vp9/vp9_cx_iface.c @@ -30,6 +30,7 @@ struct vp9_extracfg { unsigned int static_thresh; unsigned int tile_columns; unsigned int tile_rows; + unsigned int enable_tpl_model; unsigned int arnr_max_frames; unsigned int arnr_strength; unsigned int min_gf_interval; @@ -63,6 +64,7 @@ static struct vp9_extracfg default_extra_cfg = { 0, // static_thresh 6, // tile_columns 0, // tile_rows + 1, // enable_tpl_model 7, // arnr_max_frames 5, // arnr_strength 0, // min_gf_interval; 0 -> default decision @@ -237,22 +239,6 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("ts_rate_decimator factors are not powers of 2"); } -#if 
CONFIG_SPATIAL_SVC - - if ((cfg->ss_number_layers > 1 || cfg->ts_number_layers > 1) && - cfg->g_pass == VPX_RC_LAST_PASS) { - unsigned int i, alt_ref_sum = 0; - for (i = 0; i < cfg->ss_number_layers; ++i) { - if (cfg->ss_enable_auto_alt_ref[i]) ++alt_ref_sum; - } - if (alt_ref_sum > REF_FRAMES - cfg->ss_number_layers) - ERROR("Not enough ref buffers for svc alt ref frames"); - if (cfg->ss_number_layers * cfg->ts_number_layers > 3 && - cfg->g_error_resilient == 0) - ERROR("Multiple frame context are not supported for more than 3 layers"); - } -#endif - // VP9 does not support a lower bound on the keyframe interval in // automatic keyframe placement mode. if (cfg->kf_mode != VPX_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist && @@ -263,8 +249,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK(extra_cfg, row_mt, 0, 1); RANGE_CHECK(extra_cfg, motion_vector_unit_test, 0, 2); - RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2); - RANGE_CHECK(extra_cfg, cpu_used, -8, 8); + RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, MAX_ARF_LAYERS); + RANGE_CHECK(extra_cfg, cpu_used, -9, 9); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); RANGE_CHECK(extra_cfg, tile_columns, 0, 6); RANGE_CHECK(extra_cfg, tile_rows, 0, 2); @@ -560,6 +546,8 @@ static vpx_codec_err_t set_encoder_config( oxcf->tile_columns = extra_cfg->tile_columns; + oxcf->enable_tpl_model = extra_cfg->enable_tpl_model; + // TODO(yunqing): The dependencies between row tiles cause error in multi- // threaded encoding. For now, tile_rows is forced to be 0 in this case. 
// The further fix can be done by adding synchronizations after a tile row @@ -589,9 +577,6 @@ static vpx_codec_err_t set_encoder_config( oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test; for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { -#if CONFIG_SPATIAL_SVC - oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl]; -#endif for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] = 1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl]; @@ -599,9 +584,6 @@ static vpx_codec_err_t set_encoder_config( } if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) { oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; -#if CONFIG_SPATIAL_SVC - oxcf->ss_enable_auto_arf[0] = extra_cfg->enable_auto_alt_ref; -#endif } if (oxcf->ts_number_layers > 1) { for (tl = 0; tl < VPX_TS_MAX_LAYERS; ++tl) { @@ -762,6 +744,13 @@ static vpx_codec_err_t ctrl_set_tile_rows(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_tpl_model(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.enable_tpl_model = CAST(VP9E_SET_TPL, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_err_t ctrl_set_arnr_max_frames(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp9_extracfg extra_cfg = ctx->extra_cfg; @@ -1067,12 +1056,11 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi, vpx_codec_frame_flags_t flags = lib_flags << 16; if (lib_flags & FRAMEFLAGS_KEY || - (cpi->use_svc && - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers + - cpi->svc.temporal_layer_id] - .is_key_frame)) + (cpi->use_svc && cpi->svc + .layer_context[cpi->svc.spatial_layer_id * + cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id] + .is_key_frame)) flags |= VPX_FRAME_IS_KEY; if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE; @@ -1097,23 
+1085,11 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (cpi->oxcf.pass == 2 && cpi->level_constraint.level_index >= 0 && !cpi->level_constraint.rc_config_updated) { - SVC *const svc = &cpi->svc; - const int is_two_pass_svc = - (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); const VP9EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->twopass; FIRSTPASS_STATS *stats = &twopass->total_stats; - if (is_two_pass_svc) { - const double frame_rate = 10000000.0 * stats->count / stats->duration; - vp9_update_spatial_layer_framerate(cpi, frame_rate); - twopass->bits_left = - (int64_t)(stats->duration * - svc->layer_context[svc->spatial_layer_id].target_bandwidth / - 10000000.0); - } else { - twopass->bits_left = - (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); - } + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); cpi->level_constraint.rc_config_updated = 1; } @@ -1123,7 +1099,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, // There's no codec control for multiple alt-refs so check the encoder // instance for its status to determine the compressed data size. data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 * - (cpi->multi_arf_allowed ? 8 : 2); + (cpi->multi_layer_arf ? 
8 : 2); if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize; if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) { ctx->cx_data_sz = data_sz; @@ -1174,6 +1150,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, size_t size, cx_data_sz; unsigned char *cx_data; + cpi->svc.timebase_fac = timebase_units_to_ticks(timebase, 1); + cpi->svc.time_stamp_superframe = dst_time_stamp; + // Set up internal flags if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1; @@ -1213,27 +1192,23 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, -1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data, &dst_time_stamp, &dst_end_time_stamp, !img)) { - if (size) { + if (size || (cpi->use_svc && cpi->svc.skip_enhancement_layer)) { vpx_codec_cx_pkt_t pkt; -#if CONFIG_SPATIAL_SVC - if (cpi->use_svc) - cpi->svc - .layer_context[cpi->svc.spatial_layer_id * - cpi->svc.number_temporal_layers] - .layer_size += size; -#endif - // Pack invisible frames with the next visible frame if (!cpi->common.show_frame || (cpi->use_svc && cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)) { if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data; ctx->pending_cx_data_sz += size; - ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + if (size) ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; ctx->pending_frame_magnitude |= size; cx_data += size; cx_data_sz -= size; + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; + pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] = + 1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id]; if (ctx->output_cx_pkt_cb.output_cx_pkt) { pkt.kind = VPX_CODEC_CX_FRAME_PKT; @@ -1260,9 +1235,13 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units( timebase, dst_end_time_stamp - 
dst_time_stamp); pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags); + pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width; + pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height; + pkt.data.frame.spatial_layer_encoded[cpi->svc.spatial_layer_id] = + 1 - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id]; if (ctx->pending_cx_data) { - ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; + if (size) ctx->pending_frame_sizes[ctx->pending_frame_count++] = size; ctx->pending_frame_magnitude |= size; ctx->pending_cx_data_sz += size; // write the superframe only for the case when @@ -1288,27 +1267,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data += size; cx_data_sz -= size; -#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) - if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) { - vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr; - int sl; - vp9_zero(pkt_sizes); - vp9_zero(pkt_psnr); - pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES; - pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR; - for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { - LAYER_CONTEXT *lc = - &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers]; - pkt_sizes.data.layer_sizes[sl] = lc->layer_size; - pkt_psnr.data.layer_psnr[sl] = lc->psnr_pkt; - lc->layer_size = 0; - } - - vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes); - - vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr); - } -#endif if (is_one_pass_cbr_svc(cpi) && (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { // Encoded all spatial layers; exit loop. 
@@ -1338,9 +1296,8 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, @@ -1354,9 +1311,8 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, vp9_copy_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type), &sd); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, @@ -1364,14 +1320,13 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, vp9_ref_frame_t *const frame = va_arg(args, vp9_ref_frame_t *); if (frame != NULL) { - YV12_BUFFER_CONFIG *fb = get_ref_frame(&ctx->cpi->common, frame->idx); + const int fb_idx = ctx->cpi->common.cur_show_frame_fb_idx; + YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->cpi->common, fb_idx); if (fb == NULL) return VPX_CODEC_ERROR; - yuvconfig2image(&frame->img, fb, NULL); return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, @@ -1381,9 +1336,8 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx, if (config != NULL) { ctx->preview_ppcfg = *config; return VPX_CODEC_OK; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; #else (void)ctx; (void)args; @@ -1405,17 +1359,24 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) { if (vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags) == 0) { yuvconfig2image(&ctx->preview_img, &sd, NULL); return &ctx->preview_img; - } else { - return NULL; } + return NULL; } static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, va_list args) { - (void)ctx; - (void)args; + 
vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *); + + if (data) { + vpx_roi_map_t *roi = (vpx_roi_map_t *)data; - // TODO(yaowu): Need to re-implement and test for VP9. + if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->skip, + roi->ref_frame)) { + return VPX_CODEC_OK; + } + return VPX_CODEC_INVALID_PARAM; + } return VPX_CODEC_INVALID_PARAM; } @@ -1427,11 +1388,10 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_set_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, @@ -1442,11 +1402,10 @@ static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx, if (!vp9_get_active_map(ctx->cpi, map->active_map, (int)map->rows, (int)map->cols)) return VPX_CODEC_OK; - else - return VPX_CODEC_INVALID_PARAM; - } else { + return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, @@ -1458,9 +1417,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode, (VPX_SCALING)mode->v_scaling_mode); return (res == 0) ? 
VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; - } else { - return VPX_CODEC_INVALID_PARAM; } + return VPX_CODEC_INVALID_PARAM; } static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) { @@ -1491,22 +1449,23 @@ static vpx_codec_err_t ctrl_set_svc_layer_id(vpx_codec_alg_priv_t *ctx, vpx_svc_layer_id_t *const data = va_arg(args, vpx_svc_layer_id_t *); VP9_COMP *const cpi = (VP9_COMP *)ctx->cpi; SVC *const svc = &cpi->svc; + int sl; - svc->first_spatial_layer_to_encode = data->spatial_layer_id; svc->spatial_layer_to_encode = data->spatial_layer_id; + svc->first_spatial_layer_to_encode = data->spatial_layer_id; + // TODO(jianj): Deprecated to be removed. svc->temporal_layer_id = data->temporal_layer_id; + // Allow for setting temporal layer per spatial layer for superframe. + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { + svc->temporal_layer_id_per_spatial[sl] = + data->temporal_layer_id_per_spatial[sl]; + } // Checks on valid layer_id input. if (svc->temporal_layer_id < 0 || svc->temporal_layer_id >= (int)ctx->cfg.ts_number_layers) { return VPX_CODEC_INVALID_PARAM; } - if (svc->first_spatial_layer_to_encode < 0 || - svc->first_spatial_layer_to_encode >= (int)ctx->cfg.ss_number_layers) { - return VPX_CODEC_INVALID_PARAM; - } - // First spatial layer to encode not implemented for two-pass. 
- if (is_two_pass_svc(cpi) && svc->first_spatial_layer_to_encode > 0) - return VPX_CODEC_INVALID_PARAM; + return VPX_CODEC_OK; } @@ -1546,20 +1505,87 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *); + int sl; + for (sl = 0; sl <= cpi->svc.spatial_layer_id; sl++) { + data->update_buffer_slot[sl] = cpi->svc.update_buffer_slot[sl]; + data->reference_last[sl] = cpi->svc.reference_last[sl]; + data->reference_golden[sl] = cpi->svc.reference_golden[sl]; + data->reference_alt_ref[sl] = cpi->svc.reference_altref[sl]; + data->lst_fb_idx[sl] = cpi->svc.lst_fb_idx[sl]; + data->gld_fb_idx[sl] = cpi->svc.gld_fb_idx[sl]; + data->alt_fb_idx[sl] = cpi->svc.alt_fb_idx[sl]; + // TODO(jianj): Remove these 3, deprecated. + data->update_last[sl] = cpi->svc.update_last[sl]; + data->update_golden[sl] = cpi->svc.update_golden[sl]; + data->update_alt_ref[sl] = cpi->svc.update_altref[sl]; + } + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *); int sl; + cpi->svc.use_set_ref_frame_config = 1; for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { - cpi->svc.ext_frame_flags[sl] = data->frame_flags[sl]; - cpi->svc.ext_lst_fb_idx[sl] = data->lst_fb_idx[sl]; - cpi->svc.ext_gld_fb_idx[sl] = data->gld_fb_idx[sl]; - cpi->svc.ext_alt_fb_idx[sl] = data->alt_fb_idx[sl]; + cpi->svc.update_buffer_slot[sl] = data->update_buffer_slot[sl]; + cpi->svc.reference_last[sl] = data->reference_last[sl]; + cpi->svc.reference_golden[sl] = data->reference_golden[sl]; + cpi->svc.reference_altref[sl] = data->reference_alt_ref[sl]; + cpi->svc.lst_fb_idx[sl] = 
data->lst_fb_idx[sl]; + cpi->svc.gld_fb_idx[sl] = data->gld_fb_idx[sl]; + cpi->svc.alt_fb_idx[sl] = data->alt_fb_idx[sl]; + cpi->svc.duration[sl] = data->duration[sl]; } return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_svc_inter_layer_pred(vpx_codec_alg_priv_t *ctx, + va_list args) { + const int data = va_arg(args, int); + VP9_COMP *const cpi = ctx->cpi; + cpi->svc.disable_inter_layer_pred = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_frame_drop_layer(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_frame_drop_t *data = va_arg(args, vpx_svc_frame_drop_t *); + int sl; + cpi->svc.framedrop_mode = data->framedrop_mode; + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) + cpi->svc.framedrop_thresh[sl] = data->framedrop_thresh[sl]; + // Don't allow max_consec_drop values below 1. + cpi->svc.max_consec_drop = VPXMAX(1, data->max_consec_drop); + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_gf_temporal_ref(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->svc.use_gf_temporal_ref = data; + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_set_svc_spatial_layer_sync( + vpx_codec_alg_priv_t *ctx, va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_spatial_layer_sync_t *data = + va_arg(args, vpx_svc_spatial_layer_sync_t *); + int sl; + for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) + cpi->svc.spatial_layer_sync[sl] = data->spatial_layer_sync[sl]; + cpi->svc.set_intra_only_frame = data->base_layer_intra_only; + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_register_cx_callback(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_codec_priv_output_cx_pkt_cb_pair_t *cbp = @@ -1600,13 +1626,21 @@ static vpx_codec_err_t ctrl_set_render_size(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t 
ctrl_set_postencode_drop(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + const unsigned int data = va_arg(args, unsigned int); + cpi->rc.ext_use_post_encode_drop = data; + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, // Setters { VP8_SET_REFERENCE, ctrl_set_reference }, { VP8_SET_POSTPROC, ctrl_set_previewpp }, - { VP8E_SET_ROI_MAP, ctrl_set_roi_map }, + { VP9E_SET_ROI_MAP, ctrl_set_roi_map }, { VP8E_SET_ACTIVEMAP, ctrl_set_active_map }, { VP8E_SET_SCALEMODE, ctrl_set_scale_mode }, { VP8E_SET_CPUUSED, ctrl_set_cpuused }, @@ -1615,6 +1649,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP8E_SET_STATIC_THRESHOLD, ctrl_set_static_thresh }, { VP9E_SET_TILE_COLUMNS, ctrl_set_tile_columns }, { VP9E_SET_TILE_ROWS, ctrl_set_tile_rows }, + { VP9E_SET_TPL, ctrl_set_tpl_model }, { VP8E_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames }, { VP8E_SET_ARNR_STRENGTH, ctrl_set_arnr_strength }, { VP8E_SET_ARNR_TYPE, ctrl_set_arnr_type }, @@ -1642,7 +1677,12 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_RENDER_SIZE, ctrl_set_render_size }, { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level }, { VP9E_SET_ROW_MT, ctrl_set_row_mt }, + { VP9E_SET_POSTENCODE_DROP, ctrl_set_postencode_drop }, { VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test }, + { VP9E_SET_SVC_INTER_LAYER_PRED, ctrl_set_svc_inter_layer_pred }, + { VP9E_SET_SVC_FRAME_DROP_LAYER, ctrl_set_svc_frame_drop_layer }, + { VP9E_SET_SVC_GF_TEMPORAL_REF, ctrl_set_svc_gf_temporal_ref }, + { VP9E_SET_SVC_SPATIAL_LAYER_SYNC, ctrl_set_svc_spatial_layer_sync }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, @@ -1651,6 +1691,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id }, { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, + { VP9E_GET_SVC_REF_FRAME_CONFIG, 
ctrl_get_svc_ref_frame_config }, { -1, NULL }, }; @@ -1659,7 +1700,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0, { // NOLINT - 0, // g_usage + 0, // g_usage (unused) 8, // g_threads 0, // g_profile diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c index 2a5578674..6a4cb9acf 100644 --- a/libvpx/vp9/vp9_dx_iface.c +++ b/libvpx/vp9/vp9_dx_iface.c @@ -238,6 +238,19 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { flags->noise_level = ctx->postproc_cfg.noise_level; } +#undef ERROR +#define ERROR(str) \ + do { \ + ctx->base.err_detail = str; \ + return VPX_CODEC_INVALID_PARAM; \ + } while (0) + +#define RANGE_CHECK(p, memb, lo, hi) \ + do { \ + if (!(((p)->memb == lo || (p)->memb > (lo)) && (p)->memb <= hi)) \ + ERROR(#memb " out of range [" #lo ".." #hi "]"); \ + } while (0) + static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { ctx->last_show_frame = -1; ctx->need_resync = 1; @@ -254,6 +267,12 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { ctx->pbi->max_threads = ctx->cfg.threads; ctx->pbi->inv_tile_order = ctx->invert_tile_order; + RANGE_CHECK(ctx, row_mt, 0, 1); + ctx->pbi->row_mt = ctx->row_mt; + + RANGE_CHECK(ctx, lpf_opt, 0, 1); + ctx->pbi->lpf_mt_opt = ctx->lpf_opt; + // If postprocessing was enabled by the application and a // configuration has not been provided, default it. 
if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) @@ -455,8 +474,8 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); if (data) { - YV12_BUFFER_CONFIG *fb; - fb = get_ref_frame(&ctx->pbi->common, data->idx); + const int fb_idx = ctx->pbi->common.cur_show_frame_fb_idx; + YV12_BUFFER_CONFIG *fb = get_buf_frame(&ctx->pbi->common, fb_idx); if (fb == NULL) return VPX_CODEC_ERROR; yuvconfig2image(&data->img, fb, NULL); return VPX_CODEC_OK; @@ -635,6 +654,20 @@ static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_row_mt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->row_mt = va_arg(args, int); + + return VPX_CODEC_OK; +} + +static vpx_codec_err_t ctrl_enable_lpf_opt(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->lpf_opt = va_arg(args, int); + + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -646,6 +679,8 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc }, + { VP9D_SET_ROW_MT, ctrl_set_row_mt }, + { VP9D_SET_LOOP_FILTER_OPT, ctrl_enable_lpf_opt }, // Getters { VPXD_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/libvpx/vp9/vp9_dx_iface.h b/libvpx/vp9/vp9_dx_iface.h index 18bc7ab0d..f60688c4d 100644 --- a/libvpx/vp9/vp9_dx_iface.h +++ b/libvpx/vp9/vp9_dx_iface.h @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_VP9_DX_IFACE_H_ -#define VP9_VP9_DX_IFACE_H_ +#ifndef VPX_VP9_VP9_DX_IFACE_H_ +#define VPX_VP9_VP9_DX_IFACE_H_ #include "vp9/decoder/vp9_decoder.h" @@ -45,6 +45,8 @@ struct vpx_codec_alg_priv { // Allow for decoding up to a given spatial layer for SVC stream. 
int svc_decoding; int svc_spatial_layer; + int row_mt; + int lpf_opt; }; -#endif // VP9_VP9_DX_IFACE_H_ +#endif // VPX_VP9_VP9_DX_IFACE_H_ diff --git a/libvpx/vp9/vp9_iface_common.h b/libvpx/vp9/vp9_iface_common.h index d68872750..a1921db63 100644 --- a/libvpx/vp9/vp9_iface_common.h +++ b/libvpx/vp9/vp9_iface_common.h @@ -7,17 +7,17 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VP9_VP9_IFACE_COMMON_H_ -#define VP9_VP9_IFACE_COMMON_H_ +#ifndef VPX_VP9_VP9_IFACE_COMMON_H_ +#define VPX_VP9_VP9_IFACE_COMMON_H_ #include "vpx_ports/mem.h" static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, void *user_priv) { /** vpx_img_wrap() doesn't allow specifying independent strides for - * the Y, U, and V planes, nor other alignment adjustments that - * might be representable by a YV12_BUFFER_CONFIG, so we just - * initialize all the fields.*/ + * the Y, U, and V planes, nor other alignment adjustments that + * might be representable by a YV12_BUFFER_CONFIG, so we just + * initialize all the fields.*/ int bps; if (!yv12->subsampling_y) { if (!yv12->subsampling_x) { @@ -142,4 +142,4 @@ static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) { assert(0 && "Invalid Reference Frame"); return VP9_LAST_FLAG; } -#endif // VP9_VP9_IFACE_COMMON_H_ +#endif // VPX_VP9_VP9_IFACE_COMMON_H_ diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk index d633ed142..05981d689 100644 --- a/libvpx/vp9/vp9cx.mk +++ b/libvpx/vp9/vp9cx.mk @@ -64,6 +64,7 @@ VP9_CX_SRCS-yes += encoder/vp9_ratectrl.c VP9_CX_SRCS-yes += encoder/vp9_rd.c VP9_CX_SRCS-yes += encoder/vp9_rdopt.c VP9_CX_SRCS-yes += encoder/vp9_pickmode.c +VP9_CX_SRCS-yes += encoder/vp9_partition_models.h VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h VP9_CX_SRCS-yes += encoder/vp9_speed_features.c @@ -74,6 +75,7 @@ VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.c 
VP9_CX_SRCS-yes += encoder/vp9_resize.c VP9_CX_SRCS-yes += encoder/vp9_resize.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.c +VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_blockiness.h VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c @@ -103,6 +105,7 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c +VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c @@ -139,6 +142,8 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h +VP9_CX_SRCS-$(HAVE_VSX) += encoder/ppc/vp9_quantize_vsx.c + # Strip unnecessary files with CONFIG_REALTIME_ONLY VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c |