diff options
author | hkuang <hkuang@google.com> | 2013-08-06 11:07:19 -0700 |
---|---|---|
committer | Hangyu Kuang <hkuang@google.com> | 2013-08-06 18:31:37 +0000 |
commit | f3bed9137f66ef693bd406e43b17e9a1114f1e14 (patch) | |
tree | cd1bea0cd923c6d125cb5b3e7b3404d7c2f70208 /libvpx/vp9/common | |
parent | a8b927ab4f06e2fc0d16d9606b57672df9899ac1 (diff) | |
download | libvpx-f3bed9137f66ef693bd406e43b17e9a1114f1e14.tar.gz |
Roll latest libvpx into Android. android-4.4_r0.8 android-4.4_r0.7
The latest libvpx just added initial multithreaded VP9 decoding support and more NEON optimizations.
Checkout is from the master branch (hash: 33afddadb9af6569bd8296ef1d48d0511b651e9d).
Change-Id: I54be2f48bc033c00876b6b1d0a3ff1eccb92a2fa
Diffstat (limited to 'libvpx/vp9/common')
31 files changed, 1706 insertions, 850 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm index 15039e267..110a56cdd 100644 --- a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm @@ -52,15 +52,15 @@ ; sp[]int h |vp9_convolve8_avg_horiz_neon| PROC + ldr r12, [sp, #4] ; x_step_q4 + cmp r12, #16 + bne vp9_convolve8_avg_horiz_c + push {r4-r10, lr} sub r0, r0, #3 ; adjust for taps - ldr r4, [sp, #36] ; x_step_q4 ldr r5, [sp, #32] ; filter_x - cmp r4, #16 - bne call_horiz_c_convolve ; x_step_q4 != 16 - ldr r6, [sp, #48] ; w ldr r7, [sp, #52] ; h @@ -82,22 +82,22 @@ mov r10, r6 ; w loop counter loop_horiz - vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]! - vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]! + vld1.8 {d24}, [r0]! vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 - vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]! - vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]! + vld1.8 {d25}, [r0]! vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 - vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]! - vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]! + vld1.8 {d26}, [r0]! vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 - vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]! - vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]! + vld1.8 {d27}, [r0]! 
vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 + vtrn.16 q12, q13 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + ; extract to s16 vmovl.u8 q8, d24 vmovl.u8 q9, d25 @@ -128,8 +128,8 @@ loop_horiz vqrshrun.s32 d5, q15, #7 ; saturate - vqshrn.u16 d2, q1, #0 - vqshrn.u16 d3, q2, #0 + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 ; transpose vtrn.16 d2, d3 @@ -137,10 +137,7 @@ loop_horiz vtrn.8 d2, d3 ; average the new value and the dst value - vaddl.u8 q8, d2, d6 - vaddl.u8 q9, d3, d7 - vqrshrn.u16 d2, q8, #1 - vqrshrn.u16 d3, q9, #1 + vrhadd.u8 q1, q1, q3 vst1.u32 {d2[0]}, [r2], r3 vst1.u32 {d3[0]}, [r2], r3 @@ -159,26 +156,20 @@ loop_horiz pop {r4-r10, pc} -call_horiz_c_convolve - pop {r4-r10, lr} - add r0, r0, #3 ; un-adjust for taps - b vp9_convolve8_avg_horiz_c - - ENDP |vp9_convolve8_avg_vert_neon| PROC + ldr r12, [sp, #12] + cmp r12, #16 + bne vp9_convolve8_avg_vert_c + push {r4-r10, lr} ; adjust for taps sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r6, [sp, #44] ; y_step_q4 ldr r7, [sp, #40] ; filter_y - cmp r6, #16 - bne call_vert_c_convolve ; y_step_q4 != 16 - ldr r8, [sp, #48] ; w ldr r9, [sp, #52] ; h @@ -240,14 +231,11 @@ loop_vert vqrshrun.s32 d5, q15, #7 ; saturate - vqshrn.u16 d2, q1, #0 - vqshrn.u16 d3, q2, #0 + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 ; average the new value and the dst value - vaddl.u8 q8, d2, d6 - vaddl.u8 q9, d3, d7 - vqrshrn.u16 d2, q8, #1 - vqrshrn.u16 d3, q9, #1 + vrhadd.u8 q1, q1, q3 vst1.u32 {d2[0]}, [r2], r3 vst1.u32 {d2[1]}, [r2], r3 @@ -266,12 +254,5 @@ loop_vert pop {r4-r10, pc} -call_vert_c_convolve - pop {r4-r10, lr} - ; un-adjust for taps - add r0, r0, r1 - add r0, r0, r1, lsl #1 - b vp9_convolve8_avg_vert_c - ENDP END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm index 842c73c90..845e4a866 100644 --- a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm @@ -52,15 +52,15 @@ ; sp[]int h |vp9_convolve8_horiz_neon| PROC + ldr 
r12, [sp, #4] ; x_step_q4 + cmp r12, #16 + bne vp9_convolve8_horiz_c + push {r4-r10, lr} sub r0, r0, #3 ; adjust for taps - ldr r4, [sp, #36] ; x_step_q4 ldr r5, [sp, #32] ; filter_x - cmp r4, #16 - bne call_horiz_c_convolve ; x_step_q4 != 16 - ldr r6, [sp, #48] ; w ldr r7, [sp, #52] ; h @@ -82,22 +82,22 @@ mov r10, r6 ; w loop counter loop_horiz - vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]! - vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]! + vld1.8 {d24}, [r0]! vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 - vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]! - vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]! + vld1.8 {d25}, [r0]! vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 - vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]! - vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]! + vld1.8 {d26}, [r0]! vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 - vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]! - vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]! + vld1.8 {d27}, [r0]! vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 + vtrn.16 q12, q13 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + ; extract to s16 vmovl.u8 q8, d24 vmovl.u8 q9, d25 @@ -120,8 +120,8 @@ loop_horiz vqrshrun.s32 d5, q15, #7 ; saturate - vqshrn.u16 d2, q1, #0 - vqshrn.u16 d3, q2, #0 + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 ; transpose vtrn.16 d2, d3 @@ -145,26 +145,20 @@ loop_horiz pop {r4-r10, pc} -call_horiz_c_convolve - pop {r4-r10, lr} - add r0, r0, #3 ; un-adjust for taps - b vp9_convolve8_horiz_c - - ENDP |vp9_convolve8_vert_neon| PROC + ldr r12, [sp, #12] + cmp r12, #16 + bne vp9_convolve8_vert_c + push {r4-r10, lr} ; adjust for taps sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r6, [sp, #44] ; y_step_q4 ldr r7, [sp, #40] ; filter_y - cmp r6, #16 - bne call_vert_c_convolve ; y_step_q4 != 16 - ldr r8, [sp, #48] ; w ldr r9, [sp, #52] ; h @@ -219,8 +213,8 @@ loop_vert vqrshrun.s32 d5, q15, #7 ; saturate - vqshrn.u16 d2, q1, #0 - vqshrn.u16 d3, q2, #0 + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 vst1.u32 {d2[0]}, [r2], r3 vst1.u32 {d2[1]}, 
[r2], r3 @@ -239,12 +233,5 @@ loop_vert pop {r4-r10, pc} -call_vert_c_convolve - pop {r4-r10, lr} - ; un-adjust for taps - add r0, r0, r1 - add r0, r0, r1, lsl #1 - b vp9_convolve8_vert_c - ENDP END diff --git a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm new file mode 100644 index 000000000..edf5786e3 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm @@ -0,0 +1,618 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_mb_lpf_horizontal_edge_w_neon| + EXPORT |vp9_mb_lpf_vertical_edge_w_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh +; int count) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|vp9_mb_lpf_horizontal_edge_w_neon| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + ldr r12, [sp, #92] ; load count + +h_count + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines + + vld1.u8 {d0}, [r8@64], r1 ; p7 + vld1.u8 {d1}, [r8@64], r1 ; p6 + vld1.u8 {d2}, [r8@64], r1 ; p5 + vld1.u8 {d3}, [r8@64], r1 ; p4 + vld1.u8 {d4}, [r8@64], r1 ; p3 + vld1.u8 {d5}, [r8@64], r1 ; p2 + vld1.u8 {d6}, [r8@64], r1 ; p1 + vld1.u8 {d7}, [r8@64], r1 ; p0 + vld1.u8 {d8}, [r8@64], r1 ; q0 + vld1.u8 {d9}, [r8@64], r1 ; q1 + vld1.u8 {d10}, [r8@64], r1 ; q2 + vld1.u8 {d11}, [r8@64], r1 ; q3 + 
vld1.u8 {d12}, [r8@64], r1 ; q4 + vld1.u8 {d13}, [r8@64], r1 ; q5 + vld1.u8 {d14}, [r8@64], r1 ; q6 + vld1.u8 {d15}, [r8@64], r1 ; q7 + + bl vp9_wide_mbfilter_neon + + tst r7, #1 + beq h_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. + sub r8, r0, r1, lsl #1 + + vst1.u8 {d25}, [r8@64], r1 ; store op1 + vst1.u8 {d24}, [r8@64], r1 ; store op0 + vst1.u8 {d23}, [r8@64], r1 ; store oq0 + vst1.u8 {d26}, [r8@64], r1 ; store oq1 + + b h_next + +h_mbfilter + tst r7, #2 + beq h_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, r1, lsl #1 + sub r8, r8, r1 + + vst1.u8 {d18}, [r8@64], r1 ; store op2 + vst1.u8 {d19}, [r8@64], r1 ; store op1 + vst1.u8 {d20}, [r8@64], r1 ; store op0 + vst1.u8 {d21}, [r8@64], r1 ; store oq0 + vst1.u8 {d22}, [r8@64], r1 ; store oq1 + vst1.u8 {d23}, [r8@64], r1 ; store oq2 + + b h_next + +h_wide_mbfilter + sub r8, r0, r1, lsl #3 + add r8, r8, r1 + + vst1.u8 {d16}, [r8@64], r1 ; store op6 + vst1.u8 {d24}, [r8@64], r1 ; store op5 + vst1.u8 {d25}, [r8@64], r1 ; store op4 + vst1.u8 {d26}, [r8@64], r1 ; store op3 + vst1.u8 {d27}, [r8@64], r1 ; store op2 + vst1.u8 {d18}, [r8@64], r1 ; store op1 + vst1.u8 {d19}, [r8@64], r1 ; store op0 + vst1.u8 {d20}, [r8@64], r1 ; store oq0 + vst1.u8 {d21}, [r8@64], r1 ; store oq1 + vst1.u8 {d22}, [r8@64], r1 ; store oq2 + vst1.u8 {d23}, [r8@64], r1 ; store oq3 + vst1.u8 {d1}, [r8@64], r1 ; store oq4 + vst1.u8 {d2}, [r8@64], r1 ; store oq5 + vst1.u8 {d3}, [r8@64], r1 ; store oq6 + +h_next + add r0, r0, #8 + subs r12, r12, #1 + bne h_count + + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |vp9_mb_lpf_horizontal_edge_w_neon| + +; void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, 
+|vp9_mb_lpf_vertical_edge_w_neon| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8@64], r1 + vld1.8 {d8}, [r0@64], r1 + vld1.8 {d1}, [r8@64], r1 + vld1.8 {d9}, [r0@64], r1 + vld1.8 {d2}, [r8@64], r1 + vld1.8 {d10}, [r0@64], r1 + vld1.8 {d3}, [r8@64], r1 + vld1.8 {d11}, [r0@64], r1 + vld1.8 {d4}, [r8@64], r1 + vld1.8 {d12}, [r0@64], r1 + vld1.8 {d5}, [r8@64], r1 + vld1.8 {d13}, [r0@64], r1 + vld1.8 {d6}, [r8@64], r1 + vld1.8 {d14}, [r0@64], r1 + vld1.8 {d7}, [r8@64], r1 + vld1.8 {d15}, [r0@64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 + vtrn.8 d14, d15 + + bl vp9_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. + sub r8, r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 + + b v_end + +v_mbfilter + tst r7, #2 + beq v_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. 
+ sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_end + +v_wide_mbfilter + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8@64], r1 + vst1.8 {d20}, [r0@64], r1 + vst1.8 {d16}, [r8@64], r1 + vst1.8 {d21}, [r0@64], r1 + vst1.8 {d24}, [r8@64], r1 + vst1.8 {d22}, [r0@64], r1 + vst1.8 {d25}, [r8@64], r1 + vst1.8 {d23}, [r0@64], r1 + vst1.8 {d26}, [r8@64], r1 + vst1.8 {d1}, [r0@64], r1 + vst1.8 {d27}, [r8@64], r1 + vst1.8 {d2}, [r0@64], r1 + vst1.8 {d18}, [r8@64], r1 + vst1.8 {d3}, [r0@64], r1 + vst1.8 {d19}, [r8@64], r1 + vst1.8 {d15}, [r0@64], r1 + +v_end + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |vp9_mb_lpf_vertical_edge_w_neon| + +; void vp9_wide_mbfilter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. 
+; +; r0-r3 PRESERVE +; d16 blimit +; d17 limit +; d18 thresh +; d0 p7 +; d1 p6 +; d2 p5 +; d3 p4 +; d4 p3 +; d5 p2 +; d6 p1 +; d7 p0 +; d8 q0 +; d9 q1 +; d10 q2 +; d11 q3 +; d12 q4 +; d13 q5 +; d14 q6 +; d15 q7 +|vp9_wide_mbfilter_neon| PROC + mov r7, #0 + + ; filter_mask + vabd.u8 d19, d4, d5 ; abs(p3 - p2) + vabd.u8 d20, d5, d6 ; abs(p2 - p1) + vabd.u8 d21, d6, d7 ; abs(p1 - p0) + vabd.u8 d22, d9, d8 ; abs(q1 - q0) + vabd.u8 d23, d10, d9 ; abs(q2 - q1) + vabd.u8 d24, d11, d10 ; abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d7, d8 ; abs(p0 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; abs () > limit + vcge.u8 d19, d17, d19 + + ; flatmask4 + vabd.u8 d25, d7, d5 ; abs(p0 - p2) + vabd.u8 d26, d8, d10 ; abs(q0 - q2) + vabd.u8 d27, d4, d7 ; abs(p3 - p0) + vabd.u8 d28, d11, d8 ; abs(q3 - q0) + + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 + + vshr.u8 d23, d23, #1 ; a = a / 2 + vqadd.u8 d24, d24, d23 ; a = b + a + + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 + + vcge.u8 d20, d30, d20 ; flat + + vand d19, d19, d24 ; mask + + ; hevmask + vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 ; hev + + vand d16, d20, d19 ; flat && mask + vmov r5, r6, d16 + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #1 ; Only do filter branch + + ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 ; abs(p4 - p0) + vabd.u8 d23, d12, d8 ; abs(q4 - q0) + vabd.u8 d24, d7, d2 ; abs(p0 - p5) + vabd.u8 d25, 
d8, d13 ; abs(q0 - q5) + vabd.u8 d26, d1, d7 ; abs(p6 - p0) + vabd.u8 d27, d14, d8 ; abs(q6 - q0) + vabd.u8 d28, d0, d7 ; abs(p7 - p0) + vabd.u8 d29, d15, d8 ; abs(q7 - q0) + + ; only compare the largest value to thresh + vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) + + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 + + vcge.u8 d18, d30, d23 ; flat2 + + vmov.u8 d22, #0x80 + + vand d17, d18, d16 ; flat2 && flat && mask + vmov r5, r6, d17 + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + + ; mbfilter() function + + ; filter() function + ; convert to signed + veor d23, d8, d22 ; qs0 + veor d24, d7, d22 ; ps0 + veor d25, d6, d22 ; ps1 + veor d26, d9, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 ; ( qs0 - ps0) + + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + + vand d29, d29, d21 ; filter &= hev + + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d21 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d23, d23, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + tst r7, #1 + bxne lr + + ; mbfilter flat 
&& mask branch + ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's + ; and using vibt on the q's? + vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 + vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vqrshrn.u16 d18, q15, #3 ; r_op2 + + vsubw.u8 q15, d4 ; op1 = op2 - p3 + vsubw.u8 q15, d5 ; op1 -= p2 + vaddw.u8 q15, d6 ; op1 += p1 + vaddw.u8 q15, d9 ; op1 += q1 + vqrshrn.u16 d19, q15, #3 ; r_op1 + + vsubw.u8 q15, d4 ; op0 = op1 - p3 + vsubw.u8 q15, d6 ; op0 -= p1 + vaddw.u8 q15, d7 ; op0 += p0 + vaddw.u8 q15, d10 ; op0 += q2 + vqrshrn.u16 d20, q15, #3 ; r_op0 + + vsubw.u8 q15, d4 ; oq0 = op0 - p3 + vsubw.u8 q15, d7 ; oq0 -= p0 + vaddw.u8 q15, d8 ; oq0 += q0 + vaddw.u8 q15, d11 ; oq0 += q3 + vqrshrn.u16 d21, q15, #3 ; r_oq0 + + vsubw.u8 q15, d5 ; oq1 = oq0 - p2 + vsubw.u8 q15, d8 ; oq1 -= q0 + vaddw.u8 q15, d9 ; oq1 += q1 + vaddw.u8 q15, d11 ; oq1 += q3 + vqrshrn.u16 d22, q15, #3 ; r_oq1 + + vsubw.u8 q15, d6 ; oq2 = oq0 - p1 + vsubw.u8 q15, d9 ; oq2 -= q1 + vaddw.u8 q15, d10 ; oq2 += q2 + vaddw.u8 q15, d11 ; oq2 += q3 + vqrshrn.u16 d27, q15, #3 ; r_oq2 + + ; Filter does not set op2 or oq2, so use p2 and q2. 
+ vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + ; wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 + vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 + vmlal.u8 q15, d1, d29 ; op6 += p6 * 2 + vaddw.u8 q15, d2 ; op6 += p5 + vaddw.u8 q15, d3 ; op6 += p4 + vaddw.u8 q15, d4 ; op6 += p3 + vaddw.u8 q15, d5 ; op6 += p2 + vaddw.u8 q15, d6 ; op6 += p1 + vqrshrn.u16 d16, q15, #4 ; w_op6 + + vsubw.u8 q15, d0 ; op5 = op6 - p7 + vsubw.u8 q15, d1 ; op5 -= p6 + vaddw.u8 q15, d2 ; op5 += p5 + vaddw.u8 q15, d9 ; op5 += q1 + vqrshrn.u16 d24, q15, #4 ; w_op5 + + vsubw.u8 q15, d0 ; op4 = op5 - p7 + vsubw.u8 q15, d2 ; op4 -= p5 + vaddw.u8 q15, d3 ; op4 += p4 + vaddw.u8 q15, d10 ; op4 += q2 + vqrshrn.u16 d25, q15, #4 ; w_op4 + + vsubw.u8 q15, d0 ; op3 = op4 - p7 + vsubw.u8 q15, d3 ; op3 -= p4 + vaddw.u8 q15, d4 ; op3 += p3 + vaddw.u8 q15, d11 ; op3 += q3 + vqrshrn.u16 d26, q15, #4 ; w_op3 + + vsubw.u8 q15, d0 ; op2 = op3 - p7 + vsubw.u8 q15, d4 ; op2 -= p3 + vaddw.u8 q15, d5 ; op2 += p2 + vaddw.u8 q15, d12 ; op2 += q4 + vqrshrn.u16 d27, q15, #4 ; w_op2 + + vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) + + vsubw.u8 q15, d0 ; op1 = op2 - p7 + vsubw.u8 q15, d5 ; op1 -= p2 + vaddw.u8 q15, d6 ; op1 += p1 + vaddw.u8 q15, d13 ; op1 += q5 + vqrshrn.u16 d18, q15, #4 ; w_op1 + + vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) + + vsubw.u8 q15, d0 ; op0 = op1 - p7 + vsubw.u8 q15, d6 ; op0 -= p1 + vaddw.u8 q15, d7 ; op0 += p0 + vaddw.u8 q15, d14 ; op0 += q6 + vqrshrn.u16 d19, q15, #4 ; w_op0 + + vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) + + vsubw.u8 q15, d0 ; oq0 = op0 - p7 + vsubw.u8 q15, d7 
; oq0 -= p0 + vaddw.u8 q15, d8 ; oq0 += q0 + vaddw.u8 q15, d15 ; oq0 += q7 + vqrshrn.u16 d20, q15, #4 ; w_oq0 + + vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) + + vsubw.u8 q15, d1 ; oq1 = oq0 - p6 + vsubw.u8 q15, d8 ; oq1 -= q0 + vaddw.u8 q15, d9 ; oq1 += q1 + vaddw.u8 q15, d15 ; oq1 += q7 + vqrshrn.u16 d21, q15, #4 ; w_oq1 + + vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) + + vsubw.u8 q15, d2 ; oq2 = oq1 - p5 + vsubw.u8 q15, d9 ; oq2 -= q1 + vaddw.u8 q15, d10 ; oq2 += q2 + vaddw.u8 q15, d15 ; oq2 += q7 + vqrshrn.u16 d22, q15, #4 ; w_oq2 + + vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) + + vsubw.u8 q15, d3 ; oq3 = oq2 - p4 + vsubw.u8 q15, d10 ; oq3 -= q2 + vaddw.u8 q15, d11 ; oq3 += q3 + vaddw.u8 q15, d15 ; oq3 += q7 + vqrshrn.u16 d23, q15, #4 ; w_oq3 + + vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) + + vsubw.u8 q15, d4 ; oq4 = oq3 - p3 + vsubw.u8 q15, d11 ; oq4 -= q3 + vaddw.u8 q15, d12 ; oq4 += q4 + vaddw.u8 q15, d15 ; oq4 += q7 + vqrshrn.u16 d1, q15, #4 ; w_oq4 + + vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) + + vsubw.u8 q15, d5 ; oq5 = oq4 - p2 + vsubw.u8 q15, d12 ; oq5 -= q4 + vaddw.u8 q15, d13 ; oq5 += q5 + vaddw.u8 q15, d15 ; oq5 += q7 + vqrshrn.u16 d2, q15, #4 ; w_oq5 + + vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) + + vsubw.u8 q15, d6 ; oq6 = oq5 - p1 + vsubw.u8 q15, d13 ; oq6 -= q5 + vaddw.u8 q15, d14 ; oq6 += q6 + vaddw.u8 q15, d15 ; oq6 += q7 + vqrshrn.u16 d3, q15, #4 ; w_oq6 + + vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) + vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) + vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) + + bx lr + ENDP ; |vp9_wide_mbfilter_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm index 8e4aadac2..f82966577 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ 
b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm @@ -22,8 +22,8 @@ MACRO IDCT8x8_1D ; stage 1 - vdup.16 d0, r3; ; duplicate cospi_28_64 - vdup.16 d1, r4; ; duplicate cospi_4_64 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r4 ; duplicate cospi_4_64 ; input[1] * cospi_28_64 vmull.s16 q2, d18, d0 @@ -57,8 +57,8 @@ vqrshrn.s32 d14, q2, #14 ; >> 14 vqrshrn.s32 d15, q3, #14 ; >> 14 - vdup.16 d0, r5; ; duplicate cospi_12_64 - vdup.16 d1, r6; ; duplicate cospi_20_64 + vdup.16 d0, r5 ; duplicate cospi_12_64 + vdup.16 d1, r6 ; duplicate cospi_20_64 ; input[5] * cospi_12_64 vmull.s16 q2, d26, d0 @@ -93,7 +93,7 @@ vqrshrn.s32 d13, q1, #14 ; >> 14 ; stage 2 & stage 3 - even half - vdup.16 d0, r7; ; duplicate cospi_16_64 + vdup.16 d0, r7 ; duplicate cospi_16_64 ; input[0] * cospi_16_64 vmull.s16 q2, d16, d0 @@ -128,8 +128,8 @@ vqrshrn.s32 d23, q3, #14 ; >> 14 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vdup.16 d0, r8; ; duplicate cospi_24_64 - vdup.16 d1, r9; ; duplicate cospi_8_64 + vdup.16 d0, r8 ; duplicate cospi_24_64 + vdup.16 d1, r9 ; duplicate cospi_8_64 ; input[1] * cospi_24_64 vmull.s16 q2, d20, d0 @@ -176,7 +176,7 @@ vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] ; stage 3 -odd half - vdup.16 d16, r7; ; duplicate cospi_16_64 + vdup.16 d16, r7 ; duplicate cospi_16_64 ; step2[6] * cospi_16_64 vmull.s16 q9, d28, d16 @@ -211,14 +211,14 @@ vqrshrn.s32 d13, q10, #14 ; >> 14 ; stage 4 - vadd.s16 q8, q0, q7; ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6; ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5; ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4; ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4; ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5; ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6; ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7; ; output[7] = step1[0] - step1[7]; + vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; output[1] = 
step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; MEND ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. @@ -310,14 +310,14 @@ mov r0, r1 ; load destination data - vld1.u8 {d0}, [r1], r2 - vld1.u8 {d1}, [r1], r2 - vld1.s16 {d2}, [r1], r2 - vld1.s16 {d3}, [r1], r2 - vld1.s16 {d4}, [r1], r2 - vld1.s16 {d5}, [r1], r2 - vld1.s16 {d6}, [r1], r2 - vld1.s16 {d7}, [r1] + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1] ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] vaddw.u8 q8, q8, d0 diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h index 129711412..f68c5c6ea 100644 --- a/libvpx/vp9/common/vp9_blockd.h +++ b/libvpx/vp9/common/vp9_blockd.h @@ -26,9 +26,6 @@ #include "vp9/common/vp9_treecoder.h" #define BLOCK_SIZE_GROUPS 4 - -#define PREDICTION_PROBS 3 - #define MBSKIP_CONTEXTS 3 /* Segment Feature Masks */ @@ -164,6 +161,11 @@ typedef struct { union b_mode_info bmi[4]; } MODE_INFO; +static int is_inter_block(const MB_MODE_INFO *mbmi) { + return mbmi->ref_frame[0] > INTRA_FRAME; +} + + enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 @@ -286,22 +288,22 @@ typedef struct macroblockd { static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { switch (subsize) { - case BLOCK_SIZE_SB64X64: - case BLOCK_SIZE_SB64X32: - case BLOCK_SIZE_SB32X64: - case BLOCK_SIZE_SB32X32: + case BLOCK_64X64: + case BLOCK_64X32: + case BLOCK_32X64: + case BLOCK_32X32: return &xd->sb_index; - case BLOCK_SIZE_SB32X16: - case BLOCK_SIZE_SB16X32: - case 
BLOCK_SIZE_MB16X16: + case BLOCK_32X16: + case BLOCK_16X32: + case BLOCK_16X16: return &xd->mb_index; - case BLOCK_SIZE_SB16X8: - case BLOCK_SIZE_SB8X16: - case BLOCK_SIZE_SB8X8: + case BLOCK_16X8: + case BLOCK_8X16: + case BLOCK_8X8: return &xd->b_index; - case BLOCK_SIZE_SB8X4: - case BLOCK_SIZE_SB4X8: - case BLOCK_SIZE_AB4X4: + case BLOCK_8X4: + case BLOCK_4X8: + case BLOCK_4X4: return &xd->ab_index; default: assert(0); @@ -315,7 +317,7 @@ static INLINE void update_partition_context(MACROBLOCKD *xd, const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; const int bwl = b_width_log2(sb_type); const int bhl = b_height_log2(sb_type); - const int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl; + const int boffset = b_width_log2(BLOCK_64X64) - bsl; const char pcval0 = ~(0xe << boffset); const char pcval1 = ~(0xf << boffset); const char pcvalue[2] = {pcval0, pcval1}; @@ -333,7 +335,7 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE_TYPE sb_type) { int bsl = mi_width_log2(sb_type), bs = 1 << bsl; int above = 0, left = 0, i; - int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl; + int boffset = mi_width_log2(BLOCK_64X64) - bsl; assert(mi_width_log2(sb_type) == mi_height_log2(sb_type)); assert(bsl >= 0); @@ -366,10 +368,10 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, if (plane_type != PLANE_TYPE_Y_WITH_DC || xd->lossless || - mbmi->ref_frame[0] != INTRA_FRAME) + is_inter_block(mbmi)) return DCT_DCT; - return mode2txfm_map[mbmi->sb_type < BLOCK_SIZE_SB8X8 ? + return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ? mi->bmi[ib].as_mode : mbmi->mode]; } @@ -496,16 +498,16 @@ static INLINE void foreach_transformed_block_in_plane( // it to 4x4 block sizes. 
if (xd->mb_to_right_edge < 0) max_blocks_wide += - + (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x)); + (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x)); if (xd->mb_to_bottom_edge < 0) max_blocks_high += - + (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y)); + (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y)); i = 0; // Unlike the normal case - in here we have to keep track of the // row and column of the blocks we use so that we know if we are in - // the unrestricted motion border.. + // the unrestricted motion border. for (r = 0; r < (1 << sh); r += (1 << tx_size)) { for (c = 0; c < (1 << sw); c += (1 << tx_size)) { if (r < max_blocks_high && c < max_blocks_wide) @@ -563,8 +565,8 @@ static INLINE void foreach_predicted_block_in_plane( // size of the predictor to use. int pred_w, pred_h; - if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - assert(bsize == BLOCK_SIZE_SB8X8); + if (xd->mode_info_context->mbmi.sb_type < BLOCK_8X8) { + assert(bsize == BLOCK_8X8); pred_w = 0; pred_h = 0; } else { @@ -689,46 +691,39 @@ static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block, } } static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, - int plane, int ss_tx_size, int eob, int aoff, - int loff, ENTROPY_CONTEXT *A, - ENTROPY_CONTEXT *L) { - const int bw = b_width_log2(bsize), bh = b_height_log2(bsize); - const int sw = bw - xd->plane[plane].subsampling_x; - const int sh = bh - xd->plane[plane].subsampling_y; - int mi_blocks_wide = 1 << sw; - int mi_blocks_high = 1 << sh; - int tx_size_in_blocks = (1 << ss_tx_size); + int plane, int tx_size_in_blocks, + int eob, int aoff, int loff, + ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { + struct macroblockd_plane *pd = &xd->plane[plane]; int above_contexts = tx_size_in_blocks; int left_contexts = tx_size_in_blocks; + int mi_blocks_wide = 1 << plane_block_width_log2by4(bsize, pd); + int mi_blocks_high = 1 << 
plane_block_height_log2by4(bsize, pd); int pt; // xd->mb_to_right_edge is in units of pixels * 8. This converts // it to 4x4 block sizes. - if (xd->mb_to_right_edge < 0) { - mi_blocks_wide += (xd->mb_to_right_edge - >> (5 + xd->plane[plane].subsampling_x)); - } + if (xd->mb_to_right_edge < 0) + mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); // this code attempts to avoid copying into contexts that are outside // our border. Any blocks that do are set to 0... if (above_contexts + aoff > mi_blocks_wide) above_contexts = mi_blocks_wide - aoff; - if (xd->mb_to_bottom_edge < 0) { - mi_blocks_high += (xd->mb_to_bottom_edge - >> (5 + xd->plane[plane].subsampling_y)); - } - if (left_contexts + loff > mi_blocks_high) { + if (xd->mb_to_bottom_edge < 0) + mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + if (left_contexts + loff > mi_blocks_high) left_contexts = mi_blocks_high - loff; - } for (pt = 0; pt < above_contexts; pt++) A[pt] = eob > 0; - for (pt = above_contexts; pt < (1 << ss_tx_size); pt++) + for (pt = above_contexts; pt < tx_size_in_blocks; pt++) A[pt] = 0; for (pt = 0; pt < left_contexts; pt++) L[pt] = eob > 0; - for (pt = left_contexts; pt < (1 << ss_tx_size); pt++) + for (pt = left_contexts; pt < tx_size_in_blocks; pt++) L[pt] = 0; } diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c index dee44ec63..fdf37e46a 100644 --- a/libvpx/vp9/common/vp9_common_data.c +++ b/libvpx/vp9/common/vp9_common_data.c @@ -31,6 +31,14 @@ const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] = const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] = {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; +// MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize))) +const int size_group_lookup[BLOCK_SIZE_TYPES] = + {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3}; + +const int num_pels_log2_lookup[BLOCK_SIZE_TYPES] = + {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12}; + + const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { { 
// 4X4 // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 @@ -40,25 +48,25 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID }, { // 8X8 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID }, { // 16X16 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID }, { // 32X32 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID }, { // 64X64 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, @@ -68,29 +76,29 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = { { // PARTITION_NONE - BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, 
BLOCK_SIZE_SB8X4, - BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB16X8, - BLOCK_SIZE_MB16X16, BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB32X16, - BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64, BLOCK_SIZE_SB64X32, - BLOCK_SIZE_SB64X64, + BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, + BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, + BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, + BLOCK_64X64, }, { // PARTITION_HORZ BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB64X32, + BLOCK_8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_64X32, }, { // PARTITION_VERT BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB32X64, + BLOCK_4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_32X64, }, { // PARTITION_SPLIT BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_AB4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_MB16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB32X32, + BLOCK_4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_32X32, } }; @@ -108,14 +116,9 @@ const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = { }; const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = { - {BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8, - BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8}, - {BLOCK_SIZE_SB8X4, BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16, - 
BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB8X16}, - {BLOCK_SIZE_SB16X8, BLOCK_SIZE_SB16X8, BLOCK_SIZE_MB16X16, - BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB16X32}, - {BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16, - BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64}, - {BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32, - BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X64} + { BLOCK_4X4, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8 }, + { BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_8X16, BLOCK_8X16 }, + { BLOCK_16X8, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_16X32 }, + { BLOCK_32X16, BLOCK_32X16, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64 }, + { BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X64 } }; diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h index 8b0f8a500..bc8c01a77 100644 --- a/libvpx/vp9/common/vp9_common_data.h +++ b/libvpx/vp9/common/vp9_common_data.h @@ -21,10 +21,9 @@ extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES]; extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES]; extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES]; extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES]; -extern const PARTITION_TYPE - partition_lookup[][BLOCK_SIZE_TYPES]; - - +extern const int size_group_lookup[BLOCK_SIZE_TYPES]; +extern const int num_pels_log2_lookup[BLOCK_SIZE_TYPES]; +extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES]; extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES]; extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES]; extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES]; diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c index 0ad0dbccd..df3a9fed5 100644 --- a/libvpx/vp9/common/vp9_entropy.c +++ b/libvpx/vp9/common/vp9_entropy.c @@ -73,7 +73,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = { 13, 11, 14, 15, }; -DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]) = { +DECLARE_ALIGNED(16, const 
int16_t, vp9_default_scan_8x8[64]) = { 0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26, 33, 19, 40, 12, 34, 27, 5, 41, @@ -419,7 +419,7 @@ static void init_bit_trees() { init_bit_tree(cat6, 14); } -vp9_extra_bit vp9_extra_bits[12] = { +const vp9_extra_bit vp9_extra_bits[12] = { { 0, 0, 0, 0}, { 0, 0, 0, 1}, { 0, 0, 0, 2}, @@ -437,14 +437,10 @@ vp9_extra_bit vp9_extra_bits[12] = { #include "vp9/common/vp9_default_coef_probs.h" void vp9_default_coef_probs(VP9_COMMON *pc) { - vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4, - sizeof(pc->fc.coef_probs[TX_4X4])); - vpx_memcpy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8, - sizeof(pc->fc.coef_probs[TX_8X8])); - vpx_memcpy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16, - sizeof(pc->fc.coef_probs[TX_16X16])); - vpx_memcpy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32, - sizeof(pc->fc.coef_probs[TX_32X32])); + vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4); + vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8); + vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16); + vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32); } // Neighborhood 5-tuples for various scans and blocksizes, @@ -613,17 +609,17 @@ void vp9_coef_tree_initialize() { #define COEF_COUNT_SAT_AFTER_KEY 24 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128 -static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size, - int count_sat, int update_factor) { +static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, + unsigned int count_sat, + unsigned int update_factor) { FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; - vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size]; - vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[txfm_size]; - vp9_coeff_count_model *coef_counts = cm->counts.coef[txfm_size]; + vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size]; + vp9_coeff_probs_model *pre_coef_probs = 
pre_fc->coef_probs[tx_size]; + vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size]; unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = - cm->counts.eob_branch[txfm_size]; - int t, i, j, k, l, count; - int factor; + cm->counts.eob_branch[tx_size]; + int t, i, j, k, l; unsigned int branch_ct[UNCONSTRAINED_NODES][2]; vp9_prob coef_probs[UNCONSTRAINED_NODES]; int entropy_nodes_adapt = UNCONSTRAINED_NODES; @@ -634,29 +630,23 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size, for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { if (l >= 3 && k == 0) continue; - vp9_tree_probs_from_distribution( - vp9_coefmodel_tree, - coef_probs, branch_ct, - coef_counts[i][j][k][l], 0); + vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs, + branch_ct, coef_counts[i][j][k][l], + 0); branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); - for (t = 0; t < entropy_nodes_adapt; ++t) { - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > count_sat ? 
count_sat : count; - factor = (update_factor * count / count_sat); - dst_coef_probs[i][j][k][l][t] = - weighted_prob(pre_coef_probs[i][j][k][l][t], - coef_probs[t], factor); - } + for (t = 0; t < entropy_nodes_adapt; ++t) + dst_coef_probs[i][j][k][l][t] = merge_probs( + pre_coef_probs[i][j][k][l][t], coef_probs[t], + branch_ct[t], count_sat, update_factor); } } void vp9_adapt_coef_probs(VP9_COMMON *cm) { TX_SIZE t; - int count_sat; - int update_factor; /* denominator 256 */ + unsigned int count_sat, update_factor; - if ((cm->frame_type == KEY_FRAME) || cm->intra_only) { + if (cm->frame_type == KEY_FRAME || cm->intra_only) { update_factor = COEF_MAX_UPDATE_FACTOR_KEY; count_sat = COEF_COUNT_SAT_KEY; } else if (cm->last_frame_type == KEY_FRAME) { diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h index 4ea727ff4..861c0786c 100644 --- a/libvpx/vp9/common/vp9_entropy.h +++ b/libvpx/vp9/common/vp9_entropy.h @@ -50,7 +50,7 @@ typedef struct { int base_val; } vp9_extra_bit; -extern vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */ +extern const vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */ #define MAX_PROB 255 #define DCT_MAX_VALUE 16384 @@ -80,7 +80,6 @@ extern vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */ coefficient band (and since zigzag positions 0, 1, and 2 are in distinct bands). 
*/ -/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */ #define PREV_COEF_CONTEXTS 6 // #define ENTROPY_STATS @@ -102,7 +101,7 @@ extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]); extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]); extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]); -extern DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]); +extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]); extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]); extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]); @@ -119,7 +118,7 @@ extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); -extern DECLARE_ALIGNED(64, int16_t, vp9_default_iscan_8x8[64]); +extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]); extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index ca188e438..768e5f523 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -356,53 +356,15 @@ void vp9_entropy_mode_init() { vp9_inter_mode_tree, NEARESTMV); } -void vp9_accum_mv_refs(VP9_COMMON *pc, - MB_PREDICTION_MODE m, - const int context) { - unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] = - pc->counts.inter_mode; - - if (m == ZEROMV) { - ++inter_mode_counts[context][0][0]; - } else { - ++inter_mode_counts[context][0][1]; - if (m == NEARESTMV) { - ++inter_mode_counts[context][1][0]; - } else { - ++inter_mode_counts[context][1][1]; - if (m == NEARMV) { - ++inter_mode_counts[context][2][0]; - } else { - ++inter_mode_counts[context][2][1]; - } - } - } -} - #define COUNT_SAT 20 #define MAX_UPDATE_FACTOR 128 -static int update_ct(vp9_prob pre_prob, vp9_prob prob, - unsigned int ct[2]) { - 
const int count = MIN(ct[0] + ct[1], COUNT_SAT); - const int factor = MAX_UPDATE_FACTOR * count / COUNT_SAT; - return weighted_prob(pre_prob, prob, factor); +static int update_ct(vp9_prob pre_prob, vp9_prob prob, unsigned int ct[2]) { + return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); } static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) { - return update_ct(pre_prob, get_binary_prob(ct[0], ct[1]), ct); -} - -void vp9_adapt_mode_context(VP9_COMMON *pc) { - int i, j; - FRAME_CONTEXT *const fc = &pc->fc; - FRAME_CONTEXT *const pre_fc = &pc->frame_contexts[pc->frame_context_idx]; - FRAME_COUNTS *const counts = &pc->counts; - - for (j = 0; j < INTER_MODE_CONTEXTS; j++) - for (i = 0; i < VP9_INTER_MODES - 1; i++) - fc->inter_mode_probs[j][i] = update_ct2(pre_fc->inter_mode_probs[j][i], - counts->inter_mode[j][i]); + return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); } static void update_mode_probs(int n_modes, @@ -440,6 +402,11 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]); + for (i = 0; i < INTER_MODE_CONTEXTS; i++) + update_mode_probs(VP9_INTER_MODES, vp9_inter_mode_tree, + counts->inter_mode[i], pre_fc->inter_mode_probs[i], + fc->inter_mode_probs[i], NEARESTMV); + for (i = 0; i < BLOCK_SIZE_GROUPS; i++) update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, counts->y_mode[i], pre_fc->y_mode_prob[i], @@ -466,25 +433,25 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { if (cm->tx_mode == TX_MODE_SELECT) { int j; - unsigned int branch_ct_8x8p[TX_SIZE_MAX_SB - 3][2]; - unsigned int branch_ct_16x16p[TX_SIZE_MAX_SB - 2][2]; - unsigned int branch_ct_32x32p[TX_SIZE_MAX_SB - 1][2]; + unsigned int branch_ct_8x8p[TX_SIZES - 3][2]; + unsigned int branch_ct_16x16p[TX_SIZES - 2][2]; + unsigned int branch_ct_32x32p[TX_SIZES - 1][2]; for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p); - for 
(j = 0; j < TX_SIZE_MAX_SB - 3; ++j) + for (j = 0; j < TX_SIZES - 3; ++j) fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]); tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); - for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) + for (j = 0; j < TX_SIZES - 2; ++j) fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]); tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); - for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) + for (j = 0; j < TX_SIZES - 1; ++j) fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]); } @@ -495,22 +462,24 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { counts->mbskip[i]); } -static void set_default_lf_deltas(MACROBLOCKD *xd) { - xd->lf.mode_ref_delta_enabled = 1; - xd->lf.mode_ref_delta_update = 1; +static void set_default_lf_deltas(struct loopfilter *lf) { + lf->mode_ref_delta_enabled = 1; + lf->mode_ref_delta_update = 1; - xd->lf.ref_deltas[INTRA_FRAME] = 1; - xd->lf.ref_deltas[LAST_FRAME] = 0; - xd->lf.ref_deltas[GOLDEN_FRAME] = -1; - xd->lf.ref_deltas[ALTREF_FRAME] = -1; + lf->ref_deltas[INTRA_FRAME] = 1; + lf->ref_deltas[LAST_FRAME] = 0; + lf->ref_deltas[GOLDEN_FRAME] = -1; + lf->ref_deltas[ALTREF_FRAME] = -1; - xd->lf.mode_deltas[0] = 0; - xd->lf.mode_deltas[1] = 0; + lf->mode_deltas[0] = 0; + lf->mode_deltas[1] = 0; } void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { // Reset the segment feature data to the default stats: // Features disabled, 0, with delta coding (Default state). 
+ struct loopfilter *const lf = &xd->lf; + int i; vp9_clearall_segfeatures(&xd->seg); xd->seg.abs_delta = SEGMENT_DELTADATA; @@ -518,12 +487,12 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); // Reset the mode ref deltas for loop filter - vp9_zero(xd->lf.last_ref_deltas); - vp9_zero(xd->lf.last_mode_deltas); - set_default_lf_deltas(xd); + vp9_zero(lf->last_ref_deltas); + vp9_zero(lf->last_mode_deltas); + set_default_lf_deltas(lf); // To force update of the sharpness - xd->lf.last_sharpness_level = -1; + lf->last_sharpness_level = -1; vp9_default_coef_probs(cm); vp9_init_mbmode_probs(cm); diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h index 8c14e7e17..17a7c2634 100644 --- a/libvpx/vp9/common/vp9_entropymode.h +++ b/libvpx/vp9/common/vp9_entropymode.h @@ -24,15 +24,15 @@ struct VP9Common; struct tx_probs { - vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; - vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; - vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; + vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1]; + vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2]; + vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3]; }; struct tx_counts { - unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB]; - unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; - unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; + unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES]; + unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1]; + unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2]; }; extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; @@ -61,18 +61,12 @@ extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; void vp9_entropy_mode_init(); -int vp9_mv_cont(const int_mv *l, const int_mv *a); - void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd); void 
vp9_init_mbmode_probs(struct VP9Common *x); -void vp9_adapt_mode_context(struct VP9Common *pc); - void vp9_adapt_mode_probs(struct VP9Common *); -void vp9_accum_mv_refs(struct VP9Common *pc, MB_PREDICTION_MODE m, int context); - void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, unsigned int (*ct_32x32p)[2]); void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c index 343b6241d..6cfc34697 100644 --- a/libvpx/vp9/common/vp9_entropymv.c +++ b/libvpx/vp9/common/vp9_entropymv.c @@ -16,7 +16,7 @@ #define MV_MAX_UPDATE_FACTOR 128 /* Integer pel reference mv threshold for use of high-precision 1/8 mv */ -#define COMPANDED_MVREF_THRESH 8 +#define COMPANDED_MVREF_THRESH 8 const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = { -MV_JOINT_ZERO, 2, @@ -107,12 +107,6 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { return mv_class_base(c) + offset; } -static void inc_mv_component_count(int v, nmv_component_counts *comp_counts, - int incr) { - assert (v != 0); - comp_counts->mvcount[MV_MAX + v] += incr; -} - static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr, int usehp) { int s, z, c, o, d, e, f; @@ -164,25 +158,19 @@ static void counts_to_context(nmv_component_counts *mvcomp, int usehp) { } } -void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx) { +void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); - mvctx->joints[j]++; + ++counts->joints[j]; + if (mv_joint_vertical(j)) - inc_mv_component_count(mv->row, &mvctx->comps[0], 1); + ++counts->comps[0].mvcount[MV_MAX + mv->row]; if (mv_joint_horizontal(j)) - inc_mv_component_count(mv->col, &mvctx->comps[1], 1); + ++counts->comps[1].mvcount[MV_MAX + mv->col]; } -static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) { - const int count = MIN(ct[0] + ct[1], MV_COUNT_SAT); - if (count) { - const vp9_prob newp 
= get_binary_prob(ct[0], ct[1]); - const int factor = MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT; - *dest = weighted_prob(prep, newp, factor); - } else { - *dest = prep; - } +static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { + return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); } void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) { @@ -195,31 +183,22 @@ static unsigned int adapt_probs(unsigned int i, vp9_prob this_probs[], const vp9_prob last_probs[], const unsigned int num_events[]) { - vp9_prob this_prob; - const uint32_t left = tree[i] <= 0 + + const unsigned int left = tree[i] <= 0 ? num_events[-tree[i]] : adapt_probs(tree[i], tree, this_probs, last_probs, num_events); - const uint32_t right = tree[i + 1] <= 0 + const unsigned int right = tree[i + 1] <= 0 ? num_events[-tree[i + 1]] : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events); - - uint32_t weight = left + right; - if (weight) { - this_prob = get_binary_prob(left, right); - weight = weight > MV_COUNT_SAT ? 
MV_COUNT_SAT : weight; - this_prob = weighted_prob(last_probs[i >> 1], this_prob, - MV_MAX_UPDATE_FACTOR * weight / MV_COUNT_SAT); - } else { - this_prob = last_probs[i >> 1]; - } - this_probs[i >> 1] = this_prob; + const unsigned int ct[2] = { left, right }; + this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct); return left + right; } -void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) { +void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { int i, j; FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; @@ -228,36 +207,32 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) { nmv_context *pre_ctx = &pre_fc->nmvc; nmv_context_counts *cts = &cm->counts.mv; - vp9_counts_process(cts, usehp); + vp9_counts_process(cts, allow_hp); adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints); for (i = 0; i < 2; ++i) { - adapt_prob(&ctx->comps[i].sign, pre_ctx->comps[i].sign, cts->comps[i].sign); + ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign); adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes, pre_ctx->comps[i].classes, cts->comps[i].classes); adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0, pre_ctx->comps[i].class0, cts->comps[i].class0); for (j = 0; j < MV_OFFSET_BITS; ++j) - adapt_prob(&ctx->comps[i].bits[j], pre_ctx->comps[i].bits[j], - cts->comps[i].bits[j]); - } + ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j], + cts->comps[i].bits[j]); - for (i = 0; i < 2; ++i) { for (j = 0; j < CLASS0_SIZE; ++j) adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j], pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]); adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp, cts->comps[i].fp); - } - if (usehp) { - for (i = 0; i < 2; ++i) { - adapt_prob(&ctx->comps[i].class0_hp, pre_ctx->comps[i].class0_hp, - cts->comps[i].class0_hp); - adapt_prob(&ctx->comps[i].hp, pre_ctx->comps[i].hp, cts->comps[i].hp); + if (allow_hp) { + ctx->comps[i].class0_hp = 
adapt_prob(pre_ctx->comps[i].class0_hp, + cts->comps[i].class0_hp); + ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp); } } } diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h index 86f0d0bfd..3208b7270 100644 --- a/libvpx/vp9/common/vp9_enums.h +++ b/libvpx/vp9/common/vp9_enums.h @@ -54,7 +54,7 @@ typedef enum { TX_8X8 = 1, // 8x8 dct transform TX_16X16 = 2, // 16x16 dct transform TX_32X32 = 3, // 32x32 dct transform - TX_SIZE_MAX_SB, // Number of transforms available to SBs + TX_SIZES } TX_SIZE; typedef enum { @@ -63,7 +63,7 @@ typedef enum { ALLOW_16X16 = 2, ALLOW_32X32 = 3, TX_MODE_SELECT = 4, - NB_TXFM_MODES = 5, + TX_MODES = 5, } TX_MODE; typedef enum { diff --git a/libvpx/vp9/common/vp9_extend.c b/libvpx/vp9/common/vp9_extend.c index 95ec59061..d8496c4f2 100644 --- a/libvpx/vp9/common/vp9_extend.c +++ b/libvpx/vp9/common/vp9_extend.c @@ -8,9 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vp9/common/vp9_extend.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_extend.h" + static void copy_and_extend_plane(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch, int w, int h, @@ -107,14 +109,14 @@ void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, const int src_y_offset = srcy * src->y_stride + srcx; const int dst_y_offset = srcy * dst->y_stride + srcx; - const int et_uv = (et_y + 1) >> 1; - const int el_uv = (el_y + 1) >> 1; - const int eb_uv = (eb_y + 1) >> 1; - const int er_uv = (er_y + 1) >> 1; + const int et_uv = ROUND_POWER_OF_TWO(et_y, 1); + const int el_uv = ROUND_POWER_OF_TWO(el_y, 1); + const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1); + const int er_uv = ROUND_POWER_OF_TWO(er_y, 1); const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); - const int srch_uv = (srch + 1) >> 1; - const int srcw_uv = (srcw + 1) >> 
1; + const int srch_uv = ROUND_POWER_OF_TWO(srch, 1); + const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1); copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, dst->y_buffer + dst_y_offset, dst->y_stride, diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c index 643b229a6..3af8b8d21 100644 --- a/libvpx/vp9/common/vp9_findnearmv.c +++ b/libvpx/vp9/common/vp9_findnearmv.c @@ -14,8 +14,9 @@ #include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_sadmxn.h" -static void lower_mv_precision(int_mv *mv, int usehp) { - if (!usehp || !vp9_use_mv_hp(&mv->as_mv)) { +static void lower_mv_precision(int_mv *mv, int allow_hp) { + const int use_hp = allow_hp && vp9_use_mv_hp(&mv->as_mv); + if (!use_hp) { if (mv->as_mv.row & 1) mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1); if (mv->as_mv.col & 1) @@ -32,7 +33,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, // Make sure all the candidates are properly clamped etc for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv); - clamp_mv2(&mvlist[i], xd); + clamp_mv2(&mvlist[i].as_mv, xd); } *nearest = mvlist[0]; *near = mvlist[1]; @@ -41,7 +42,8 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int_mv *dst_nearest, int_mv *dst_near, - int block_idx, int ref_idx) { + int block_idx, int ref_idx, + int mi_row, int mi_col) { int_mv dst_list[MAX_MV_REF_CANDIDATES]; int_mv mv_list[MAX_MV_REF_CANDIDATES]; MODE_INFO *mi = xd->mode_info_context; @@ -53,7 +55,8 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context, xd->prev_mode_info_context, mbmi->ref_frame[ref_idx], - mv_list, cm->ref_frame_sign_bias, block_idx); + mv_list, cm->ref_frame_sign_bias, block_idx, + mi_row, mi_col); dst_list[1].as_int = 0; if (block_idx == 0) { diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h index 
b0fa505b5..e5221ed67 100644 --- a/libvpx/vp9/common/vp9_findnearmv.h +++ b/libvpx/vp9/common/vp9_findnearmv.h @@ -29,31 +29,19 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int_mv *near); // TODO(jingning): this mv clamping function should be block size dependent. -static void clamp_mv(int_mv *mv, - int mb_to_left_edge, - int mb_to_right_edge, - int mb_to_top_edge, - int mb_to_bottom_edge) { - mv->as_mv.col = clamp(mv->as_mv.col, mb_to_left_edge, mb_to_right_edge); - mv->as_mv.row = clamp(mv->as_mv.row, mb_to_top_edge, mb_to_bottom_edge); -} - -static int clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) { - int_mv tmp_mv; - tmp_mv.as_int = mv->as_int; - clamp_mv(mv, - xd->mb_to_left_edge - LEFT_TOP_MARGIN, - xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, - xd->mb_to_top_edge - LEFT_TOP_MARGIN, - xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); - return tmp_mv.as_int != mv->as_int; +static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc, MACROBLOCKD *xd, int_mv *dst_nearest, int_mv *dst_near, - int block_idx, int ref_idx); + int block_idx, int ref_idx, + int mi_row, int mi_col); static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { // FIXME(rbultje, jingning): temporary hack because jenkins doesn't @@ -62,7 +50,7 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { /* On L edge, get from MB to left of us */ --cur_mb; - if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) { + if (is_inter_block(&cur_mb->mbmi)) { return DC_PRED; } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { return ((cur_mb->bmi + 1 + b)->as_mode); @@ -80,7 +68,7 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, /* On top edge, get from MB above us */ cur_mb -= mi_stride; - if (cur_mb->mbmi.ref_frame[0] != 
INTRA_FRAME) { + if (is_inter_block(&cur_mb->mbmi)) { return DC_PRED; } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { return ((cur_mb->bmi + 2 + b)->as_mode); diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c index a95560a55..a2245259e 100644 --- a/libvpx/vp9/common/vp9_idct.c +++ b/libvpx/vp9/common/vp9_idct.c @@ -225,6 +225,19 @@ void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) { } } +void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { + int i, j; + int a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) + dest[i] = clip_pixel(dest[i] + a1); + dest += dest_stride; + } +} + static void iadst4_1d(int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; @@ -433,12 +446,6 @@ void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, } } -void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) { - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - output[0] = ROUND_POWER_OF_TWO(out, 5); -} - static void idct16_1d(int16_t *input, int16_t *output) { int16_t step1[16], step2[16]; int temp1, temp2; @@ -857,10 +864,18 @@ void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, } } -void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) { +void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, + int dest_stride) { + int i, j; + int a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); - output[0] = ROUND_POWER_OF_TWO(out, 6); + a1 = ROUND_POWER_OF_TWO(out, 6); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) + dest[i] = clip_pixel(dest[i] + a1); + dest += dest_stride; + } } static void idct32_1d(int16_t *input, int16_t *output) { @@ -1259,29 +1274,3 @@ void 
vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) { out = dct_const_round_shift(out * cospi_16_64); output[0] = ROUND_POWER_OF_TWO(out, 6); } - -void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t out[32 * 32] = { 0 }; - int16_t *outptr = out; - int i, j; - int16_t temp_in[32], temp_out[32]; - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. - for (i = 0; i < 4; ++i) { - idct32_1d(input, outptr); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; - idct32_1d(temp_in, temp_out); - for (j = 0; j < 32; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * dest_stride + i]); - } -} diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index 5498b1717..66df62753 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -16,6 +16,12 @@ #include "vp9/common/vp9_seg_common.h" +struct loop_filter_info { + const uint8_t *mblim; + const uint8_t *lim; + const uint8_t *hev_thr; +}; + static void lf_init_lut(loop_filter_info_n *lfi) { lfi->mode_lf_lut[DC_PRED] = 0; lfi->mode_lf_lut[D45_PRED] = 0; @@ -73,13 +79,14 @@ void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) { void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, int default_filt_lvl) { - int seg; + int seg_id; // n_shift is the a multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 const int n_shift = default_filt_lvl >> 5; loop_filter_info_n *const lfi = &cm->lf_info; - struct loopfilter *lf = &xd->lf; + struct loopfilter *const lf = &xd->lf; + struct segmentation *const seg = &xd->seg; // update limits if sharpness has changed if (lf->last_sharpness_level != 
lf->sharpness_level) { @@ -87,13 +94,13 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, lf->last_sharpness_level = lf->sharpness_level; } - for (seg = 0; seg < MAX_SEGMENTS; seg++) { + for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { int lvl_seg = default_filt_lvl, ref, mode, intra_lvl; // Set the baseline filter values for each segment - if (vp9_segfeature_active(&xd->seg, seg, SEG_LVL_ALT_LF)) { - const int data = vp9_get_segdata(&xd->seg, seg, SEG_LVL_ALT_LF); - lvl_seg = xd->seg.abs_delta == SEGMENT_ABSDATA + if (vp9_segfeature_active(&xd->seg, seg_id, SEG_LVL_ALT_LF)) { + const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF); + lvl_seg = seg->abs_delta == SEGMENT_ABSDATA ? data : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER); } @@ -101,18 +108,18 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, if (!lf->mode_ref_delta_enabled) { // we could get rid of this if we assume that deltas are set to // zero when not in use; encoder always uses deltas - vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4); + vpx_memset(lfi->lvl[seg_id][0], lvl_seg, 4 * 4); continue; } intra_lvl = lvl_seg + (lf->ref_deltas[INTRA_FRAME] << n_shift); - lfi->lvl[seg][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); + lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { const int inter_lvl = lvl_seg + (lf->ref_deltas[ref] << n_shift) + (lf->mode_deltas[mode] << n_shift); - lfi->lvl[seg][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); + lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); } } } @@ -256,7 +263,7 @@ static void filter_block_plane(VP9_COMMON *const cm, // Determine the vertical edges that need filtering for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const int skip_this = mi[c].mbmi.mb_skip_coeff - && mi[c].mbmi.ref_frame[0] != 
INTRA_FRAME; + && is_inter_block(&mi[c].mbmi); // left edge of current unit is block/partition edge -> no skip const int block_edge_left = b_width_log2(mi[c].mbmi.sb_type) ? !(c & ((1 << (b_width_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1; @@ -376,3 +383,11 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, vp9_loop_filter_rows(cm->frame_to_show, cm, xd, 0, cm->mi_rows, y_only); } + +int vp9_loop_filter_worker(void *arg1, void *arg2) { + LFWorkerData *const lf_data = (LFWorkerData*)arg1; + (void)arg2; + vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, &lf_data->xd, + lf_data->start, lf_data->stop, lf_data->y_only); + return 1; +} diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index e59cc6485..5fc909495 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -35,13 +35,6 @@ typedef struct { uint8_t mode_lf_lut[MB_MODE_COUNT]; } loop_filter_info_n; -struct loop_filter_info { - const uint8_t *mblim; - const uint8_t *lim; - const uint8_t *hev_thr; -}; - - /* assorted loopfilter functions which get used elsewhere */ struct VP9Common; struct macroblockd; @@ -64,4 +57,18 @@ void vp9_loop_filter_frame(struct VP9Common *cm, void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, struct VP9Common *cm, struct macroblockd *xd, int start, int stop, int y_only); + +typedef struct LoopFilterWorkerData { + const YV12_BUFFER_CONFIG *frame_buffer; + struct VP9Common *cm; + struct macroblockd xd; // TODO(jzern): most of this is unnecessary to the + // loopfilter. the planes are necessary as their state + // is changed during decode. + int start; + int stop; + int y_only; +} LFWorkerData; + +// Operates on the rows described by LFWorkerData passed as 'arg1'. 
+int vp9_loop_filter_worker(void *arg1, void *arg2); #endif // VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/libvpx/vp9/common/vp9_mv.h b/libvpx/vp9/common/vp9_mv.h index a095258be..31a79b984 100644 --- a/libvpx/vp9/common/vp9_mv.h +++ b/libvpx/vp9/common/vp9_mv.h @@ -13,6 +13,8 @@ #include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" + typedef struct { int16_t row; int16_t col; @@ -28,4 +30,10 @@ typedef struct { int32_t col; } MV32; +static void clamp_mv(MV *mv, int min_col, int max_col, + int min_row, int max_row) { + mv->col = clamp(mv->col, min_col, max_col); + mv->row = clamp(mv->row, min_row, max_row); +} + #endif // VP9_COMMON_VP9_MV_H_ diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c index ae009b0ff..3b72f41c2 100644 --- a/libvpx/vp9/common/vp9_mvref_common.c +++ b/libvpx/vp9/common/vp9_mvref_common.c @@ -11,6 +11,65 @@ #include "vp9/common/vp9_mvref_common.h" #define MVREF_NEIGHBOURS 8 + +typedef enum { + BOTH_ZERO = 0, + ZERO_PLUS_PREDICTED = 1, + BOTH_PREDICTED = 2, + NEW_PLUS_NON_INTRA = 3, + BOTH_NEW = 4, + INTRA_PLUS_NON_INTRA = 5, + BOTH_INTRA = 6, + INVALID_CASE = 9 +} motion_vector_context; + +// This is used to figure out a context for the ref blocks. The code flattens +// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by +// adding 9 for each intra block, 3 for each zero mv and 1 for each new +// motion vector. This single number is then converted into a context +// with a single lookup ( counter_to_context ). +static const int mode_2_counter[MB_MODE_COUNT] = { + 9, // DC_PRED + 9, // V_PRED + 9, // H_PRED + 9, // D45_PRED + 9, // D135_PRED + 9, // D117_PRED + 9, // D153_PRED + 9, // D27_PRED + 9, // D63_PRED + 9, // TM_PRED + 0, // NEARESTMV + 0, // NEARMV + 3, // ZEROMV + 1, // NEWMV +}; + +// There are 3^3 different combinations of 3 counts that can be either 0,1 or +// 2. However the actual count can never be greater than 2 so the highest +// counter we need is 18. 
9 is an invalid counter that's never used. +static const int counter_to_context[19] = { + BOTH_PREDICTED, // 0 + NEW_PLUS_NON_INTRA, // 1 + BOTH_NEW, // 2 + ZERO_PLUS_PREDICTED, // 3 + NEW_PLUS_NON_INTRA, // 4 + INVALID_CASE, // 5 + BOTH_ZERO, // 6 + INVALID_CASE, // 7 + INVALID_CASE, // 8 + INTRA_PLUS_NON_INTRA, // 9 + INTRA_PLUS_NON_INTRA, // 10 + INVALID_CASE, // 11 + INTRA_PLUS_NON_INTRA, // 12 + INVALID_CASE, // 13 + INVALID_CASE, // 14 + INVALID_CASE, // 15 + INVALID_CASE, // 16 + INVALID_CASE, // 17 + BOTH_INTRA // 18 +}; + static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = { // SB4X4 {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, @@ -39,263 +98,212 @@ static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = { // SB64X64 {{3, -1}, {-1, 3}, {4, -1}, {-1, 4}, {-1, -1}, {0, -1}, {-1, 0}, {6, -1}} }; + +static const int idx_n_column_to_subblock[4][2] = { + {1, 2}, + {1, 3}, + {3, 2}, + {3, 3} +}; + // clamp_mv_ref #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) { - mv->as_mv.col = clamp(mv->as_mv.col, xd->mb_to_left_edge - MV_BORDER, - xd->mb_to_right_edge + MV_BORDER); - mv->as_mv.row = clamp(mv->as_mv.row, xd->mb_to_top_edge - MV_BORDER, - xd->mb_to_bottom_edge + MV_BORDER); -} - -// Gets a candidate reference motion vector from the given mode info -// structure if one exists that matches the given reference frame. 
-static int get_matching_candidate(const MODE_INFO *candidate_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *c_mv, int block_idx) { - if (ref_frame == candidate_mi->mbmi.ref_frame[0]) { - if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) - c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[0].as_int; - else - c_mv->as_int = candidate_mi->mbmi.mv[0].as_int; - } else if (ref_frame == candidate_mi->mbmi.ref_frame[1]) { - if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) - c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[1].as_int; - else - c_mv->as_int = candidate_mi->mbmi.mv[1].as_int; - } else { - return 0; - } - - return 1; + clamp_mv(&mv->as_mv, xd->mb_to_left_edge - MV_BORDER, + xd->mb_to_right_edge + MV_BORDER, + xd->mb_to_top_edge - MV_BORDER, + xd->mb_to_bottom_edge + MV_BORDER); } -// Gets candidate reference motion vector(s) from the given mode info -// structure if they exists and do NOT match the given reference frame. -static void get_non_matching_candidates(const MODE_INFO *candidate_mi, - MV_REFERENCE_FRAME ref_frame, - MV_REFERENCE_FRAME *c_ref_frame, - int_mv *c_mv, - MV_REFERENCE_FRAME *c2_ref_frame, - int_mv *c2_mv) { - - c_mv->as_int = 0; - c2_mv->as_int = 0; - *c_ref_frame = INTRA_FRAME; - *c2_ref_frame = INTRA_FRAME; - - // If first candidate not valid neither will be. 
- if (candidate_mi->mbmi.ref_frame[0] > INTRA_FRAME) { - // First candidate - if (candidate_mi->mbmi.ref_frame[0] != ref_frame) { - *c_ref_frame = candidate_mi->mbmi.ref_frame[0]; - c_mv->as_int = candidate_mi->mbmi.mv[0].as_int; - } - - // Second candidate - if ((candidate_mi->mbmi.ref_frame[1] > INTRA_FRAME) && - (candidate_mi->mbmi.ref_frame[1] != ref_frame) && - (candidate_mi->mbmi.mv[1].as_int != candidate_mi->mbmi.mv[0].as_int)) { - *c2_ref_frame = candidate_mi->mbmi.ref_frame[1]; - c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int; - } - } +// This function returns either the appropriate sub block or block's mv +// on whether the block_size < 8x8 and we have check_sub_blocks set. +static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, + int check_sub_blocks, int which_mv, + int search_col, int block_idx) { + return (check_sub_blocks && candidate->mbmi.sb_type < BLOCK_SIZE_SB8X8 + ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]] + .as_mv[which_mv] + : candidate->mbmi.mv[which_mv]); } // Performs mv sign inversion if indicated by the reference frame combination. -static void scale_mv(MACROBLOCKD *xd, MV_REFERENCE_FRAME this_ref_frame, - MV_REFERENCE_FRAME candidate_ref_frame, - int_mv *candidate_mv, int *ref_sign_bias) { +static INLINE int_mv scale_mv(const MODE_INFO *candidate, const int which_mv, + const MV_REFERENCE_FRAME this_ref_frame, + const int *ref_sign_bias) { + int_mv return_mv = candidate->mbmi.mv[which_mv]; // Sign inversion where appropriate. - if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) { - candidate_mv->as_mv.row = -candidate_mv->as_mv.row; - candidate_mv->as_mv.col = -candidate_mv->as_mv.col; + if (ref_sign_bias[candidate->mbmi.ref_frame[which_mv]] != + ref_sign_bias[this_ref_frame]) { + return_mv.as_mv.row *= -1; + return_mv.as_mv.col *= -1; } + return return_mv; } -// Add a candidate mv. -// Discard if it has already been seen. 
-static void add_candidate_mv(int_mv *mv_list, int *mv_scores, - int *candidate_count, int_mv candidate_mv, - int weight) { - if (*candidate_count == 0) { - mv_list[0].as_int = candidate_mv.as_int; - mv_scores[0] = weight; - *candidate_count += 1; - } else if ((*candidate_count == 1) && - (candidate_mv.as_int != mv_list[0].as_int)) { - mv_list[1].as_int = candidate_mv.as_int; - mv_scores[1] = weight; - *candidate_count += 1; +// This macro is used to add a motion vector mv_ref list if it isn't +// already in the list. If it's the second motion vector it will also +// skip all additional processing and jump to done! +#define ADD_MV_REF_LIST(MV) \ + if (refmv_count) { \ + if ((MV).as_int != mv_ref_list[0].as_int) { \ + mv_ref_list[refmv_count] = (MV); \ + goto Done; \ + } \ + } else { \ + mv_ref_list[refmv_count++] = (MV); \ + } + +// If either reference frame is different, not INTRA, and they +// are different from each other scale and add the mv to our list. +#define IF_DIFF_REF_FRAME_ADD_MV(CANDIDATE) \ + if ((CANDIDATE)->mbmi.ref_frame[0] != ref_frame) { \ + ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \ + } \ + if ((CANDIDATE)->mbmi.ref_frame[1] != ref_frame && \ + (CANDIDATE)->mbmi.ref_frame[1] > INTRA_FRAME && \ + (CANDIDATE)->mbmi.mv[1].as_int != (CANDIDATE)->mbmi.mv[0].as_int) { \ + ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \ } + +// Checks that the given mi_row, mi_col and search point +// are inside the borders of the tile. +static INLINE int is_inside(const int mi_col, const int mi_row, + const int cur_tile_mi_col_start, + const int cur_tile_mi_col_end, const int mi_rows, + const int (*mv_ref_search)[2], int idx) { + int mi_search_col; + const int mi_search_row = mi_row + mv_ref_search[idx][1];; + + // Check that the candidate is within the border. 
We only need to check + // the left side because all the positive right side ones are for blocks that + // are large enough to support the + value they have within their border. + if (mi_search_row < 0) + return 0; + + mi_search_col = mi_col + mv_ref_search[idx][0]; + if (mi_search_col < cur_tile_mi_col_start) + return 0; + + return 1; } // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. -// void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, - MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, int *ref_sign_bias, - int block_idx) { - int i; - MODE_INFO *candidate_mi; - MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi; - int_mv c_refmv; - int_mv c2_refmv; - MV_REFERENCE_FRAME c_ref_frame; - MV_REFERENCE_FRAME c2_ref_frame; - int candidate_scores[MAX_MV_REF_CANDIDATES] = { 0 }; + const MODE_INFO *lf_here, + const MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, const int *ref_sign_bias, + const int block_idx, + const int mi_row, const int mi_col) { + int idx; + MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; int refmv_count = 0; const int (*mv_ref_search)[2] = mv_ref_blocks[mbmi->sb_type]; - const int mi_col = get_mi_col(xd); - const int mi_row = get_mi_row(xd); - int intra_count = 0; - int zero_count = 0; - int newmv_count = 0; - int x_idx = 0, y_idx = 0; - - // Blank the reference vector lists and other local structures. 
- vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES); - - if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { - x_idx = block_idx & 1; - y_idx = block_idx >> 1; - } - - // We first scan for candidate vectors that match the current reference frame - // Look at nearest neigbours - for (i = 0; i < 2; ++i) { - const int mi_search_col = mi_col + mv_ref_search[i][0]; - const int mi_search_row = mi_row + mv_ref_search[i][1]; - if ((mi_search_col >= cm->cur_tile_mi_col_start) && - (mi_search_col < cm->cur_tile_mi_col_end) && - (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) { - int b; - - candidate_mi = here + mv_ref_search[i][0] + - (mv_ref_search[i][1] * xd->mode_info_stride); - - if (block_idx >= 0) { - if (mv_ref_search[i][0]) - b = 1 + y_idx * 2; - else - b = 2 + x_idx; - } else { - b = -1; - } - if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, b)) { - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c_refmv, 16); + const MODE_INFO *candidate; + const int check_sub_blocks = block_idx >= 0; + int different_ref_found = 0; + int context_counter = 0; + + // Blank the reference vector list + vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); + + // The nearest 2 blocks are treated differently + // if the size < 8x8 we get the mv from the bmi substructure, + // and we also need to keep a mode count. + for (idx = 0; idx < 2; ++idx) { + if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, + cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) + continue; + + candidate = here + mv_ref_search[idx][0] + + mv_ref_search[idx][1] * xd->mode_info_stride; + + // Keep counts for entropy encoding. + context_counter += mode_2_counter[candidate->mbmi.mode]; + + // Check if the candidate comes from the same reference frame. 
+ if (candidate->mbmi.ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 0, + mv_ref_search[idx][0], block_idx)); + different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame; + } else { + different_ref_found = 1; + if (candidate->mbmi.ref_frame[1] == ref_frame) { + // Add second motion vector if it has the same ref_frame. + ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 1, + mv_ref_search[idx][0], block_idx)); } - - // Count number of neihgbours coded intra and zeromv - intra_count += (candidate_mi->mbmi.mode < NEARESTMV); - zero_count += (candidate_mi->mbmi.mode == ZEROMV); - newmv_count += (candidate_mi->mbmi.mode >= NEWMV); } } - // More distant neigbours - for (i = 2; (i < MVREF_NEIGHBOURS) && - (refmv_count < MAX_MV_REF_CANDIDATES); ++i) { - const int mi_search_col = mi_col + mv_ref_search[i][0]; - const int mi_search_row = mi_row + mv_ref_search[i][1]; - if ((mi_search_col >= cm->cur_tile_mi_col_start) && - (mi_search_col < cm->cur_tile_mi_col_end) && - (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) { - candidate_mi = here + mv_ref_search[i][0] + - (mv_ref_search[i][1] * xd->mode_info_stride); - - if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) { - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c_refmv, 16); + // Check the rest of the neighbors in much the same way + // as before except we don't need to keep track of sub blocks or + // mode counts. 
+ for (; idx < MVREF_NEIGHBOURS; ++idx) { + if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, + cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) + continue; + + candidate = here + mv_ref_search[idx][0] + + mv_ref_search[idx][1] * xd->mode_info_stride; + + if (candidate->mbmi.ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(candidate->mbmi.mv[0]); + different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame; + } else { + different_ref_found = 1; + if (candidate->mbmi.ref_frame[1] == ref_frame) { + ADD_MV_REF_LIST(candidate->mbmi.mv[1]); } } } - // Look in the last frame if it exists - if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) { - candidate_mi = lf_here; - if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) { - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c_refmv, 16); + // Check the last frame's mode and mv info. + if (lf_here != NULL) { + if (lf_here->mbmi.ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(lf_here->mbmi.mv[0]); + } else if (lf_here->mbmi.ref_frame[1] == ref_frame) { + ADD_MV_REF_LIST(lf_here->mbmi.mv[1]); } } - // If we have not found enough candidates consider ones where the - // reference frame does not match. Break out when we have - // MAX_MV_REF_CANDIDATES candidates. 
- // Look first at spatial neighbours - for (i = 0; (i < MVREF_NEIGHBOURS) && - (refmv_count < MAX_MV_REF_CANDIDATES); ++i) { - const int mi_search_col = mi_col + mv_ref_search[i][0]; - const int mi_search_row = mi_row + mv_ref_search[i][1]; - if ((mi_search_col >= cm->cur_tile_mi_col_start) && - (mi_search_col < cm->cur_tile_mi_col_end) && - (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) { - candidate_mi = here + mv_ref_search[i][0] + - (mv_ref_search[i][1] * xd->mode_info_stride); - - get_non_matching_candidates(candidate_mi, ref_frame, - &c_ref_frame, &c_refmv, - &c2_ref_frame, &c2_refmv); - - if (c_ref_frame != INTRA_FRAME) { - scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias); - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c_refmv, 1); - } + // Since we couldn't find 2 mvs from the same reference frame + // go back through the neighbors and find motion vectors from + // different reference frames. + if (different_ref_found) { + for (idx = 0; idx < MVREF_NEIGHBOURS; ++idx) { + if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, + cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) + continue; - if (c2_ref_frame != INTRA_FRAME) { - scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias); - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c2_refmv, 1); - } - } - } + candidate = here + mv_ref_search[idx][0] + + mv_ref_search[idx][1] * xd->mode_info_stride; - // Look at the last frame if it exists - if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) { - candidate_mi = lf_here; - get_non_matching_candidates(candidate_mi, ref_frame, - &c_ref_frame, &c_refmv, - &c2_ref_frame, &c2_refmv); - - if (c_ref_frame != INTRA_FRAME) { - scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias); - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c_refmv, 1); - } + // If the candidate is INTRA we don't want to consider its mv. 
+ if (candidate->mbmi.ref_frame[0] == INTRA_FRAME) + continue; - if (c2_ref_frame != INTRA_FRAME) { - scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias); - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c2_refmv, 1); + IF_DIFF_REF_FRAME_ADD_MV(candidate); } } - if (!intra_count) { - if (!newmv_count) { - // 0 = both zero mv - // 1 = one zero mv + one a predicted mv - // 2 = two predicted mvs - mbmi->mb_mode_context[ref_frame] = 2 - zero_count; - } else { - // 3 = one predicted/zero and one new mv - // 4 = two new mvs - mbmi->mb_mode_context[ref_frame] = 2 + newmv_count; - } - } else { - // 5 = one intra neighbour + x - // 6 = two intra neighbours - mbmi->mb_mode_context[ref_frame] = 4 + intra_count; + // Since we still don't have a candidate we'll try the last frame. + if (lf_here != NULL && lf_here->mbmi.ref_frame[0] != INTRA_FRAME) { + IF_DIFF_REF_FRAME_ADD_MV(lf_here); } + Done: + + mbmi->mb_mode_context[ref_frame] = counter_to_context[context_counter]; + // Clamp vectors - for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { - clamp_mv_ref(xd, &mv_ref_list[i]); + for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) { + clamp_mv_ref(xd, &mv_ref_list[idx]); } } + +#undef ADD_MV_REF_LIST +#undef IF_DIFF_REF_FRAME_ADD_MV diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h index 7290f10ab..c5f89eb57 100644 --- a/libvpx/vp9/common/vp9_mvref_common.h +++ b/libvpx/vp9/common/vp9_mvref_common.h @@ -17,11 +17,13 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, - MODE_INFO *lf_here, - MV_REFERENCE_FRAME ref_frame, + const MODE_INFO *lf_here, + const MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, - int *ref_sign_bias, - int block_idx); + const int *ref_sign_bias, + const int block_idx, + const int mi_row, + const int mi_col); static INLINE void vp9_find_mv_refs(VP9_COMMON *cm, MACROBLOCKD *xd, @@ -29,9 +31,10 @@ static INLINE void vp9_find_mv_refs(VP9_COMMON *cm, MODE_INFO *lf_here, 
MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, - int *ref_sign_bias) { + int *ref_sign_bias, + int mi_row, int mi_col) { vp9_find_mv_refs_idx(cm, xd, here, lf_here, ref_frame, - mv_ref_list, ref_sign_bias, -1); + mv_ref_list, ref_sign_bias, -1, mi_row, mi_col); } #endif // VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index f31f24b26..152a93293 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -42,7 +42,7 @@ typedef struct frame_contexts { vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] [PARTITION_TYPES - 1]; - vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES]; + vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] [VP9_SWITCHABLE_FILTERS - 1]; vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; @@ -59,12 +59,12 @@ typedef struct { unsigned int y_mode[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES]; unsigned int uv_mode[VP9_INTRA_MODES][VP9_INTRA_MODES]; unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; - vp9_coeff_count_model coef[TX_SIZE_MAX_SB][BLOCK_TYPES]; - unsigned int eob_branch[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES] + vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES]; + unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES] [COEF_BANDS][PREV_COEF_CONTEXTS]; unsigned int switchable_interp[VP9_SWITCHABLE_FILTERS + 1] [VP9_SWITCHABLE_FILTERS]; - unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; + unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES]; unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; unsigned int single_ref[REF_CONTEXTS][2][2]; @@ -240,8 +240,7 @@ static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd, xd->left_seg_context = cm->left_seg_context + (mi_row & 
MI_MASK); } -static int check_bsize_coverage(VP9_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, +static int check_bsize_coverage(VP9_COMMON *cm, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { int bsl = mi_width_log2(bsize), bs = 1 << bsl; int ms = bs / 2; @@ -278,14 +277,6 @@ static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd, xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end); } -static int get_mi_row(const MACROBLOCKD *xd) { - return ((-xd->mb_to_top_edge) >> (3 + LOG2_MI_SIZE)); -} - -static int get_mi_col(const MACROBLOCKD *xd) { - return ((-xd->mb_to_left_edge) >> (3 + LOG2_MI_SIZE)); -} - static int get_token_alloc(int mb_rows, int mb_cols) { return mb_rows * mb_cols * (48 * 16 + 4); } diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c index e8bcdea82..795962a71 100644 --- a/libvpx/vp9/common/vp9_pred_common.c +++ b/libvpx/vp9/common/vp9_pred_common.c @@ -55,34 +55,28 @@ unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { } // Returns a context number for the given MB prediction signal unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) { - int pred_context; const MODE_INFO *const mi = xd->mode_info_context; const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; const int left_in_image = xd->left_available && left_mbmi->mb_in_image; const int above_in_image = xd->up_available && above_mbmi->mb_in_image; - // Note: - // The mode info data structure has a one element border above and to the - // left of the entries correpsonding to real macroblocks. - // The prediction flags in these dummy entries are initialised to 0. 
- if (above_in_image && left_in_image) { // both edges available - if (left_mbmi->ref_frame[0] == INTRA_FRAME && - above_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/intra (3) - pred_context = 3; - } else { // intra/inter (1) or inter/inter (0) - pred_context = left_mbmi->ref_frame[0] == INTRA_FRAME || - above_mbmi->ref_frame[0] == INTRA_FRAME; - } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + const int left_intra = !is_inter_block(left_mbmi); + const int above_intra = !is_inter_block(above_mbmi); - // inter: 0, intra: 2 - pred_context = 2 * (edge_mbmi->ref_frame[0] == INTRA_FRAME); - } else { - pred_context = 0; - } - assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS); - return pred_context; + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + // 0 - inter/inter, inter/--, --/inter, --/-- + // 1 - intra/inter, inter/intra + // 2 - intra/--, --/intra + // 3 - intra/intra + if (above_in_image && left_in_image) // both edges available + return left_intra && above_intra ? 3 + : left_intra || above_intra; + else if (above_in_image || left_in_image) // one edge available + return 2 * (above_in_image ? 
above_intra : left_intra); + else + return 0; } // Returns a context number for the given MB prediction signal unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h index e4b6575e3..238290b41 100644 --- a/libvpx/vp9/common/vp9_pred_common.h +++ b/libvpx/vp9/common/vp9_pred_common.h @@ -110,9 +110,9 @@ unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd); static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context, const struct tx_probs *tx_probs) { - if (bsize < BLOCK_SIZE_MB16X16) + if (bsize < BLOCK_16X16) return tx_probs->p8x8[context]; - else if (bsize < BLOCK_SIZE_SB32X32) + else if (bsize < BLOCK_32X32) return tx_probs->p16x16[context]; else return tx_probs->p32x32[context]; @@ -127,9 +127,9 @@ static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd, static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context, TX_SIZE tx_size, struct tx_counts *tx_counts) { - if (bsize >= BLOCK_SIZE_SB32X32) + if (bsize >= BLOCK_32X32) tx_counts->p32x32[context][tx_size]++; - else if (bsize >= BLOCK_SIZE_MB16X16) + else if (bsize >= BLOCK_16X16) tx_counts->p16x16[context][tx_size]++; else tx_counts->p8x8[context][tx_size]++; diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c index 63e5646ad..0b65e0610 100644 --- a/libvpx/vp9/common/vp9_reconinter.c +++ b/libvpx/vp9/common/vp9_reconinter.c @@ -197,14 +197,14 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd, void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const int_mv *src_mv, + const MV *src_mv, const struct scale_factors *scale, int w, int h, int weight, const struct subpix_fn_table *subpix, enum mv_precision precision) { const MV32 mv = precision == MV_PRECISION_Q4 - ? scale->scale_mv_q4(&src_mv->as_mv, scale) - : scale->scale_mv_q3_to_q4(&src_mv->as_mv, scale); + ? 
scale->scale_mv_q4(src_mv, scale) + : scale->scale_mv_q3_to_q4(src_mv, scale); const int subpel_x = mv.col & 15; const int subpel_y = mv.row & 15; @@ -220,45 +220,44 @@ static INLINE int round_mv_comp_q4(int value) { return (value < 0 ? value - 2 : value + 2) / 4; } -static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int idx) { - const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.row + - mb->mode_info_context->bmi[1].as_mv[idx].as_mv.row + - mb->mode_info_context->bmi[2].as_mv[idx].as_mv.row + - mb->mode_info_context->bmi[3].as_mv[idx].as_mv.row; - return round_mv_comp_q4(temp); +static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) { + MV res = { round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.row + + mi->bmi[1].as_mv[idx].as_mv.row + + mi->bmi[2].as_mv[idx].as_mv.row + + mi->bmi[3].as_mv[idx].as_mv.row), + round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.col + + mi->bmi[1].as_mv[idx].as_mv.col + + mi->bmi[2].as_mv[idx].as_mv.col + + mi->bmi[3].as_mv[idx].as_mv.col) }; + return res; } -static int mi_mv_pred_col_q4(MACROBLOCKD *mb, int idx) { - const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.col + - mb->mode_info_context->bmi[1].as_mv[idx].as_mv.col + - mb->mode_info_context->bmi[2].as_mv[idx].as_mv.col + - mb->mode_info_context->bmi[3].as_mv[idx].as_mv.col; - return round_mv_comp_q4(temp); -} + // TODO(jkoleszar): yet another mv clamping function :-( MV clamp_mv_to_umv_border_sb(const MV *src_mv, int bwl, int bhl, int ss_x, int ss_y, int mb_to_left_edge, int mb_to_top_edge, int mb_to_right_edge, int mb_to_bottom_edge) { - /* If the MV points so far into the UMV border that no visible pixels - * are used for reconstruction, the subpel part of the MV can be - * discarded and the MV limited to 16 pixels with equivalent results. - */ + // If the MV points so far into the UMV border that no visible pixels + // are used for reconstruction, the subpel part of the MV can be + // discarded and the MV limited to 16 pixels with equivalent results. 
const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4; const int spel_right = spel_left - (1 << 4); const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4; const int spel_bottom = spel_top - (1 << 4); - MV clamped_mv; - + MV clamped_mv = { + src_mv->row << (1 - ss_y), + src_mv->col << (1 - ss_x) + }; assert(ss_x <= 1); assert(ss_y <= 1); - clamped_mv.col = clamp(src_mv->col << (1 - ss_x), - (mb_to_left_edge << (1 - ss_x)) - spel_left, - (mb_to_right_edge << (1 - ss_x)) + spel_right); - clamped_mv.row = clamp(src_mv->row << (1 - ss_y), - (mb_to_top_edge << (1 - ss_y)) - spel_top, - (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom); + + clamp_mv(&clamped_mv, (mb_to_left_edge << (1 - ss_x)) - spel_left, + (mb_to_right_edge << (1 - ss_x)) + spel_right, + (mb_to_top_edge << (1 - ss_y)) - spel_top, + (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom); + return clamped_mv; } @@ -280,15 +279,14 @@ static void build_inter_predictors(int plane, int block, const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl); - const int use_second_ref = xd->mode_info_context->mbmi.ref_frame[1] > 0; + const MODE_INFO *const mi = xd->mode_info_context; + const int use_second_ref = mi->mbmi.ref_frame[1] > 0; int which_mv; assert(x < (4 << bwl)); assert(y < (4 << bhl)); - assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 || - 4 << pred_w == (4 << bwl)); - assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 || - 4 << pred_h == (4 << bhl)); + assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_w == (4 << bwl)); + assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_h == (4 << bhl)); for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) { // source @@ -301,44 +299,30 @@ static void build_inter_predictors(int plane, int block, // dest uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + 
x; - // motion vector - const MV *mv; - MV split_chroma_mv; - int_mv clamped_mv; - - if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - if (plane == 0) { - mv = &xd->mode_info_context->bmi[block].as_mv[which_mv].as_mv; - } else { - // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the - // same MV (the average of the 4 luma MVs) but we could do something - // smarter for non-4:2:0. Just punt for now, pending the changes to get - // rid of SPLITMV mode entirely. - split_chroma_mv.row = mi_mv_pred_row_q4(xd, which_mv); - split_chroma_mv.col = mi_mv_pred_col_q4(xd, which_mv); - mv = &split_chroma_mv; - } - } else { - mv = &xd->mode_info_context->mbmi.mv[which_mv].as_mv; - } - - /* TODO(jkoleszar): This clamping is done in the incorrect place for the - * scaling case. It needs to be done on the scaled MV, not the pre-scaling - * MV. Note however that it performs the subsampling aware scaling so - * that the result is always q4. - */ - clamped_mv.as_mv = clamp_mv_to_umv_border_sb(mv, bwl, bhl, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y, - xd->mb_to_left_edge, - xd->mb_to_top_edge, - xd->mb_to_right_edge, - xd->mb_to_bottom_edge); + // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the + // same MV (the average of the 4 luma MVs) but we could do something + // smarter for non-4:2:0. Just punt for now, pending the changes to get + // rid of SPLITMV mode entirely. + const MV mv = mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 + ? (plane == 0 ? mi->bmi[block].as_mv[which_mv].as_mv + : mi_mv_pred_q4(mi, which_mv)) + : mi->mbmi.mv[which_mv].as_mv; + + // TODO(jkoleszar): This clamping is done in the incorrect place for the + // scaling case. It needs to be done on the scaled MV, not the pre-scaling + // MV. Note however that it performs the subsampling aware scaling so + // that the result is always q4. 
+ const MV res_mv = clamp_mv_to_umv_border_sb(&mv, bwl, bhl, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y, + xd->mb_to_left_edge, + xd->mb_to_top_edge, + xd->mb_to_right_edge, + xd->mb_to_bottom_edge); scale->set_scaled_offsets(scale, arg->y + y, arg->x + x); - vp9_build_inter_predictor(pre, pre_stride, dst, arg->dst_stride[plane], - &clamped_mv, &xd->scale_factor[which_mv], + &res_mv, &xd->scale_factor[which_mv], 4 << pred_w, 4 << pred_h, which_mv, &xd->subpix, MV_PRECISION_Q4); } @@ -400,7 +384,7 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { const int ref = cm->active_ref_idx[i]; struct scale_factors *const sf = &cm->active_ref_scale[i]; if (ref >= NUM_YV12_BUFFERS) { - memset(sf, 0, sizeof(*sf)); + vp9_zero(*sf); } else { YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref]; vp9_setup_scale_factors_for_frame(sf, diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h index e37750dea..6ec7323e1 100644 --- a/libvpx/vp9/common/vp9_reconinter.h +++ b/libvpx/vp9/common/vp9_reconinter.h @@ -39,7 +39,7 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const int_mv *mv_q3, + const MV *mv_q3, const struct scale_factors *scale, int w, int h, int do_avg, const struct subpix_fn_table *subpix, diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh index c357ef62a..6bb3cb888 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.sh +++ b/libvpx/vp9/common/vp9_rtcd_defs.sh @@ -7,9 +7,7 @@ cat <<EOF #include "vpx/vpx_integer.h" #include "vp9/common/vp9_enums.h" -struct loop_filter_info; struct macroblockd; -struct loop_filter_info; /* Encoder forward decls */ struct macroblock; @@ -22,7 +20,11 @@ EOF } forward_decls vp9_common_forward_decls -[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 +# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly. 
+[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse2_x86inc=sse2 && ssse3_x86inc=ssse3 + +# this variable is for functions that are 64 bit only. +[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3 # # Dequant @@ -47,7 +49,7 @@ prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, ui specialize vp9_d27_predictor_4x4 prototype void vp9_d45_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_4x4 +specialize vp9_d45_predictor_4x4 ssse3 prototype void vp9_d63_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_d63_predictor_4x4 @@ -86,7 +88,7 @@ prototype void vp9_d27_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, ui specialize vp9_d27_predictor_8x8 prototype void vp9_d45_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_8x8 +specialize vp9_d45_predictor_8x8 ssse3 prototype void vp9_d63_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_d63_predictor_8x8 @@ -125,7 +127,7 @@ prototype void vp9_d27_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, specialize vp9_d27_predictor_16x16 prototype void vp9_d45_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_16x16 +specialize vp9_d45_predictor_16x16 ssse3 prototype void vp9_d63_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_d63_predictor_16x16 @@ -164,7 +166,7 @@ prototype void vp9_d27_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, specialize vp9_d27_predictor_32x32 prototype void vp9_d45_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_32x32 +specialize 
vp9_d45_predictor_32x32 ssse3 prototype void vp9_d63_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_d63_predictor_32x32 @@ -214,7 +216,7 @@ fi # Loopfilter # prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh" -specialize vp9_mb_lpf_vertical_edge_w sse2 +specialize vp9_mb_lpf_vertical_edge_w sse2 neon prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mbloop_filter_vertical_edge sse2 neon @@ -223,7 +225,7 @@ prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8 specialize vp9_loop_filter_vertical_edge mmx neon prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mb_lpf_horizontal_edge_w sse2 +specialize vp9_mb_lpf_horizontal_edge_w sse2 neon prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mbloop_filter_horizontal_edge sse2 neon @@ -265,10 +267,10 @@ specialize vp9_blend_b # Sub Pixel Filters # prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_copy sse2 +specialize vp9_convolve_copy $sse2_x86inc prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_avg sse2 +specialize vp9_convolve_avg $sse2_x86inc prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, 
const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8 ssse3 neon @@ -297,14 +299,17 @@ specialize vp9_short_idct4x4_1_add sse2 prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct4x4_add sse2 +prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct8x8_1_add sse2 + prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct8x8_add sse2 neon prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_8x8_add sse2 -prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output" -specialize vp9_short_idct1_8x8 +prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct16x16_1_add sse2 prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct16x16_add sse2 @@ -312,18 +317,12 @@ specialize vp9_short_idct16x16_add sse2 prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_16x16_add sse2 -prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output" -specialize vp9_short_idct1_16x16 - prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct32x32_add sse2 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" specialize vp9_short_idct1_32x32 -prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_32x32_add - prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" specialize vp9_short_iht4x4_add sse2 @@ -702,12 +701,10 @@ specialize vp9_get_mb_ss mmx sse2 # ENCODEMB INVOKE prototype int64_t vp9_block_error "int16_t *coeff, 
int16_t *dqcoeff, intptr_t block_size, int64_t *ssz" -specialize vp9_block_error sse2 +specialize vp9_block_error $sse2_x86inc prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" -specialize vp9_subtract_block sse2 - -[ $arch = "x86_64" ] && ssse3_x86_64=ssse3 +specialize vp9_subtract_block $sse2_x86inc prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" specialize vp9_quantize_b $ssse3_x86_64 @@ -719,13 +716,11 @@ specialize vp9_quantize_b_32x32 $ssse3_x86_64 # Structured Similarity (SSIM) # if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then - [ $arch = "x86_64" ] && sse2_on_x86_64=sse2 - prototype void vp9_ssim_parms_8x8 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" - specialize vp9_ssim_parms_8x8 $sse2_on_x86_64 + specialize vp9_ssim_parms_8x8 $sse2_x86_64 prototype void vp9_ssim_parms_16x16 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" - specialize vp9_ssim_parms_16x16 $sse2_on_x86_64 + specialize vp9_ssim_parms_16x16 $sse2_x86_64 fi # fdct functions diff --git a/libvpx/vp9/common/vp9_treecoder.h b/libvpx/vp9/common/vp9_treecoder.h index ebcd4116f..31182c35c 100644 --- a/libvpx/vp9/common/vp9_treecoder.h +++ b/libvpx/vp9/common/vp9_treecoder.h @@ -79,4 +79,22 @@ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); } +static INLINE vp9_prob 
merge_probs(vp9_prob pre_prob, vp9_prob prob, + const unsigned int ct[2], + unsigned int count_sat, + unsigned int max_update_factor) { + const unsigned int count = MIN(ct[0] + ct[1], count_sat); + const unsigned int factor = max_update_factor * count / count_sat; + return weighted_prob(pre_prob, prob, factor); +} + +static INLINE vp9_prob merge_probs2(vp9_prob pre_prob, + const unsigned int ct[2], + unsigned int count_sat, + unsigned int max_update_factor) { + return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat, + max_update_factor); +} + + #endif // VP9_COMMON_VP9_TREECODER_H_ diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index a1e14b482..8f740f412 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -523,9 +523,9 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride, { \ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ d0 = _mm_unpacklo_epi8(d0, zero); \ - in_x = _mm_add_epi16(in_x, d0); \ - in_x = _mm_packus_epi16(in_x, in_x); \ - _mm_storel_epi64((__m128i *)(dest), in_x); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ dest += stride; \ } @@ -597,6 +597,27 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } +void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 5); + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + 
RECON_AND_STORE(dest, dc_value); +} + // perform 8x8 transpose static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); @@ -1449,6 +1470,38 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { } } +void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 2; ++i) { + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + dest += 8 - (stride * 16); + } +} + static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { __m128i tbuf[8]; array_transpose_8x8(res0, res0); @@ -2760,6 +2813,12 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, } } +#define LOAD_DQCOEFF(reg, input) \ + { \ + reg = _mm_load_si128((__m128i *) input); \ + input += 8; \ + } \ + void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); @@ -2827,48 +2886,126 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, 
tmp6, tmp7; - int i, j; + int i, j, i32; + __m128i zero_idx[16]; + int zero_flag[2]; // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. for (i = 0; i < 8; i++) { + i32 = (i << 5); if (i < 4) { // First 1-D idct // Load input data. - in0 = _mm_load_si128((__m128i *)input); - in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); - in16 = _mm_load_si128((__m128i *)(input + 8 * 2)); - in24 = _mm_load_si128((__m128i *)(input + 8 * 3)); - in1 = _mm_load_si128((__m128i *)(input + 8 * 4)); - in9 = _mm_load_si128((__m128i *)(input + 8 * 5)); - in17 = _mm_load_si128((__m128i *)(input + 8 * 6)); - in25 = _mm_load_si128((__m128i *)(input + 8 * 7)); - in2 = _mm_load_si128((__m128i *)(input + 8 * 8)); - in10 = _mm_load_si128((__m128i *)(input + 8 * 9)); - in18 = _mm_load_si128((__m128i *)(input + 8 * 10)); - in26 = _mm_load_si128((__m128i *)(input + 8 * 11)); - in3 = _mm_load_si128((__m128i *)(input + 8 * 12)); - in11 = _mm_load_si128((__m128i *)(input + 8 * 13)); - in19 = _mm_load_si128((__m128i *)(input + 8 * 14)); - in27 = _mm_load_si128((__m128i *)(input + 8 * 15)); - - in4 = _mm_load_si128((__m128i *)(input + 8 * 16)); - in12 = _mm_load_si128((__m128i *)(input + 8 * 17)); - in20 = _mm_load_si128((__m128i *)(input + 8 * 18)); - in28 = _mm_load_si128((__m128i *)(input + 8 * 19)); - in5 = _mm_load_si128((__m128i *)(input + 8 * 20)); - in13 = _mm_load_si128((__m128i *)(input + 8 * 21)); - in21 = _mm_load_si128((__m128i *)(input + 8 * 22)); - in29 = _mm_load_si128((__m128i *)(input + 8 * 23)); - in6 = _mm_load_si128((__m128i *)(input + 8 * 24)); - in14 = _mm_load_si128((__m128i *)(input + 8 * 25)); - in22 = _mm_load_si128((__m128i *)(input + 8 * 26)); - in30 = _mm_load_si128((__m128i *)(input + 8 * 27)); - in7 = _mm_load_si128((__m128i *)(input + 8 * 28)); - in15 = _mm_load_si128((__m128i *)(input + 8 * 29)); - in23 = _mm_load_si128((__m128i *)(input + 8 * 30)); - in31 = _mm_load_si128((__m128i *)(input + 8 * 31)); - - input += 256; + LOAD_DQCOEFF(in0, 
input); + LOAD_DQCOEFF(in8, input); + LOAD_DQCOEFF(in16, input); + LOAD_DQCOEFF(in24, input); + LOAD_DQCOEFF(in1, input); + LOAD_DQCOEFF(in9, input); + LOAD_DQCOEFF(in17, input); + LOAD_DQCOEFF(in25, input); + LOAD_DQCOEFF(in2, input); + LOAD_DQCOEFF(in10, input); + LOAD_DQCOEFF(in18, input); + LOAD_DQCOEFF(in26, input); + LOAD_DQCOEFF(in3, input); + LOAD_DQCOEFF(in11, input); + LOAD_DQCOEFF(in19, input); + LOAD_DQCOEFF(in27, input); + + LOAD_DQCOEFF(in4, input); + LOAD_DQCOEFF(in12, input); + LOAD_DQCOEFF(in20, input); + LOAD_DQCOEFF(in28, input); + LOAD_DQCOEFF(in5, input); + LOAD_DQCOEFF(in13, input); + LOAD_DQCOEFF(in21, input); + LOAD_DQCOEFF(in29, input); + LOAD_DQCOEFF(in6, input); + LOAD_DQCOEFF(in14, input); + LOAD_DQCOEFF(in22, input); + LOAD_DQCOEFF(in30, input); + LOAD_DQCOEFF(in7, input); + LOAD_DQCOEFF(in15, input); + LOAD_DQCOEFF(in23, input); + LOAD_DQCOEFF(in31, input); + + // checking if all entries are zero + zero_idx[0] = _mm_or_si128(in0, in1); + zero_idx[1] = _mm_or_si128(in2, in3); + zero_idx[2] = _mm_or_si128(in4, in5); + zero_idx[3] = _mm_or_si128(in6, in7); + zero_idx[4] = _mm_or_si128(in8, in9); + zero_idx[5] = _mm_or_si128(in10, in11); + zero_idx[6] = _mm_or_si128(in12, in13); + zero_idx[7] = _mm_or_si128(in14, in15); + zero_idx[8] = _mm_or_si128(in16, in17); + zero_idx[9] = _mm_or_si128(in18, in19); + zero_idx[10] = _mm_or_si128(in20, in21); + zero_idx[11] = _mm_or_si128(in22, in23); + zero_idx[12] = _mm_or_si128(in24, in25); + zero_idx[13] = _mm_or_si128(in26, in27); + zero_idx[14] = _mm_or_si128(in28, in29); + zero_idx[15] = _mm_or_si128(in30, in31); + + zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[6] = _mm_or_si128(zero_idx[12], 
zero_idx[13]); + zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); + + zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); + + zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]); + zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]); + zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32); + zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]); + zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]); + + if (!zero_flag[0] && !zero_flag[1]) { + col[i32 + 0] = _mm_setzero_si128(); + col[i32 + 1] = _mm_setzero_si128(); + col[i32 + 2] = _mm_setzero_si128(); + col[i32 + 3] = _mm_setzero_si128(); + col[i32 + 4] = _mm_setzero_si128(); + col[i32 + 5] = _mm_setzero_si128(); + col[i32 + 6] = _mm_setzero_si128(); + col[i32 + 7] = _mm_setzero_si128(); + col[i32 + 8] = _mm_setzero_si128(); + col[i32 + 9] = _mm_setzero_si128(); + col[i32 + 10] = _mm_setzero_si128(); + col[i32 + 11] = _mm_setzero_si128(); + col[i32 + 12] = _mm_setzero_si128(); + col[i32 + 13] = _mm_setzero_si128(); + col[i32 + 14] = _mm_setzero_si128(); + col[i32 + 15] = _mm_setzero_si128(); + col[i32 + 16] = _mm_setzero_si128(); + col[i32 + 17] = _mm_setzero_si128(); + col[i32 + 18] = _mm_setzero_si128(); + col[i32 + 19] = _mm_setzero_si128(); + col[i32 + 20] = _mm_setzero_si128(); + col[i32 + 21] = _mm_setzero_si128(); + col[i32 + 22] = _mm_setzero_si128(); + col[i32 + 23] = _mm_setzero_si128(); + col[i32 + 24] = _mm_setzero_si128(); + col[i32 + 25] = _mm_setzero_si128(); + col[i32 + 26] = _mm_setzero_si128(); + col[i32 + 27] = _mm_setzero_si128(); + col[i32 + 28] = _mm_setzero_si128(); + col[i32 + 29] = _mm_setzero_si128(); + col[i32 + 30] = _mm_setzero_si128(); + col[i32 + 
31] = _mm_setzero_si128(); + continue; + } // Transpose 32x8 block to 8x32 block TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, @@ -3239,38 +3376,38 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { // final stage if (i < 4) { // 1_D: Store 32 intermediate results for each 8x32 block. - col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i * 
32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); + col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); } else { const __m128i zero = _mm_setzero_si128(); diff --git a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm index bc8ed5c1f..8ba26f310 100644 --- a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm +++ b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm @@ -10,6 +10,31 @@ %include 
"third_party/x86inc/x86inc.asm" +SECTION_RODATA + +pb_1: times 16 db 1 +pw_2: times 8 dw 2 +pb_7m1: times 8 db 7, -1 +pb_15: times 16 db 15 + +sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7 +sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7 +sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b2w01234577: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 7, -1, 7, -1 +sh_b2w12345677: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1 +sh_b2w23456777: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1, 7, -1 +sh_b2w01234567: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1 +sh_b2w12345678: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 +sh_b2w23456789: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1 +sh_b2w89abcdef: db 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1 +sh_b2w9abcdeff: db 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1 +sh_b2wabcdefff: db 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1, 15, -1 +sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 +sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 + SECTION .text INIT_MMX ssse3 @@ -85,3 +110,182 @@ cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left inc lineq jnz .loop REP_RET + +INIT_MMX ssse3 +cglobal d45_predictor_4x4, 3, 3, 4, dst, stride, above + movq m0, [aboveq] + pshufb m2, m0, [sh_b23456777] + pshufb m1, m0, [sh_b01234577] + pshufb m0, [sh_b12345677] + pavgb m3, m2, m1 + pxor m2, m1 + pand m2, [pb_1] + psubb m3, m2 + pavgb m0, m3 + + ; store 4 lines + movd [dstq ], m0 + psrlq m0, 8 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + psrlq m0, 8 + movd [dstq ], m0 + psrlq m0, 8 + movd [dstq+strideq], m0 + RET + +INIT_MMX ssse3 +cglobal 
d45_predictor_8x8, 3, 3, 4, dst, stride, above + movq m0, [aboveq] + mova m1, [sh_b12345677] + DEFINE_ARGS dst, stride, stride3, line + lea stride3q, [strideq*3] + pshufb m2, m0, [sh_b23456777] + pavgb m3, m2, m0 + pxor m2, m0 + pshufb m0, m1 + pand m2, [pb_1] + psubb m3, m2 + pavgb m0, m3 + + ; store 4 lines + movq [dstq ], m0 + pshufb m0, m1 + movq [dstq+strideq ], m0 + pshufb m0, m1 + movq [dstq+strideq*2], m0 + pshufb m0, m1 + movq [dstq+stride3q ], m0 + pshufb m0, m1 + lea dstq, [dstq+strideq*4] + + ; store next 4 lines + movq [dstq ], m0 + pshufb m0, m1 + movq [dstq+strideq ], m0 + pshufb m0, m1 + movq [dstq+strideq*2], m0 + pshufb m0, m1 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM ssse3 +cglobal d45_predictor_16x16, 3, 5, 4, dst, stride, above, dst8, line + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, dst8, line + lea stride3q, [strideq*3] + lea dst8q, [dstq+strideq*8] + mova m1, [sh_b123456789abcdeff] + pshufb m2, m0, [sh_b23456789abcdefff] + pavgb m3, m2, m0 + pxor m2, m0 + pshufb m0, m1 + pand m2, [pb_1] + psubb m3, m2 + pavgb m0, m3 + + ; first 4 lines and first half of 3rd 4 lines + mov lined, 2 +.loop: + mova [dstq ], m0 + movhps [dst8q ], m0 + pshufb m0, m1 + mova [dstq +strideq ], m0 + movhps [dst8q+strideq ], m0 + pshufb m0, m1 + mova [dstq +strideq*2 ], m0 + movhps [dst8q+strideq*2 ], m0 + pshufb m0, m1 + mova [dstq +stride3q ], m0 + movhps [dst8q+stride3q ], m0 + pshufb m0, m1 + lea dstq, [dstq +strideq*4] + lea dst8q, [dst8q+strideq*4] + dec lined + jnz .loop + + ; bottom-right 8x8 block + movhps [dstq +8], m0 + movhps [dstq+strideq +8], m0 + movhps [dstq+strideq*2+8], m0 + movhps [dstq+stride3q +8], m0 + lea dstq, [dstq+strideq*4] + movhps [dstq +8], m0 + movhps [dstq+strideq +8], m0 + movhps [dstq+strideq*2+8], m0 + movhps [dstq+stride3q +8], m0 + RET + +INIT_XMM ssse3 +cglobal d45_predictor_32x32, 3, 5, 7, dst, stride, above, dst16, line + mova m0, [aboveq] + mova m4, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, dst16, line + lea 
stride3q, [strideq*3] + lea dst16q, [dstq +strideq*8] + lea dst16q, [dst16q+strideq*8] + mova m1, [sh_b123456789abcdeff] + pshufb m2, m4, [sh_b23456789abcdefff] + pavgb m3, m2, m4 + pxor m2, m4 + palignr m5, m4, m0, 1 + palignr m6, m4, m0, 2 + pshufb m4, m1 + pand m2, [pb_1] + psubb m3, m2 + pavgb m4, m3 + pavgb m3, m0, m6 + pxor m0, m6 + pand m0, [pb_1] + psubb m3, m0 + pavgb m5, m3 + + ; write 4x4 lines (and the first half of the second 4x4 lines) + mov lined, 4 +.loop: + mova [dstq ], m5 + mova [dstq +16], m4 + mova [dst16q ], m4 + palignr m3, m4, m5, 1 + pshufb m4, m1 + mova [dstq +strideq ], m3 + mova [dstq +strideq +16], m4 + mova [dst16q+strideq ], m4 + palignr m5, m4, m3, 1 + pshufb m4, m1 + mova [dstq +strideq*2 ], m5 + mova [dstq +strideq*2+16], m4 + mova [dst16q+strideq*2 ], m4 + palignr m3, m4, m5, 1 + pshufb m4, m1 + mova [dstq +stride3q ], m3 + mova [dstq +stride3q +16], m4 + mova [dst16q+stride3q ], m4 + palignr m5, m4, m3, 1 + pshufb m4, m1 + lea dstq, [dstq +strideq*4] + lea dst16q, [dst16q+strideq*4] + dec lined + jnz .loop + + ; write second half of second 4x4 lines + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + RET |