author     hkuang <hkuang@google.com>  2013-08-06 11:07:19 -0700
committer  Hangyu Kuang <hkuang@google.com>  2013-08-06 18:31:37 +0000
commit     f3bed9137f66ef693bd406e43b17e9a1114f1e14 (patch)
tree       cd1bea0cd923c6d125cb5b3e7b3404d7c2f70208 /libvpx/vp9/common
parent     a8b927ab4f06e2fc0d16d9606b57672df9899ac1 (diff)
download   libvpx-f3bed9137f66ef693bd406e43b17e9a1114f1e14.tar.gz
Roll latest libvpx into Android. (tags: android-4.4_r0.8, android-4.4_r0.7)
The latest libvpx adds initial multithreaded VP9 decoding support and more NEON optimizations. The checkout is from the master branch (hash: 33afddadb9af6569bd8296ef1d48d0511b651e9d).

Change-Id: I54be2f48bc033c00876b6b1d0a3ff1eccb92a2fa
Diffstat (limited to 'libvpx/vp9/common')
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm     |  63
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm         |  53
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm             | 618
-rw-r--r--  libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm |  48
-rw-r--r--  libvpx/vp9/common/vp9_blockd.h                             |  87
-rw-r--r--  libvpx/vp9/common/vp9_common_data.c                        |  65
-rw-r--r--  libvpx/vp9/common/vp9_common_data.h                        |   7
-rw-r--r--  libvpx/vp9/common/vp9_entropy.c                            |  56
-rw-r--r--  libvpx/vp9/common/vp9_entropy.h                            |   7
-rw-r--r--  libvpx/vp9/common/vp9_entropymode.c                        |  89
-rw-r--r--  libvpx/vp9/common/vp9_entropymode.h                        |  18
-rw-r--r--  libvpx/vp9/common/vp9_entropymv.c                          |  69
-rw-r--r--  libvpx/vp9/common/vp9_enums.h                              |   4
-rw-r--r--  libvpx/vp9/common/vp9_extend.c                             |  16
-rw-r--r--  libvpx/vp9/common/vp9_findnearmv.c                         |  13
-rw-r--r--  libvpx/vp9/common/vp9_findnearmv.h                         |  30
-rw-r--r--  libvpx/vp9/common/vp9_idct.c                               |  57
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.c                         |  35
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.h                         |  21
-rw-r--r--  libvpx/vp9/common/vp9_mv.h                                 |   8
-rw-r--r--  libvpx/vp9/common/vp9_mvref_common.c                       | 438
-rw-r--r--  libvpx/vp9/common/vp9_mvref_common.h                       |  15
-rw-r--r--  libvpx/vp9/common/vp9_onyxc_int.h                          |  19
-rw-r--r--  libvpx/vp9/common/vp9_pred_common.c                        |  38
-rw-r--r--  libvpx/vp9/common/vp9_pred_common.h                        |   8
-rw-r--r--  libvpx/vp9/common/vp9_reconinter.c                         | 122
-rw-r--r--  libvpx/vp9/common/vp9_reconinter.h                         |   2
-rw-r--r--  libvpx/vp9/common/vp9_rtcd_defs.sh                         |  49
-rw-r--r--  libvpx/vp9/common/vp9_treecoder.h                          |  18
-rw-r--r--  libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c               | 279
-rw-r--r--  libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm              | 204
31 files changed, 1706 insertions(+), 850 deletions(-)
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
index 15039e267..110a56cdd 100644
--- a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
@@ -52,15 +52,15 @@
; sp[]int h
|vp9_convolve8_avg_horiz_neon| PROC
+ ldr r12, [sp, #4] ; x_step_q4
+ cmp r12, #16
+ bne vp9_convolve8_avg_horiz_c
+
push {r4-r10, lr}
sub r0, r0, #3 ; adjust for taps
- ldr r4, [sp, #36] ; x_step_q4
ldr r5, [sp, #32] ; filter_x
- cmp r4, #16
- bne call_horiz_c_convolve ; x_step_q4 != 16
-
ldr r6, [sp, #48] ; w
ldr r7, [sp, #52] ; h
@@ -82,22 +82,22 @@
mov r10, r6 ; w loop counter
loop_horiz
- vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]!
- vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]!
+ vld1.8 {d24}, [r0]!
vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9
- vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]!
- vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]!
+ vld1.8 {d25}, [r0]!
vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9
- vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]!
- vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]!
+ vld1.8 {d26}, [r0]!
vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9
- vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]!
- vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]!
+ vld1.8 {d27}, [r0]!
vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8
+ vtrn.16 q12, q13
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+
; extract to s16
vmovl.u8 q8, d24
vmovl.u8 q9, d25
@@ -128,8 +128,8 @@ loop_horiz
vqrshrun.s32 d5, q15, #7
; saturate
- vqshrn.u16 d2, q1, #0
- vqshrn.u16 d3, q2, #0
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
; transpose
vtrn.16 d2, d3
@@ -137,10 +137,7 @@ loop_horiz
vtrn.8 d2, d3
; average the new value and the dst value
- vaddl.u8 q8, d2, d6
- vaddl.u8 q9, d3, d7
- vqrshrn.u16 d2, q8, #1
- vqrshrn.u16 d3, q9, #1
+ vrhadd.u8 q1, q1, q3
vst1.u32 {d2[0]}, [r2], r3
vst1.u32 {d3[0]}, [r2], r3
@@ -159,26 +156,20 @@ loop_horiz
pop {r4-r10, pc}
-call_horiz_c_convolve
- pop {r4-r10, lr}
- add r0, r0, #3 ; un-adjust for taps
- b vp9_convolve8_avg_horiz_c
-
-
ENDP
|vp9_convolve8_avg_vert_neon| PROC
+ ldr r12, [sp, #12]
+ cmp r12, #16
+ bne vp9_convolve8_avg_vert_c
+
push {r4-r10, lr}
; adjust for taps
sub r0, r0, r1
sub r0, r0, r1, lsl #1
- ldr r6, [sp, #44] ; y_step_q4
ldr r7, [sp, #40] ; filter_y
- cmp r6, #16
- bne call_vert_c_convolve ; y_step_q4 != 16
-
ldr r8, [sp, #48] ; w
ldr r9, [sp, #52] ; h
@@ -240,14 +231,11 @@ loop_vert
vqrshrun.s32 d5, q15, #7
; saturate
- vqshrn.u16 d2, q1, #0
- vqshrn.u16 d3, q2, #0
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
; average the new value and the dst value
- vaddl.u8 q8, d2, d6
- vaddl.u8 q9, d3, d7
- vqrshrn.u16 d2, q8, #1
- vqrshrn.u16 d3, q9, #1
+ vrhadd.u8 q1, q1, q3
vst1.u32 {d2[0]}, [r2], r3
vst1.u32 {d2[1]}, [r2], r3
@@ -266,12 +254,5 @@ loop_vert
pop {r4-r10, pc}
-call_vert_c_convolve
- pop {r4-r10, lr}
- ; un-adjust for taps
- add r0, r0, r1
- add r0, r0, r1, lsl #1
- b vp9_convolve8_avg_vert_c
-
ENDP
END
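
For reference, a minimal C sketch (illustrative, not the library's code) of the per-byte operation the new vrhadd.u8 instruction performs when averaging the filtered result with the existing dst pixels; it matches the rounding behaviour of the vaddl.u8 / vqrshrn.u16 #1 pair it replaces:

#include <stdint.h>

/* Rounding halving add, as computed lane-by-lane by vrhadd.u8:
 * (a + b + 1) >> 1, which cannot overflow 8 bits for 8-bit inputs. */
static uint8_t round_avg_u8(uint8_t a, uint8_t b) {
  return (uint8_t)(((unsigned)a + b + 1) >> 1);
}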
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
index 842c73c90..845e4a866 100644
--- a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
@@ -52,15 +52,15 @@
; sp[]int h
|vp9_convolve8_horiz_neon| PROC
+ ldr r12, [sp, #4] ; x_step_q4
+ cmp r12, #16
+ bne vp9_convolve8_horiz_c
+
push {r4-r10, lr}
sub r0, r0, #3 ; adjust for taps
- ldr r4, [sp, #36] ; x_step_q4
ldr r5, [sp, #32] ; filter_x
- cmp r4, #16
- bne call_horiz_c_convolve ; x_step_q4 != 16
-
ldr r6, [sp, #48] ; w
ldr r7, [sp, #52] ; h
@@ -82,22 +82,22 @@
mov r10, r6 ; w loop counter
loop_horiz
- vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]!
- vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]!
+ vld1.8 {d24}, [r0]!
vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9
- vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]!
- vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]!
+ vld1.8 {d25}, [r0]!
vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9
- vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]!
- vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]!
+ vld1.8 {d26}, [r0]!
vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9
- vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]!
- vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]!
+ vld1.8 {d27}, [r0]!
vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8
+ vtrn.16 q12, q13
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+
; extract to s16
vmovl.u8 q8, d24
vmovl.u8 q9, d25
@@ -120,8 +120,8 @@ loop_horiz
vqrshrun.s32 d5, q15, #7
; saturate
- vqshrn.u16 d2, q1, #0
- vqshrn.u16 d3, q2, #0
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
; transpose
vtrn.16 d2, d3
@@ -145,26 +145,20 @@ loop_horiz
pop {r4-r10, pc}
-call_horiz_c_convolve
- pop {r4-r10, lr}
- add r0, r0, #3 ; un-adjust for taps
- b vp9_convolve8_horiz_c
-
-
ENDP
|vp9_convolve8_vert_neon| PROC
+ ldr r12, [sp, #12]
+ cmp r12, #16
+ bne vp9_convolve8_vert_c
+
push {r4-r10, lr}
; adjust for taps
sub r0, r0, r1
sub r0, r0, r1, lsl #1
- ldr r6, [sp, #44] ; y_step_q4
ldr r7, [sp, #40] ; filter_y
- cmp r6, #16
- bne call_vert_c_convolve ; y_step_q4 != 16
-
ldr r8, [sp, #48] ; w
ldr r9, [sp, #52] ; h
@@ -219,8 +213,8 @@ loop_vert
vqrshrun.s32 d5, q15, #7
; saturate
- vqshrn.u16 d2, q1, #0
- vqshrn.u16 d3, q2, #0
+ vqmovn.u16 d2, q1
+ vqmovn.u16 d3, q2
vst1.u32 {d2[0]}, [r2], r3
vst1.u32 {d2[1]}, [r2], r3
@@ -239,12 +233,5 @@ loop_vert
pop {r4-r10, pc}
-call_vert_c_convolve
- pop {r4-r10, lr}
- ; un-adjust for taps
- add r0, r0, r1
- add r0, r0, r1, lsl #1
- b vp9_convolve8_vert_c
-
ENDP
END
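
The prologue change in both convolve files is worth spelling out: the step parameter is now loaded and compared against 16 before any registers are pushed, so the scaled case can branch straight to the generic C convolution, and the old call_*_c_convolve epilogues (pop, un-adjust the source pointer, then branch) are no longer needed. A rough C sketch of that control flow, with assumed names:

/* Sketch only: dispatch before saving any state. */
static void convolve8_horiz_dispatch(int x_step_q4) {
  if (x_step_q4 != 16) {
    /* fractional step (scaling): defer to the generic C implementation */
    return;
  }
  /* unscaled case: proceed with the NEON fast path */
}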
diff --git a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
new file mode 100644
index 000000000..edf5786e3
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
@@ -0,0 +1,618 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_mb_lpf_horizontal_edge_w_neon|
+ EXPORT |vp9_mb_lpf_vertical_edge_w_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh
+; int count)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|vp9_mb_lpf_horizontal_edge_w_neon| PROC
+ push {r4-r8, lr}
+ vpush {d8-d15}
+ ldr r4, [sp, #88] ; load thresh
+ ldr r12, [sp, #92] ; load count
+
+h_count
+ vld1.8 {d16[]}, [r2] ; load *blimit
+ vld1.8 {d17[]}, [r3] ; load *limit
+ vld1.8 {d18[]}, [r4] ; load *thresh
+
+ sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines
+
+ vld1.u8 {d0}, [r8@64], r1 ; p7
+ vld1.u8 {d1}, [r8@64], r1 ; p6
+ vld1.u8 {d2}, [r8@64], r1 ; p5
+ vld1.u8 {d3}, [r8@64], r1 ; p4
+ vld1.u8 {d4}, [r8@64], r1 ; p3
+ vld1.u8 {d5}, [r8@64], r1 ; p2
+ vld1.u8 {d6}, [r8@64], r1 ; p1
+ vld1.u8 {d7}, [r8@64], r1 ; p0
+ vld1.u8 {d8}, [r8@64], r1 ; q0
+ vld1.u8 {d9}, [r8@64], r1 ; q1
+ vld1.u8 {d10}, [r8@64], r1 ; q2
+ vld1.u8 {d11}, [r8@64], r1 ; q3
+ vld1.u8 {d12}, [r8@64], r1 ; q4
+ vld1.u8 {d13}, [r8@64], r1 ; q5
+ vld1.u8 {d14}, [r8@64], r1 ; q6
+ vld1.u8 {d15}, [r8@64], r1 ; q7
+
+ bl vp9_wide_mbfilter_neon
+
+ tst r7, #1
+ beq h_mbfilter
+
+ ; flat && mask were not set for any of the channels. Just store the values
+ ; from filter.
+ sub r8, r0, r1, lsl #1
+
+ vst1.u8 {d25}, [r8@64], r1 ; store op1
+ vst1.u8 {d24}, [r8@64], r1 ; store op0
+ vst1.u8 {d23}, [r8@64], r1 ; store oq0
+ vst1.u8 {d26}, [r8@64], r1 ; store oq1
+
+ b h_next
+
+h_mbfilter
+ tst r7, #2
+ beq h_wide_mbfilter
+
+ ; flat2 was not set for any of the channels. Just store the values from
+ ; mbfilter.
+ sub r8, r0, r1, lsl #1
+ sub r8, r8, r1
+
+ vst1.u8 {d18}, [r8@64], r1 ; store op2
+ vst1.u8 {d19}, [r8@64], r1 ; store op1
+ vst1.u8 {d20}, [r8@64], r1 ; store op0
+ vst1.u8 {d21}, [r8@64], r1 ; store oq0
+ vst1.u8 {d22}, [r8@64], r1 ; store oq1
+ vst1.u8 {d23}, [r8@64], r1 ; store oq2
+
+ b h_next
+
+h_wide_mbfilter
+ sub r8, r0, r1, lsl #3
+ add r8, r8, r1
+
+ vst1.u8 {d16}, [r8@64], r1 ; store op6
+ vst1.u8 {d24}, [r8@64], r1 ; store op5
+ vst1.u8 {d25}, [r8@64], r1 ; store op4
+ vst1.u8 {d26}, [r8@64], r1 ; store op3
+ vst1.u8 {d27}, [r8@64], r1 ; store op2
+ vst1.u8 {d18}, [r8@64], r1 ; store op1
+ vst1.u8 {d19}, [r8@64], r1 ; store op0
+ vst1.u8 {d20}, [r8@64], r1 ; store oq0
+ vst1.u8 {d21}, [r8@64], r1 ; store oq1
+ vst1.u8 {d22}, [r8@64], r1 ; store oq2
+ vst1.u8 {d23}, [r8@64], r1 ; store oq3
+ vst1.u8 {d1}, [r8@64], r1 ; store oq4
+ vst1.u8 {d2}, [r8@64], r1 ; store oq5
+ vst1.u8 {d3}, [r8@64], r1 ; store oq6
+
+h_next
+ add r0, r0, #8
+ subs r12, r12, #1
+ bne h_count
+
+ vpop {d8-d15}
+ pop {r4-r8, pc}
+
+ ENDP ; |vp9_mb_lpf_horizontal_edge_w_neon|
+
+; void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int p,
+; const uint8_t *blimit,
+; const uint8_t *limit,
+; const uint8_t *thresh)
+; r0 uint8_t *s,
+; r1 int p, /* pitch */
+; r2 const uint8_t *blimit,
+; r3 const uint8_t *limit,
+; sp const uint8_t *thresh,
+|vp9_mb_lpf_vertical_edge_w_neon| PROC
+ push {r4-r8, lr}
+ vpush {d8-d15}
+ ldr r4, [sp, #88] ; load thresh
+
+ vld1.8 {d16[]}, [r2] ; load *blimit
+ vld1.8 {d17[]}, [r3] ; load *limit
+ vld1.8 {d18[]}, [r4] ; load *thresh
+
+ sub r8, r0, #8
+
+ vld1.8 {d0}, [r8@64], r1
+ vld1.8 {d8}, [r0@64], r1
+ vld1.8 {d1}, [r8@64], r1
+ vld1.8 {d9}, [r0@64], r1
+ vld1.8 {d2}, [r8@64], r1
+ vld1.8 {d10}, [r0@64], r1
+ vld1.8 {d3}, [r8@64], r1
+ vld1.8 {d11}, [r0@64], r1
+ vld1.8 {d4}, [r8@64], r1
+ vld1.8 {d12}, [r0@64], r1
+ vld1.8 {d5}, [r8@64], r1
+ vld1.8 {d13}, [r0@64], r1
+ vld1.8 {d6}, [r8@64], r1
+ vld1.8 {d14}, [r0@64], r1
+ vld1.8 {d7}, [r8@64], r1
+ vld1.8 {d15}, [r0@64], r1
+
+ sub r0, r0, r1, lsl #3
+
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ vtrn.8 d0, d1
+ vtrn.8 d2, d3
+ vtrn.8 d4, d5
+ vtrn.8 d6, d7
+
+ vtrn.8 d8, d9
+ vtrn.8 d10, d11
+ vtrn.8 d12, d13
+ vtrn.8 d14, d15
+
+ bl vp9_wide_mbfilter_neon
+
+ tst r7, #1
+ beq v_mbfilter
+
+ ; flat && mask were not set for any of the channels. Just store the values
+ ; from filter.
+ sub r8, r0, #2
+
+ vswp d23, d25
+
+ vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1
+ vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
+ vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
+ vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
+ vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
+ vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
+ vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
+ vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
+
+ b v_end
+
+v_mbfilter
+ tst r7, #2
+ beq v_wide_mbfilter
+
+ ; flat2 was not set for any of the channels. Just store the values from
+ ; mbfilter.
+ sub r8, r0, #3
+
+ vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1
+ vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1
+ vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1
+ vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1
+ vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1
+ vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1
+ vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1
+ vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1
+ vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1
+ vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1
+ vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1
+ vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1
+ vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1
+ vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1
+ vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1
+ vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1
+
+ b v_end
+
+v_wide_mbfilter
+ sub r8, r0, #8
+
+ vtrn.32 d0, d26
+ vtrn.32 d16, d27
+ vtrn.32 d24, d18
+ vtrn.32 d25, d19
+
+ vtrn.16 d0, d24
+ vtrn.16 d16, d25
+ vtrn.16 d26, d18
+ vtrn.16 d27, d19
+
+ vtrn.8 d0, d16
+ vtrn.8 d24, d25
+ vtrn.8 d26, d27
+ vtrn.8 d18, d19
+
+ vtrn.32 d20, d1
+ vtrn.32 d21, d2
+ vtrn.32 d22, d3
+ vtrn.32 d23, d15
+
+ vtrn.16 d20, d22
+ vtrn.16 d21, d23
+ vtrn.16 d1, d3
+ vtrn.16 d2, d15
+
+ vtrn.8 d20, d21
+ vtrn.8 d22, d23
+ vtrn.8 d1, d2
+ vtrn.8 d3, d15
+
+ vst1.8 {d0}, [r8@64], r1
+ vst1.8 {d20}, [r0@64], r1
+ vst1.8 {d16}, [r8@64], r1
+ vst1.8 {d21}, [r0@64], r1
+ vst1.8 {d24}, [r8@64], r1
+ vst1.8 {d22}, [r0@64], r1
+ vst1.8 {d25}, [r8@64], r1
+ vst1.8 {d23}, [r0@64], r1
+ vst1.8 {d26}, [r8@64], r1
+ vst1.8 {d1}, [r0@64], r1
+ vst1.8 {d27}, [r8@64], r1
+ vst1.8 {d2}, [r0@64], r1
+ vst1.8 {d18}, [r8@64], r1
+ vst1.8 {d3}, [r0@64], r1
+ vst1.8 {d19}, [r8@64], r1
+ vst1.8 {d15}, [r0@64], r1
+
+v_end
+ vpop {d8-d15}
+ pop {r4-r8, pc}
+
+ ENDP ; |vp9_mb_lpf_vertical_edge_w_neon|
+
+; void vp9_wide_mbfilter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store.
+;
+; r0-r3 PRESERVE
+; d16 blimit
+; d17 limit
+; d18 thresh
+; d0 p7
+; d1 p6
+; d2 p5
+; d3 p4
+; d4 p3
+; d5 p2
+; d6 p1
+; d7 p0
+; d8 q0
+; d9 q1
+; d10 q2
+; d11 q3
+; d12 q4
+; d13 q5
+; d14 q6
+; d15 q7
+|vp9_wide_mbfilter_neon| PROC
+ mov r7, #0
+
+ ; filter_mask
+ vabd.u8 d19, d4, d5 ; abs(p3 - p2)
+ vabd.u8 d20, d5, d6 ; abs(p2 - p1)
+ vabd.u8 d21, d6, d7 ; abs(p1 - p0)
+ vabd.u8 d22, d9, d8 ; abs(q1 - q0)
+ vabd.u8 d23, d10, d9 ; abs(q2 - q1)
+ vabd.u8 d24, d11, d10 ; abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1))
+ vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0))
+ vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2))
+ vmax.u8 d19, d19, d20
+
+ vabd.u8 d24, d7, d8 ; abs(p0 - q0)
+
+ vmax.u8 d19, d19, d23
+
+ vabd.u8 d23, d6, d9 ; a = abs(p1 - q1)
+ vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2
+
+ ; abs () > limit
+ vcge.u8 d19, d17, d19
+
+ ; flatmask4
+ vabd.u8 d25, d7, d5 ; abs(p0 - p2)
+ vabd.u8 d26, d8, d10 ; abs(q0 - q2)
+ vabd.u8 d27, d4, d7 ; abs(p3 - p0)
+ vabd.u8 d28, d11, d8 ; abs(q3 - q0)
+
+ ; only compare the largest value to thresh
+ vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2))
+ vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0))
+ vmax.u8 d25, d25, d26
+ vmax.u8 d20, d20, d25
+
+ vshr.u8 d23, d23, #1 ; a = a / 2
+ vqadd.u8 d24, d24, d23 ; a = b + a
+
+ vmov.u8 d30, #1
+ vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1
+
+ vcge.u8 d20, d30, d20 ; flat
+
+ vand d19, d19, d24 ; mask
+
+ ; hevmask
+ vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1
+ vorr d21, d21, d22 ; hev
+
+ vand d16, d20, d19 ; flat && mask
+ vmov r5, r6, d16
+ orrs r5, r5, r6 ; Check for 0
+ orreq r7, r7, #1 ; Only do filter branch
+
+ ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
+ vabd.u8 d22, d3, d7 ; abs(p4 - p0)
+ vabd.u8 d23, d12, d8 ; abs(q4 - q0)
+ vabd.u8 d24, d7, d2 ; abs(p0 - p5)
+ vabd.u8 d25, d8, d13 ; abs(q0 - q5)
+ vabd.u8 d26, d1, d7 ; abs(p6 - p0)
+ vabd.u8 d27, d14, d8 ; abs(q6 - q0)
+ vabd.u8 d28, d0, d7 ; abs(p7 - p0)
+ vabd.u8 d29, d15, d8 ; abs(q7 - q0)
+
+ ; only compare the largest value to thresh
+ vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0))
+ vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5))
+ vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0))
+ vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0))
+
+ vmax.u8 d26, d22, d23
+ vmax.u8 d27, d24, d25
+ vmax.u8 d23, d26, d27
+
+ vcge.u8 d18, d30, d23 ; flat2
+
+ vmov.u8 d22, #0x80
+
+ vand d17, d18, d16 ; flat2 && flat && mask
+ vmov r5, r6, d17
+ orrs r5, r5, r6 ; Check for 0
+ orreq r7, r7, #2 ; Only do mbfilter branch
+
+ ; mbfilter() function
+
+ ; filter() function
+ ; convert to signed
+ veor d23, d8, d22 ; qs0
+ veor d24, d7, d22 ; ps0
+ veor d25, d6, d22 ; ps1
+ veor d26, d9, d22 ; qs1
+
+ vmov.u8 d27, #3
+
+ vsub.s8 d28, d23, d24 ; ( qs0 - ps0)
+
+ vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1)
+
+ vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0)
+
+ vand d29, d29, d21 ; filter &= hev
+
+ vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0)
+
+ vmov.u8 d29, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d28, q15
+
+ vand d28, d28, d19 ; filter &= mask
+
+ vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3)
+ vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4)
+ vshr.s8 d30, d30, #3 ; filter2 >>= 3
+ vshr.s8 d29, d29, #3 ; filter1 >>= 3
+
+
+ vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2)
+ vqsub.s8 d23, d23, d29 ; oq0 = clamp(qs0 - filter1)
+
+ ; outer tap adjustments: ++filter1 >> 1
+ vrshr.s8 d29, d29, #1
+ vbic d29, d29, d21 ; filter &= ~hev
+
+ vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter)
+ vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter)
+
+ veor d24, d24, d22 ; *f_op0 = u^0x80
+ veor d23, d23, d22 ; *f_oq0 = u^0x80
+ veor d25, d25, d22 ; *f_op1 = u^0x80
+ veor d26, d26, d22 ; *f_oq1 = u^0x80
+
+ tst r7, #1
+ bxne lr
+
+ ; mbfilter flat && mask branch
+ ; TODO(fgalligan): Can I decrease the cycles by shifting to consecutive d's
+ ; and using vbit on the q's?
+ vmov.u8 d29, #2
+ vaddl.u8 q15, d7, d8 ; op2 = p0 + q0
+ vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3
+ vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2
+ vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
+ vqrshrn.u16 d18, q15, #3 ; r_op2
+
+ vsubw.u8 q15, d4 ; op1 = op2 - p3
+ vsubw.u8 q15, d5 ; op1 -= p2
+ vaddw.u8 q15, d6 ; op1 += p1
+ vaddw.u8 q15, d9 ; op1 += q1
+ vqrshrn.u16 d19, q15, #3 ; r_op1
+
+ vsubw.u8 q15, d4 ; op0 = op1 - p3
+ vsubw.u8 q15, d6 ; op0 -= p1
+ vaddw.u8 q15, d7 ; op0 += p0
+ vaddw.u8 q15, d10 ; op0 += q2
+ vqrshrn.u16 d20, q15, #3 ; r_op0
+
+ vsubw.u8 q15, d4 ; oq0 = op0 - p3
+ vsubw.u8 q15, d7 ; oq0 -= p0
+ vaddw.u8 q15, d8 ; oq0 += q0
+ vaddw.u8 q15, d11 ; oq0 += q3
+ vqrshrn.u16 d21, q15, #3 ; r_oq0
+
+ vsubw.u8 q15, d5 ; oq1 = oq0 - p2
+ vsubw.u8 q15, d8 ; oq1 -= q0
+ vaddw.u8 q15, d9 ; oq1 += q1
+ vaddw.u8 q15, d11 ; oq1 += q3
+ vqrshrn.u16 d22, q15, #3 ; r_oq1
+
+ vsubw.u8 q15, d6 ; oq2 = oq0 - p1
+ vsubw.u8 q15, d9 ; oq2 -= q1
+ vaddw.u8 q15, d10 ; oq2 += q2
+ vaddw.u8 q15, d11 ; oq2 += q3
+ vqrshrn.u16 d27, q15, #3 ; r_oq2
+
+ ; Filter does not set op2 or oq2, so use p2 and q2.
+ vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask)
+ vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask)
+ vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask)
+ vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask)
+ vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask)
+
+ vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask)
+ vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask)
+
+ tst r7, #2
+ bxne lr
+
+ ; wide_mbfilter flat2 && flat && mask branch
+ vmov.u8 d16, #7
+ vaddl.u8 q15, d7, d8 ; op6 = p0 + q0
+ vmlal.u8 q15, d0, d16 ; op6 += p7 * 3
+ vmlal.u8 q15, d1, d29 ; op6 += p6 * 2
+ vaddw.u8 q15, d2 ; op6 += p5
+ vaddw.u8 q15, d3 ; op6 += p4
+ vaddw.u8 q15, d4 ; op6 += p3
+ vaddw.u8 q15, d5 ; op6 += p2
+ vaddw.u8 q15, d6 ; op6 += p1
+ vqrshrn.u16 d16, q15, #4 ; w_op6
+
+ vsubw.u8 q15, d0 ; op5 = op6 - p7
+ vsubw.u8 q15, d1 ; op5 -= p6
+ vaddw.u8 q15, d2 ; op5 += p5
+ vaddw.u8 q15, d9 ; op5 += q1
+ vqrshrn.u16 d24, q15, #4 ; w_op5
+
+ vsubw.u8 q15, d0 ; op4 = op5 - p7
+ vsubw.u8 q15, d2 ; op4 -= p5
+ vaddw.u8 q15, d3 ; op4 += p4
+ vaddw.u8 q15, d10 ; op4 += q2
+ vqrshrn.u16 d25, q15, #4 ; w_op4
+
+ vsubw.u8 q15, d0 ; op3 = op4 - p7
+ vsubw.u8 q15, d3 ; op3 -= p4
+ vaddw.u8 q15, d4 ; op3 += p3
+ vaddw.u8 q15, d11 ; op3 += q3
+ vqrshrn.u16 d26, q15, #4 ; w_op3
+
+ vsubw.u8 q15, d0 ; op2 = op3 - p7
+ vsubw.u8 q15, d4 ; op2 -= p3
+ vaddw.u8 q15, d5 ; op2 += p2
+ vaddw.u8 q15, d12 ; op2 += q4
+ vqrshrn.u16 d27, q15, #4 ; w_op2
+
+ vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m)
+
+ vsubw.u8 q15, d0 ; op1 = op2 - p7
+ vsubw.u8 q15, d5 ; op1 -= p2
+ vaddw.u8 q15, d6 ; op1 += p1
+ vaddw.u8 q15, d13 ; op1 += q5
+ vqrshrn.u16 d18, q15, #4 ; w_op1
+
+ vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m)
+
+ vsubw.u8 q15, d0 ; op0 = op1 - p7
+ vsubw.u8 q15, d6 ; op0 -= p1
+ vaddw.u8 q15, d7 ; op0 += p0
+ vaddw.u8 q15, d14 ; op0 += q6
+ vqrshrn.u16 d19, q15, #4 ; w_op0
+
+ vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m)
+
+ vsubw.u8 q15, d0 ; oq0 = op0 - p7
+ vsubw.u8 q15, d7 ; oq0 -= p0
+ vaddw.u8 q15, d8 ; oq0 += q0
+ vaddw.u8 q15, d15 ; oq0 += q7
+ vqrshrn.u16 d20, q15, #4 ; w_oq0
+
+ vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m)
+
+ vsubw.u8 q15, d1 ; oq1 = oq0 - p6
+ vsubw.u8 q15, d8 ; oq1 -= q0
+ vaddw.u8 q15, d9 ; oq1 += q1
+ vaddw.u8 q15, d15 ; oq1 += q7
+ vqrshrn.u16 d21, q15, #4 ; w_oq1
+
+ vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m)
+
+ vsubw.u8 q15, d2 ; oq2 = oq1 - p5
+ vsubw.u8 q15, d9 ; oq2 -= q1
+ vaddw.u8 q15, d10 ; oq2 += q2
+ vaddw.u8 q15, d15 ; oq2 += q7
+ vqrshrn.u16 d22, q15, #4 ; w_oq2
+
+ vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m)
+
+ vsubw.u8 q15, d3 ; oq3 = oq2 - p4
+ vsubw.u8 q15, d10 ; oq3 -= q2
+ vaddw.u8 q15, d11 ; oq3 += q3
+ vaddw.u8 q15, d15 ; oq3 += q7
+ vqrshrn.u16 d23, q15, #4 ; w_oq3
+
+ vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m)
+
+ vsubw.u8 q15, d4 ; oq4 = oq3 - p3
+ vsubw.u8 q15, d11 ; oq4 -= q3
+ vaddw.u8 q15, d12 ; oq4 += q4
+ vaddw.u8 q15, d15 ; oq4 += q7
+ vqrshrn.u16 d1, q15, #4 ; w_oq4
+
+ vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m)
+
+ vsubw.u8 q15, d5 ; oq5 = oq4 - p2
+ vsubw.u8 q15, d12 ; oq5 -= q4
+ vaddw.u8 q15, d13 ; oq5 += q5
+ vaddw.u8 q15, d15 ; oq5 += q7
+ vqrshrn.u16 d2, q15, #4 ; w_oq5
+
+ vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m)
+
+ vsubw.u8 q15, d6 ; oq6 = oq5 - p1
+ vsubw.u8 q15, d13 ; oq6 -= q5
+ vaddw.u8 q15, d14 ; oq6 += q6
+ vaddw.u8 q15, d15 ; oq6 += q7
+ vqrshrn.u16 d3, q15, #4 ; w_oq6
+
+ vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m)
+ vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m)
+ vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m)
+ vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m)
+ vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m)
+
+ bx lr
+ ENDP ; |vp9_wide_mbfilter_neon|
+
+ END
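
The new wide loop filter reports its per-block decision back to the callers through two flag bits in r7, which select how many output rows or columns actually need to be stored. A compact C outline of that decision structure (names here are assumptions for illustration, not the library's API):

enum { ONLY_FILTER = 1, ONLY_MBFILTER = 2 };

/* Bit 0 set: flat && mask was zero in every lane, so only the four outputs
 * of the basic filter (op1, op0, oq0, oq1) are valid.
 * Bit 1 set: flat2 && flat && mask was zero in every lane, so only the six
 * outputs of the mbfilter (op2..oq2) are valid.
 * Neither set: store all fourteen outputs of the wide filter (op6..oq6). */
static void store_filtered_edge(int flags) {
  if (flags & ONLY_FILTER) {
    /* store op1, op0, oq0, oq1 */
  } else if (flags & ONLY_MBFILTER) {
    /* store op2, op1, op0, oq0, oq1, oq2 */
  } else {
    /* store op6 .. oq6 */
  }
}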
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
index 8e4aadac2..f82966577 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
@@ -22,8 +22,8 @@
MACRO
IDCT8x8_1D
; stage 1
- vdup.16 d0, r3; ; duplicate cospi_28_64
- vdup.16 d1, r4; ; duplicate cospi_4_64
+ vdup.16 d0, r3 ; duplicate cospi_28_64
+ vdup.16 d1, r4 ; duplicate cospi_4_64
; input[1] * cospi_28_64
vmull.s16 q2, d18, d0
@@ -57,8 +57,8 @@
vqrshrn.s32 d14, q2, #14 ; >> 14
vqrshrn.s32 d15, q3, #14 ; >> 14
- vdup.16 d0, r5; ; duplicate cospi_12_64
- vdup.16 d1, r6; ; duplicate cospi_20_64
+ vdup.16 d0, r5 ; duplicate cospi_12_64
+ vdup.16 d1, r6 ; duplicate cospi_20_64
; input[5] * cospi_12_64
vmull.s16 q2, d26, d0
@@ -93,7 +93,7 @@
vqrshrn.s32 d13, q1, #14 ; >> 14
; stage 2 & stage 3 - even half
- vdup.16 d0, r7; ; duplicate cospi_16_64
+ vdup.16 d0, r7 ; duplicate cospi_16_64
; input[0] * cospi_16_64
vmull.s16 q2, d16, d0
@@ -128,8 +128,8 @@
vqrshrn.s32 d23, q3, #14 ; >> 14
; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vdup.16 d0, r8; ; duplicate cospi_24_64
- vdup.16 d1, r9; ; duplicate cospi_8_64
+ vdup.16 d0, r8 ; duplicate cospi_24_64
+ vdup.16 d1, r9 ; duplicate cospi_8_64
; input[1] * cospi_24_64
vmull.s16 q2, d20, d0
@@ -176,7 +176,7 @@
vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
; stage 3 -odd half
- vdup.16 d16, r7; ; duplicate cospi_16_64
+ vdup.16 d16, r7 ; duplicate cospi_16_64
; step2[6] * cospi_16_64
vmull.s16 q9, d28, d16
@@ -211,14 +211,14 @@
vqrshrn.s32 d13, q10, #14 ; >> 14
; stage 4
- vadd.s16 q8, q0, q7; ; output[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6; ; output[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5; ; output[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4; ; output[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4; ; output[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5; ; output[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6; ; output[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q7; ; output[7] = step1[0] - step1[7];
+ vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
MEND
; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
@@ -310,14 +310,14 @@
mov r0, r1
; load destination data
- vld1.u8 {d0}, [r1], r2
- vld1.u8 {d1}, [r1], r2
- vld1.s16 {d2}, [r1], r2
- vld1.s16 {d3}, [r1], r2
- vld1.s16 {d4}, [r1], r2
- vld1.s16 {d5}, [r1], r2
- vld1.s16 {d6}, [r1], r2
- vld1.s16 {d7}, [r1]
+ vld1.64 {d0}, [r1], r2
+ vld1.64 {d1}, [r1], r2
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1]
; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
vaddw.u8 q8, q8, d0
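
The comment above names the reconstruction formula; a hedged C equivalent of that final step (round the inverse-transform output by 5 bits, add the prediction already in dest, clip to 8 bits) might look like:

#include <stdint.h>

/* Illustration only: ROUND_POWER_OF_TWO(temp_out, 5) + dest, clamped to 8 bits. */
static uint8_t reconstruct_pixel(int16_t temp_out, uint8_t dest) {
  const int v = dest + ((temp_out + 16) >> 5);
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}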
diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h
index 129711412..f68c5c6ea 100644
--- a/libvpx/vp9/common/vp9_blockd.h
+++ b/libvpx/vp9/common/vp9_blockd.h
@@ -26,9 +26,6 @@
#include "vp9/common/vp9_treecoder.h"
#define BLOCK_SIZE_GROUPS 4
-
-#define PREDICTION_PROBS 3
-
#define MBSKIP_CONTEXTS 3
/* Segment Feature Masks */
@@ -164,6 +161,11 @@ typedef struct {
union b_mode_info bmi[4];
} MODE_INFO;
+static int is_inter_block(const MB_MODE_INFO *mbmi) {
+ return mbmi->ref_frame[0] > INTRA_FRAME;
+}
+
+
enum mv_precision {
MV_PRECISION_Q3,
MV_PRECISION_Q4
@@ -286,22 +288,22 @@ typedef struct macroblockd {
static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
switch (subsize) {
- case BLOCK_SIZE_SB64X64:
- case BLOCK_SIZE_SB64X32:
- case BLOCK_SIZE_SB32X64:
- case BLOCK_SIZE_SB32X32:
+ case BLOCK_64X64:
+ case BLOCK_64X32:
+ case BLOCK_32X64:
+ case BLOCK_32X32:
return &xd->sb_index;
- case BLOCK_SIZE_SB32X16:
- case BLOCK_SIZE_SB16X32:
- case BLOCK_SIZE_MB16X16:
+ case BLOCK_32X16:
+ case BLOCK_16X32:
+ case BLOCK_16X16:
return &xd->mb_index;
- case BLOCK_SIZE_SB16X8:
- case BLOCK_SIZE_SB8X16:
- case BLOCK_SIZE_SB8X8:
+ case BLOCK_16X8:
+ case BLOCK_8X16:
+ case BLOCK_8X8:
return &xd->b_index;
- case BLOCK_SIZE_SB8X4:
- case BLOCK_SIZE_SB4X8:
- case BLOCK_SIZE_AB4X4:
+ case BLOCK_8X4:
+ case BLOCK_4X8:
+ case BLOCK_4X4:
return &xd->ab_index;
default:
assert(0);
@@ -315,7 +317,7 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
const int bwl = b_width_log2(sb_type);
const int bhl = b_height_log2(sb_type);
- const int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+ const int boffset = b_width_log2(BLOCK_64X64) - bsl;
const char pcval0 = ~(0xe << boffset);
const char pcval1 = ~(0xf << boffset);
const char pcvalue[2] = {pcval0, pcval1};
@@ -333,7 +335,7 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd,
BLOCK_SIZE_TYPE sb_type) {
int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
int above = 0, left = 0, i;
- int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+ int boffset = mi_width_log2(BLOCK_64X64) - bsl;
assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
assert(bsl >= 0);
@@ -366,10 +368,10 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
if (plane_type != PLANE_TYPE_Y_WITH_DC ||
xd->lossless ||
- mbmi->ref_frame[0] != INTRA_FRAME)
+ is_inter_block(mbmi))
return DCT_DCT;
- return mode2txfm_map[mbmi->sb_type < BLOCK_SIZE_SB8X8 ?
+ return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ?
mi->bmi[ib].as_mode : mbmi->mode];
}
@@ -496,16 +498,16 @@ static INLINE void foreach_transformed_block_in_plane(
// it to 4x4 block sizes.
if (xd->mb_to_right_edge < 0)
max_blocks_wide +=
- + (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));
+ (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));
if (xd->mb_to_bottom_edge < 0)
max_blocks_high +=
- + (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));
+ (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));
i = 0;
// Unlike the normal case - in here we have to keep track of the
// row and column of the blocks we use so that we know if we are in
- // the unrestricted motion border..
+ // the unrestricted motion border.
for (r = 0; r < (1 << sh); r += (1 << tx_size)) {
for (c = 0; c < (1 << sw); c += (1 << tx_size)) {
if (r < max_blocks_high && c < max_blocks_wide)
@@ -563,8 +565,8 @@ static INLINE void foreach_predicted_block_in_plane(
// size of the predictor to use.
int pred_w, pred_h;
- if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
- assert(bsize == BLOCK_SIZE_SB8X8);
+ if (xd->mode_info_context->mbmi.sb_type < BLOCK_8X8) {
+ assert(bsize == BLOCK_8X8);
pred_w = 0;
pred_h = 0;
} else {
@@ -689,46 +691,39 @@ static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
}
}
static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
- int plane, int ss_tx_size, int eob, int aoff,
- int loff, ENTROPY_CONTEXT *A,
- ENTROPY_CONTEXT *L) {
- const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
- const int sw = bw - xd->plane[plane].subsampling_x;
- const int sh = bh - xd->plane[plane].subsampling_y;
- int mi_blocks_wide = 1 << sw;
- int mi_blocks_high = 1 << sh;
- int tx_size_in_blocks = (1 << ss_tx_size);
+ int plane, int tx_size_in_blocks,
+ int eob, int aoff, int loff,
+ ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
+ struct macroblockd_plane *pd = &xd->plane[plane];
int above_contexts = tx_size_in_blocks;
int left_contexts = tx_size_in_blocks;
+ int mi_blocks_wide = 1 << plane_block_width_log2by4(bsize, pd);
+ int mi_blocks_high = 1 << plane_block_height_log2by4(bsize, pd);
int pt;
// xd->mb_to_right_edge is in units of pixels * 8. This converts
// it to 4x4 block sizes.
- if (xd->mb_to_right_edge < 0) {
- mi_blocks_wide += (xd->mb_to_right_edge
- >> (5 + xd->plane[plane].subsampling_x));
- }
+ if (xd->mb_to_right_edge < 0)
+ mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
// this code attempts to avoid copying into contexts that are outside
// our border. Any blocks that do are set to 0...
if (above_contexts + aoff > mi_blocks_wide)
above_contexts = mi_blocks_wide - aoff;
- if (xd->mb_to_bottom_edge < 0) {
- mi_blocks_high += (xd->mb_to_bottom_edge
- >> (5 + xd->plane[plane].subsampling_y));
- }
- if (left_contexts + loff > mi_blocks_high) {
+ if (xd->mb_to_bottom_edge < 0)
+ mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+ if (left_contexts + loff > mi_blocks_high)
left_contexts = mi_blocks_high - loff;
- }
for (pt = 0; pt < above_contexts; pt++)
A[pt] = eob > 0;
- for (pt = above_contexts; pt < (1 << ss_tx_size); pt++)
+ for (pt = above_contexts; pt < tx_size_in_blocks; pt++)
A[pt] = 0;
for (pt = 0; pt < left_contexts; pt++)
L[pt] = eob > 0;
- for (pt = left_contexts; pt < (1 << ss_tx_size); pt++)
+ for (pt = left_contexts; pt < tx_size_in_blocks; pt++)
L[pt] = 0;
}
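
The unit conversion the comment in set_contexts_on_border refers to is easy to misread, so a one-line worked example: xd->mb_to_right_edge stores the distance in pixels * 8, and shifting right by (5 + subsampling_x) divides by 32 (or 64 for a subsampled plane), i.e. converts it into a count of 4x4 blocks. A sketch with an assumed helper name:

/* e.g. 16 pixels past the edge is stored as -128; -128 >> 5 == -4 blocks. */
static int edge_in_4x4_blocks(int mb_to_edge, int subsampling) {
  return mb_to_edge >> (5 + subsampling);
}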
diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c
index dee44ec63..fdf37e46a 100644
--- a/libvpx/vp9/common/vp9_common_data.c
+++ b/libvpx/vp9/common/vp9_common_data.c
@@ -31,6 +31,14 @@ const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] =
const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] =
{1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
+// MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize)))
+const int size_group_lookup[BLOCK_SIZE_TYPES] =
+ {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};
+
+const int num_pels_log2_lookup[BLOCK_SIZE_TYPES] =
+ {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};
+
+
const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
{ // 4X4
// 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
@@ -40,25 +48,25 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID
}, { // 8X8
- // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+ // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID
}, { // 16X16
- // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+ // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID
}, { // 32X32
- // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+ // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT,
PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
PARTITION_INVALID, PARTITION_INVALID
}, { // 64X64
- // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+ // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ,
@@ -68,29 +76,29 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = {
{ // PARTITION_NONE
- BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB8X4,
- BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB16X8,
- BLOCK_SIZE_MB16X16, BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB32X16,
- BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64, BLOCK_SIZE_SB64X32,
- BLOCK_SIZE_SB64X64,
+ BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
+ BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
+ BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+ BLOCK_64X64,
}, { // PARTITION_HORZ
BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_SB8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_SB16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_SB32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_SB64X32,
+ BLOCK_8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_64X32,
}, { // PARTITION_VERT
BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_SB4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_SB8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_SB16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_SB32X64,
+ BLOCK_4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_32X64,
}, { // PARTITION_SPLIT
BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_AB4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_SB8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_MB16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
- BLOCK_SIZE_SB32X32,
+ BLOCK_4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+ BLOCK_32X32,
}
};
@@ -108,14 +116,9 @@ const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = {
};
const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = {
- {BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8,
- BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8},
- {BLOCK_SIZE_SB8X4, BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16,
- BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB8X16},
- {BLOCK_SIZE_SB16X8, BLOCK_SIZE_SB16X8, BLOCK_SIZE_MB16X16,
- BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB16X32},
- {BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16,
- BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64},
- {BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32,
- BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X64}
+ { BLOCK_4X4, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8 },
+ { BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_8X16, BLOCK_8X16 },
+ { BLOCK_16X8, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_16X32 },
+ { BLOCK_32X16, BLOCK_32X16, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64 },
+ { BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X64 }
};
diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h
index 8b0f8a500..bc8c01a77 100644
--- a/libvpx/vp9/common/vp9_common_data.h
+++ b/libvpx/vp9/common/vp9_common_data.h
@@ -21,10 +21,9 @@ extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES];
extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES];
extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES];
extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES];
-extern const PARTITION_TYPE
- partition_lookup[][BLOCK_SIZE_TYPES];
-
-
+extern const int size_group_lookup[BLOCK_SIZE_TYPES];
+extern const int num_pels_log2_lookup[BLOCK_SIZE_TYPES];
+extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES];
extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES];
extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES];
diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c
index 0ad0dbccd..df3a9fed5 100644
--- a/libvpx/vp9/common/vp9_entropy.c
+++ b/libvpx/vp9/common/vp9_entropy.c
@@ -73,7 +73,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
13, 11, 14, 15,
};
-DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
0, 8, 1, 16, 9, 2, 17, 24,
10, 3, 18, 25, 32, 11, 4, 26,
33, 19, 40, 12, 34, 27, 5, 41,
@@ -419,7 +419,7 @@ static void init_bit_trees() {
init_bit_tree(cat6, 14);
}
-vp9_extra_bit vp9_extra_bits[12] = {
+const vp9_extra_bit vp9_extra_bits[12] = {
{ 0, 0, 0, 0},
{ 0, 0, 0, 1},
{ 0, 0, 0, 2},
@@ -437,14 +437,10 @@ vp9_extra_bit vp9_extra_bits[12] = {
#include "vp9/common/vp9_default_coef_probs.h"
void vp9_default_coef_probs(VP9_COMMON *pc) {
- vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4,
- sizeof(pc->fc.coef_probs[TX_4X4]));
- vpx_memcpy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8,
- sizeof(pc->fc.coef_probs[TX_8X8]));
- vpx_memcpy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16,
- sizeof(pc->fc.coef_probs[TX_16X16]));
- vpx_memcpy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32,
- sizeof(pc->fc.coef_probs[TX_32X32]));
+ vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
+ vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
+ vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
+ vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
}
// Neighborhood 5-tuples for various scans and blocksizes,
@@ -613,17 +609,17 @@ void vp9_coef_tree_initialize() {
#define COEF_COUNT_SAT_AFTER_KEY 24
#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
-static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size,
- int count_sat, int update_factor) {
+static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
+ unsigned int count_sat,
+ unsigned int update_factor) {
FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
- vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size];
- vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[txfm_size];
- vp9_coeff_count_model *coef_counts = cm->counts.coef[txfm_size];
+ vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size];
+ vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
+ vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size];
unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
- cm->counts.eob_branch[txfm_size];
- int t, i, j, k, l, count;
- int factor;
+ cm->counts.eob_branch[tx_size];
+ int t, i, j, k, l;
unsigned int branch_ct[UNCONSTRAINED_NODES][2];
vp9_prob coef_probs[UNCONSTRAINED_NODES];
int entropy_nodes_adapt = UNCONSTRAINED_NODES;
@@ -634,29 +630,23 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size,
for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
if (l >= 3 && k == 0)
continue;
- vp9_tree_probs_from_distribution(
- vp9_coefmodel_tree,
- coef_probs, branch_ct,
- coef_counts[i][j][k][l], 0);
+ vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs,
+ branch_ct, coef_counts[i][j][k][l],
+ 0);
branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
- for (t = 0; t < entropy_nodes_adapt; ++t) {
- count = branch_ct[t][0] + branch_ct[t][1];
- count = count > count_sat ? count_sat : count;
- factor = (update_factor * count / count_sat);
- dst_coef_probs[i][j][k][l][t] =
- weighted_prob(pre_coef_probs[i][j][k][l][t],
- coef_probs[t], factor);
- }
+ for (t = 0; t < entropy_nodes_adapt; ++t)
+ dst_coef_probs[i][j][k][l][t] = merge_probs(
+ pre_coef_probs[i][j][k][l][t], coef_probs[t],
+ branch_ct[t], count_sat, update_factor);
}
}
void vp9_adapt_coef_probs(VP9_COMMON *cm) {
TX_SIZE t;
- int count_sat;
- int update_factor; /* denominator 256 */
+ unsigned int count_sat, update_factor;
- if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+ if (cm->frame_type == KEY_FRAME || cm->intra_only) {
update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
count_sat = COEF_COUNT_SAT_KEY;
} else if (cm->last_frame_type == KEY_FRAME) {
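
The adapt_coef_probs rewrite above folds the hand-written count/factor/weighted_prob sequence into merge_probs. Based on the code being removed, the blend it performs is roughly the following (a sketch with assumed names on the usual 0..256 probability scale, not the exact library routine):

static unsigned char merge_prob_sketch(unsigned char pre_prob,
                                       unsigned char new_prob,
                                       unsigned int ct0, unsigned int ct1,
                                       unsigned int count_sat,
                                       unsigned int update_factor) {
  const unsigned int count  = ct0 + ct1 < count_sat ? ct0 + ct1 : count_sat;
  const unsigned int factor = update_factor * count / count_sat;
  /* weighted average of the previous and the newly measured probability */
  return (unsigned char)((pre_prob * (256 - factor) +
                          new_prob * factor + 128) >> 8);
}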
diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h
index 4ea727ff4..861c0786c 100644
--- a/libvpx/vp9/common/vp9_entropy.h
+++ b/libvpx/vp9/common/vp9_entropy.h
@@ -50,7 +50,7 @@ typedef struct {
int base_val;
} vp9_extra_bit;
-extern vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */
+extern const vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */
#define MAX_PROB 255
#define DCT_MAX_VALUE 16384
@@ -80,7 +80,6 @@ extern vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */
coefficient band (and since zigzag positions 0, 1, and 2 are in
distinct bands). */
-/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */
#define PREV_COEF_CONTEXTS 6
// #define ENTROPY_STATS
@@ -102,7 +101,7 @@ extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
-extern DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
@@ -119,7 +118,7 @@ extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-extern DECLARE_ALIGNED(64, int16_t, vp9_default_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c
index ca188e438..768e5f523 100644
--- a/libvpx/vp9/common/vp9_entropymode.c
+++ b/libvpx/vp9/common/vp9_entropymode.c
@@ -356,53 +356,15 @@ void vp9_entropy_mode_init() {
vp9_inter_mode_tree, NEARESTMV);
}
-void vp9_accum_mv_refs(VP9_COMMON *pc,
- MB_PREDICTION_MODE m,
- const int context) {
- unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
- pc->counts.inter_mode;
-
- if (m == ZEROMV) {
- ++inter_mode_counts[context][0][0];
- } else {
- ++inter_mode_counts[context][0][1];
- if (m == NEARESTMV) {
- ++inter_mode_counts[context][1][0];
- } else {
- ++inter_mode_counts[context][1][1];
- if (m == NEARMV) {
- ++inter_mode_counts[context][2][0];
- } else {
- ++inter_mode_counts[context][2][1];
- }
- }
- }
-}
-
#define COUNT_SAT 20
#define MAX_UPDATE_FACTOR 128
-static int update_ct(vp9_prob pre_prob, vp9_prob prob,
- unsigned int ct[2]) {
- const int count = MIN(ct[0] + ct[1], COUNT_SAT);
- const int factor = MAX_UPDATE_FACTOR * count / COUNT_SAT;
- return weighted_prob(pre_prob, prob, factor);
+static int update_ct(vp9_prob pre_prob, vp9_prob prob, unsigned int ct[2]) {
+ return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
}
static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) {
- return update_ct(pre_prob, get_binary_prob(ct[0], ct[1]), ct);
-}
-
-void vp9_adapt_mode_context(VP9_COMMON *pc) {
- int i, j;
- FRAME_CONTEXT *const fc = &pc->fc;
- FRAME_CONTEXT *const pre_fc = &pc->frame_contexts[pc->frame_context_idx];
- FRAME_COUNTS *const counts = &pc->counts;
-
- for (j = 0; j < INTER_MODE_CONTEXTS; j++)
- for (i = 0; i < VP9_INTER_MODES - 1; i++)
- fc->inter_mode_probs[j][i] = update_ct2(pre_fc->inter_mode_probs[j][i],
- counts->inter_mode[j][i]);
+ return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
}
static void update_mode_probs(int n_modes,
@@ -440,6 +402,11 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j],
counts->single_ref[i][j]);
+ for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+ update_mode_probs(VP9_INTER_MODES, vp9_inter_mode_tree,
+ counts->inter_mode[i], pre_fc->inter_mode_probs[i],
+ fc->inter_mode_probs[i], NEARESTMV);
+
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
counts->y_mode[i], pre_fc->y_mode_prob[i],
@@ -466,25 +433,25 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
if (cm->tx_mode == TX_MODE_SELECT) {
int j;
- unsigned int branch_ct_8x8p[TX_SIZE_MAX_SB - 3][2];
- unsigned int branch_ct_16x16p[TX_SIZE_MAX_SB - 2][2];
- unsigned int branch_ct_32x32p[TX_SIZE_MAX_SB - 1][2];
+ unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
+ unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
+ unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
- for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j)
+ for (j = 0; j < TX_SIZES - 3; ++j)
fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i],
branch_ct_16x16p);
- for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j)
+ for (j = 0; j < TX_SIZES - 2; ++j)
fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i],
branch_ct_32x32p);
- for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j)
+ for (j = 0; j < TX_SIZES - 1; ++j)
fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
branch_ct_32x32p[j]);
}
@@ -495,22 +462,24 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
counts->mbskip[i]);
}
-static void set_default_lf_deltas(MACROBLOCKD *xd) {
- xd->lf.mode_ref_delta_enabled = 1;
- xd->lf.mode_ref_delta_update = 1;
+static void set_default_lf_deltas(struct loopfilter *lf) {
+ lf->mode_ref_delta_enabled = 1;
+ lf->mode_ref_delta_update = 1;
- xd->lf.ref_deltas[INTRA_FRAME] = 1;
- xd->lf.ref_deltas[LAST_FRAME] = 0;
- xd->lf.ref_deltas[GOLDEN_FRAME] = -1;
- xd->lf.ref_deltas[ALTREF_FRAME] = -1;
+ lf->ref_deltas[INTRA_FRAME] = 1;
+ lf->ref_deltas[LAST_FRAME] = 0;
+ lf->ref_deltas[GOLDEN_FRAME] = -1;
+ lf->ref_deltas[ALTREF_FRAME] = -1;
- xd->lf.mode_deltas[0] = 0;
- xd->lf.mode_deltas[1] = 0;
+ lf->mode_deltas[0] = 0;
+ lf->mode_deltas[1] = 0;
}
void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
// Reset the segment feature data to the default stats:
// Features disabled, 0, with delta coding (Default state).
+ struct loopfilter *const lf = &xd->lf;
+
int i;
vp9_clearall_segfeatures(&xd->seg);
xd->seg.abs_delta = SEGMENT_DELTADATA;
@@ -518,12 +487,12 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
// Reset the mode ref deltas for loop filter
- vp9_zero(xd->lf.last_ref_deltas);
- vp9_zero(xd->lf.last_mode_deltas);
- set_default_lf_deltas(xd);
+ vp9_zero(lf->last_ref_deltas);
+ vp9_zero(lf->last_mode_deltas);
+ set_default_lf_deltas(lf);
// To force update of the sharpness
- xd->lf.last_sharpness_level = -1;
+ lf->last_sharpness_level = -1;
vp9_default_coef_probs(cm);
vp9_init_mbmode_probs(cm);
diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h
index 8c14e7e17..17a7c2634 100644
--- a/libvpx/vp9/common/vp9_entropymode.h
+++ b/libvpx/vp9/common/vp9_entropymode.h
@@ -24,15 +24,15 @@
struct VP9Common;
struct tx_probs {
- vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
- vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
- vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
+ vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
+ vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
+ vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
};
struct tx_counts {
- unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB];
- unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
- unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+ unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
+ unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
+ unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
};
extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
@@ -61,18 +61,12 @@ extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
void vp9_entropy_mode_init();
-int vp9_mv_cont(const int_mv *l, const int_mv *a);
-
void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd);
void vp9_init_mbmode_probs(struct VP9Common *x);
-void vp9_adapt_mode_context(struct VP9Common *pc);
-
void vp9_adapt_mode_probs(struct VP9Common *);
-void vp9_accum_mv_refs(struct VP9Common *pc, MB_PREDICTION_MODE m, int context);
-
void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
unsigned int (*ct_32x32p)[2]);
void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c
index 343b6241d..6cfc34697 100644
--- a/libvpx/vp9/common/vp9_entropymv.c
+++ b/libvpx/vp9/common/vp9_entropymv.c
@@ -16,7 +16,7 @@
#define MV_MAX_UPDATE_FACTOR 128
/* Integer pel reference mv threshold for use of high-precision 1/8 mv */
-#define COMPANDED_MVREF_THRESH 8
+#define COMPANDED_MVREF_THRESH 8
const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
-MV_JOINT_ZERO, 2,
@@ -107,12 +107,6 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
return mv_class_base(c) + offset;
}
-static void inc_mv_component_count(int v, nmv_component_counts *comp_counts,
- int incr) {
- assert (v != 0);
- comp_counts->mvcount[MV_MAX + v] += incr;
-}
-
static void inc_mv_component(int v, nmv_component_counts *comp_counts,
int incr, int usehp) {
int s, z, c, o, d, e, f;
@@ -164,25 +158,19 @@ static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
}
}
-void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx) {
+void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
- mvctx->joints[j]++;
+ ++counts->joints[j];
+
if (mv_joint_vertical(j))
- inc_mv_component_count(mv->row, &mvctx->comps[0], 1);
+ ++counts->comps[0].mvcount[MV_MAX + mv->row];
if (mv_joint_horizontal(j))
- inc_mv_component_count(mv->col, &mvctx->comps[1], 1);
+ ++counts->comps[1].mvcount[MV_MAX + mv->col];
}
-static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) {
- const int count = MIN(ct[0] + ct[1], MV_COUNT_SAT);
- if (count) {
- const vp9_prob newp = get_binary_prob(ct[0], ct[1]);
- const int factor = MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT;
- *dest = weighted_prob(prep, newp, factor);
- } else {
- *dest = prep;
- }
+static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
+ return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
}
void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
@@ -195,31 +183,22 @@ static unsigned int adapt_probs(unsigned int i,
vp9_prob this_probs[],
const vp9_prob last_probs[],
const unsigned int num_events[]) {
- vp9_prob this_prob;
- const uint32_t left = tree[i] <= 0
+
+ const unsigned int left = tree[i] <= 0
? num_events[-tree[i]]
: adapt_probs(tree[i], tree, this_probs, last_probs, num_events);
- const uint32_t right = tree[i + 1] <= 0
+ const unsigned int right = tree[i + 1] <= 0
? num_events[-tree[i + 1]]
: adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events);
-
- uint32_t weight = left + right;
- if (weight) {
- this_prob = get_binary_prob(left, right);
- weight = weight > MV_COUNT_SAT ? MV_COUNT_SAT : weight;
- this_prob = weighted_prob(last_probs[i >> 1], this_prob,
- MV_MAX_UPDATE_FACTOR * weight / MV_COUNT_SAT);
- } else {
- this_prob = last_probs[i >> 1];
- }
- this_probs[i >> 1] = this_prob;
+ const unsigned int ct[2] = { left, right };
+ this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct);
return left + right;
}
-void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) {
+void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
int i, j;
FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
@@ -228,36 +207,32 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) {
nmv_context *pre_ctx = &pre_fc->nmvc;
nmv_context_counts *cts = &cm->counts.mv;
- vp9_counts_process(cts, usehp);
+ vp9_counts_process(cts, allow_hp);
adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
for (i = 0; i < 2; ++i) {
- adapt_prob(&ctx->comps[i].sign, pre_ctx->comps[i].sign, cts->comps[i].sign);
+ ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign);
adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes,
pre_ctx->comps[i].classes, cts->comps[i].classes);
adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0,
pre_ctx->comps[i].class0, cts->comps[i].class0);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- adapt_prob(&ctx->comps[i].bits[j], pre_ctx->comps[i].bits[j],
- cts->comps[i].bits[j]);
- }
+ ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j],
+ cts->comps[i].bits[j]);
- for (i = 0; i < 2; ++i) {
for (j = 0; j < CLASS0_SIZE; ++j)
adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j],
pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]);
adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp,
cts->comps[i].fp);
- }
- if (usehp) {
- for (i = 0; i < 2; ++i) {
- adapt_prob(&ctx->comps[i].class0_hp, pre_ctx->comps[i].class0_hp,
- cts->comps[i].class0_hp);
- adapt_prob(&ctx->comps[i].hp, pre_ctx->comps[i].hp, cts->comps[i].hp);
+ if (allow_hp) {
+ ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp,
+ cts->comps[i].class0_hp);
+ ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp);
}
}
}
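
A note on the rewritten counting path above: vp9_inc_mv() now indexes the per-component magnitude histograms directly with MV_MAX + component rather than going through inc_mv_component_count(). A minimal sketch of what one call records, assuming the usual vp9 entropy headers (the concrete values are illustrative only):

  // Count a single motion vector with row = -3, col = 5 (1/8-pel units).
  MV mv = { -3, 5 };
  nmv_context_counts counts = { { 0 } };
  vp9_inc_mv(&mv, &counts);
  // counts.joints[MV_JOINT_HNZVNZ] == 1       (both components non-zero)
  // counts.comps[0].mvcount[MV_MAX - 3] == 1  (row, i.e. vertical component)
  // counts.comps[1].mvcount[MV_MAX + 5] == 1  (col, i.e. horizontal component)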
diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h
index 86f0d0bfd..3208b7270 100644
--- a/libvpx/vp9/common/vp9_enums.h
+++ b/libvpx/vp9/common/vp9_enums.h
@@ -54,7 +54,7 @@ typedef enum {
TX_8X8 = 1, // 8x8 dct transform
TX_16X16 = 2, // 16x16 dct transform
TX_32X32 = 3, // 32x32 dct transform
- TX_SIZE_MAX_SB, // Number of transforms available to SBs
+ TX_SIZES
} TX_SIZE;
typedef enum {
@@ -63,7 +63,7 @@ typedef enum {
ALLOW_16X16 = 2,
ALLOW_32X32 = 3,
TX_MODE_SELECT = 4,
- NB_TXFM_MODES = 5,
+ TX_MODES = 5,
} TX_MODE;
typedef enum {
diff --git a/libvpx/vp9/common/vp9_extend.c b/libvpx/vp9/common/vp9_extend.c
index 95ec59061..d8496c4f2 100644
--- a/libvpx/vp9/common/vp9_extend.c
+++ b/libvpx/vp9/common/vp9_extend.c
@@ -8,9 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vp9/common/vp9_extend.h"
#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_extend.h"
+
static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
uint8_t *dst, int dst_pitch,
int w, int h,
@@ -107,14 +109,14 @@ void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
const int src_y_offset = srcy * src->y_stride + srcx;
const int dst_y_offset = srcy * dst->y_stride + srcx;
- const int et_uv = (et_y + 1) >> 1;
- const int el_uv = (el_y + 1) >> 1;
- const int eb_uv = (eb_y + 1) >> 1;
- const int er_uv = (er_y + 1) >> 1;
+ const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
+ const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
+ const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
+ const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
- const int srch_uv = (srch + 1) >> 1;
- const int srcw_uv = (srcw + 1) >> 1;
+ const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
+ const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
dst->y_buffer + dst_y_offset, dst->y_stride,
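
The switch to ROUND_POWER_OF_TWO above is purely cosmetic for the chroma extents: assuming the macro is the usual one from vp9/common/vp9_common.h (hence the added include),

  #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

ROUND_POWER_OF_TWO(x, 1) expands to (x + 1) >> 1, the same rounded halving the hand-written expressions performed (for example ROUND_POWER_OF_TWO(7, 1) == 4 and ROUND_POWER_OF_TWO(37, 3) == 5).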
diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c
index 643b229a6..3af8b8d21 100644
--- a/libvpx/vp9/common/vp9_findnearmv.c
+++ b/libvpx/vp9/common/vp9_findnearmv.c
@@ -14,8 +14,9 @@
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_sadmxn.h"
-static void lower_mv_precision(int_mv *mv, int usehp) {
- if (!usehp || !vp9_use_mv_hp(&mv->as_mv)) {
+static void lower_mv_precision(int_mv *mv, int allow_hp) {
+ const int use_hp = allow_hp && vp9_use_mv_hp(&mv->as_mv);
+ if (!use_hp) {
if (mv->as_mv.row & 1)
mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
if (mv->as_mv.col & 1)
@@ -32,7 +33,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
// Make sure all the candidates are properly clamped etc
for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv);
- clamp_mv2(&mvlist[i], xd);
+ clamp_mv2(&mvlist[i].as_mv, xd);
}
*nearest = mvlist[0];
*near = mvlist[1];
@@ -41,7 +42,8 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
int_mv *dst_nearest,
int_mv *dst_near,
- int block_idx, int ref_idx) {
+ int block_idx, int ref_idx,
+ int mi_row, int mi_col) {
int_mv dst_list[MAX_MV_REF_CANDIDATES];
int_mv mv_list[MAX_MV_REF_CANDIDATES];
MODE_INFO *mi = xd->mode_info_context;
@@ -53,7 +55,8 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context,
xd->prev_mode_info_context,
mbmi->ref_frame[ref_idx],
- mv_list, cm->ref_frame_sign_bias, block_idx);
+ mv_list, cm->ref_frame_sign_bias, block_idx,
+ mi_row, mi_col);
dst_list[1].as_int = 0;
if (block_idx == 0) {
diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h
index b0fa505b5..e5221ed67 100644
--- a/libvpx/vp9/common/vp9_findnearmv.h
+++ b/libvpx/vp9/common/vp9_findnearmv.h
@@ -29,31 +29,19 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
int_mv *near);
// TODO(jingning): this mv clamping function should be block size dependent.
-static void clamp_mv(int_mv *mv,
- int mb_to_left_edge,
- int mb_to_right_edge,
- int mb_to_top_edge,
- int mb_to_bottom_edge) {
- mv->as_mv.col = clamp(mv->as_mv.col, mb_to_left_edge, mb_to_right_edge);
- mv->as_mv.row = clamp(mv->as_mv.row, mb_to_top_edge, mb_to_bottom_edge);
-}
-
-static int clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
- int_mv tmp_mv;
- tmp_mv.as_int = mv->as_int;
- clamp_mv(mv,
- xd->mb_to_left_edge - LEFT_TOP_MARGIN,
- xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
- xd->mb_to_top_edge - LEFT_TOP_MARGIN,
- xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
- return tmp_mv.as_int != mv->as_int;
+static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+ clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+ xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+ xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
}
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc,
MACROBLOCKD *xd,
int_mv *dst_nearest,
int_mv *dst_near,
- int block_idx, int ref_idx);
+ int block_idx, int ref_idx,
+ int mi_row, int mi_col);
static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
// FIXME(rbultje, jingning): temporary hack because jenkins doesn't
@@ -62,7 +50,7 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
/* On L edge, get from MB to left of us */
--cur_mb;
- if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
+ if (is_inter_block(&cur_mb->mbmi)) {
return DC_PRED;
} else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
return ((cur_mb->bmi + 1 + b)->as_mode);
@@ -80,7 +68,7 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
/* On top edge, get from MB above us */
cur_mb -= mi_stride;
- if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
+ if (is_inter_block(&cur_mb->mbmi)) {
return DC_PRED;
} else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
return ((cur_mb->bmi + 2 + b)->as_mode);
diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c
index a95560a55..a2245259e 100644
--- a/libvpx/vp9/common/vp9_idct.c
+++ b/libvpx/vp9/common/vp9_idct.c
@@ -225,6 +225,19 @@ void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
}
}
+void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+ int i, j;
+ int a1;
+ int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ for (j = 0; j < 8; ++j) {
+ for (i = 0; i < 8; ++i)
+ dest[i] = clip_pixel(dest[i] + a1);
+ dest += dest_stride;
+ }
+}
+
static void iadst4_1d(int16_t *input, int16_t *output) {
int s0, s1, s2, s3, s4, s5, s6, s7;
@@ -433,12 +446,6 @@ void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
}
}
-void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
- int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
- out = dct_const_round_shift(out * cospi_16_64);
- output[0] = ROUND_POWER_OF_TWO(out, 5);
-}
-
static void idct16_1d(int16_t *input, int16_t *output) {
int16_t step1[16], step2[16];
int temp1, temp2;
@@ -857,10 +864,18 @@ void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
}
}
-void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
+void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ int i, j;
+ int a1;
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
out = dct_const_round_shift(out * cospi_16_64);
- output[0] = ROUND_POWER_OF_TWO(out, 6);
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ for (j = 0; j < 16; ++j) {
+ for (i = 0; i < 16; ++i)
+ dest[i] = clip_pixel(dest[i] + a1);
+ dest += dest_stride;
+ }
}
static void idct32_1d(int16_t *input, int16_t *output) {
@@ -1259,29 +1274,3 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
out = dct_const_round_shift(out * cospi_16_64);
output[0] = ROUND_POWER_OF_TWO(out, 6);
}
-
-void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
- int dest_stride) {
- int16_t out[32 * 32] = { 0 };
- int16_t *outptr = out;
- int i, j;
- int16_t temp_in[32], temp_out[32];
-
- // First transform rows. Since all non-zero dct coefficients are in
- // upper-left 4x4 area, we only need to calculate first 4 rows here.
- for (i = 0; i < 4; ++i) {
- idct32_1d(input, outptr);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j)
- temp_in[j] = out[j * 32 + i];
- idct32_1d(temp_in, temp_out);
- for (j = 0; j < 32; ++j)
- dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * dest_stride + i]);
- }
-}
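
The removed DC-only helpers are superseded by the new *_1_add variants, which fold the reconstruction add into the same call. A hedged usage sketch (the caller is assumed to have already determined that only the DC coefficient of the dequantized block is non-zero):

  // dqcoeff[0] is the only non-zero dequantized coefficient; every pixel of
  // the 8x8 destination gets the same clipped DC offset added to it.
  int16_t dqcoeff[64] = { 163 };   // illustrative DC value
  uint8_t recon[8 * 8] = { 0 };    // prediction block, stride 8 (all zero here)
  vp9_short_idct8x8_1_add_c(dqcoeff, recon, 8);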
diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c
index 5498b1717..66df62753 100644
--- a/libvpx/vp9/common/vp9_loopfilter.c
+++ b/libvpx/vp9/common/vp9_loopfilter.c
@@ -16,6 +16,12 @@
#include "vp9/common/vp9_seg_common.h"
+struct loop_filter_info {
+ const uint8_t *mblim;
+ const uint8_t *lim;
+ const uint8_t *hev_thr;
+};
+
static void lf_init_lut(loop_filter_info_n *lfi) {
lfi->mode_lf_lut[DC_PRED] = 0;
lfi->mode_lf_lut[D45_PRED] = 0;
@@ -73,13 +79,14 @@ void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) {
void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
int default_filt_lvl) {
- int seg;
+ int seg_id;
// n_shift is the multiplier for lf_deltas
// the multiplier is 1 when filter_lvl is between 0 and 31;
// 2 when filter_lvl is between 32 and 63
const int n_shift = default_filt_lvl >> 5;
loop_filter_info_n *const lfi = &cm->lf_info;
- struct loopfilter *lf = &xd->lf;
+ struct loopfilter *const lf = &xd->lf;
+ struct segmentation *const seg = &xd->seg;
// update limits if sharpness has changed
if (lf->last_sharpness_level != lf->sharpness_level) {
@@ -87,13 +94,13 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
lf->last_sharpness_level = lf->sharpness_level;
}
- for (seg = 0; seg < MAX_SEGMENTS; seg++) {
+ for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
int lvl_seg = default_filt_lvl, ref, mode, intra_lvl;
// Set the baseline filter values for each segment
- if (vp9_segfeature_active(&xd->seg, seg, SEG_LVL_ALT_LF)) {
- const int data = vp9_get_segdata(&xd->seg, seg, SEG_LVL_ALT_LF);
- lvl_seg = xd->seg.abs_delta == SEGMENT_ABSDATA
+ if (vp9_segfeature_active(&xd->seg, seg_id, SEG_LVL_ALT_LF)) {
+ const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
+ lvl_seg = seg->abs_delta == SEGMENT_ABSDATA
? data
: clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER);
}
@@ -101,18 +108,18 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
if (!lf->mode_ref_delta_enabled) {
// we could get rid of this if we assume that deltas are set to
// zero when not in use; encoder always uses deltas
- vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4);
+ vpx_memset(lfi->lvl[seg_id][0], lvl_seg, 4 * 4);
continue;
}
intra_lvl = lvl_seg + (lf->ref_deltas[INTRA_FRAME] << n_shift);
- lfi->lvl[seg][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
+ lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref)
for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
const int inter_lvl = lvl_seg + (lf->ref_deltas[ref] << n_shift)
+ (lf->mode_deltas[mode] << n_shift);
- lfi->lvl[seg][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+ lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
}
}
}
@@ -256,7 +263,7 @@ static void filter_block_plane(VP9_COMMON *const cm,
// Determine the vertical edges that need filtering
for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
const int skip_this = mi[c].mbmi.mb_skip_coeff
- && mi[c].mbmi.ref_frame[0] != INTRA_FRAME;
+ && is_inter_block(&mi[c].mbmi);
// left edge of current unit is block/partition edge -> no skip
const int block_edge_left = b_width_log2(mi[c].mbmi.sb_type) ?
!(c & ((1 << (b_width_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1;
@@ -376,3 +383,11 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
vp9_loop_filter_rows(cm->frame_to_show, cm, xd,
0, cm->mi_rows, y_only);
}
+
+int vp9_loop_filter_worker(void *arg1, void *arg2) {
+ LFWorkerData *const lf_data = (LFWorkerData*)arg1;
+ (void)arg2;
+ vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, &lf_data->xd,
+ lf_data->start, lf_data->stop, lf_data->y_only);
+ return 1;
+}
diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h
index e59cc6485..5fc909495 100644
--- a/libvpx/vp9/common/vp9_loopfilter.h
+++ b/libvpx/vp9/common/vp9_loopfilter.h
@@ -35,13 +35,6 @@ typedef struct {
uint8_t mode_lf_lut[MB_MODE_COUNT];
} loop_filter_info_n;
-struct loop_filter_info {
- const uint8_t *mblim;
- const uint8_t *lim;
- const uint8_t *hev_thr;
-};
-
-
/* assorted loopfilter functions which get used elsewhere */
struct VP9Common;
struct macroblockd;
@@ -64,4 +57,18 @@ void vp9_loop_filter_frame(struct VP9Common *cm,
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
struct VP9Common *cm, struct macroblockd *xd,
int start, int stop, int y_only);
+
+typedef struct LoopFilterWorkerData {
+ const YV12_BUFFER_CONFIG *frame_buffer;
+ struct VP9Common *cm;
+ struct macroblockd xd; // TODO(jzern): most of this is unnecessary to the
+ // loopfilter. the planes are necessary as their state
+ // is changed during decode.
+ int start;
+ int stop;
+ int y_only;
+} LFWorkerData;
+
+// Operates on the rows described by LFWorkerData passed as 'arg1'.
+int vp9_loop_filter_worker(void *arg1, void *arg2);
#endif // VP9_COMMON_VP9_LOOPFILTER_H_
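
The worker entry point and LFWorkerData are what the new multithreaded decode path hands to its thread; a minimal sketch of how a caller might fill the struct for a whole-frame pass (cm and xd are assumed to be the decoder's existing common and per-thread macroblock state):

  LFWorkerData lf_data;
  lf_data.frame_buffer = cm->frame_to_show;
  lf_data.cm = cm;
  lf_data.xd = *xd;            // the worker keeps its own copy of the plane state
  lf_data.start = 0;           // first row to filter, in mi units
  lf_data.stop = cm->mi_rows;  // one past the last row
  lf_data.y_only = 0;          // also filter chroma
  vp9_loop_filter_worker(&lf_data, NULL);  // returns 1 when the rows are done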
diff --git a/libvpx/vp9/common/vp9_mv.h b/libvpx/vp9/common/vp9_mv.h
index a095258be..31a79b984 100644
--- a/libvpx/vp9/common/vp9_mv.h
+++ b/libvpx/vp9/common/vp9_mv.h
@@ -13,6 +13,8 @@
#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+
typedef struct {
int16_t row;
int16_t col;
@@ -28,4 +30,10 @@ typedef struct {
int32_t col;
} MV32;
+static void clamp_mv(MV *mv, int min_col, int max_col,
+ int min_row, int max_row) {
+ mv->col = clamp(mv->col, min_col, max_col);
+ mv->row = clamp(mv->row, min_row, max_row);
+}
+
#endif // VP9_COMMON_VP9_MV_H_
diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c
index ae009b0ff..3b72f41c2 100644
--- a/libvpx/vp9/common/vp9_mvref_common.c
+++ b/libvpx/vp9/common/vp9_mvref_common.c
@@ -11,6 +11,65 @@
#include "vp9/common/vp9_mvref_common.h"
#define MVREF_NEIGHBOURS 8
+
+typedef enum {
+ BOTH_ZERO = 0,
+ ZERO_PLUS_PREDICTED = 1,
+ BOTH_PREDICTED = 2,
+ NEW_PLUS_NON_INTRA = 3,
+ BOTH_NEW = 4,
+ INTRA_PLUS_NON_INTRA = 5,
+ BOTH_INTRA = 6,
+ INVALID_CASE = 9
+} motion_vector_context;
+
+// This is used to figure out a context for the ref blocks. The code flattens
+// what would be an array of 3 counts (each 0, 1 or 2) for 3 choices into a
+// single number by adding 9 for each intra block, 3 for each zero mv and 1
+// for each new motion vector. This number is then converted into a context
+// with a single lookup (counter_to_context).
+static const int mode_2_counter[MB_MODE_COUNT] = {
+ 9, // DC_PRED
+ 9, // V_PRED
+ 9, // H_PRED
+ 9, // D45_PRED
+ 9, // D135_PRED
+ 9, // D117_PRED
+ 9, // D153_PRED
+ 9, // D27_PRED
+ 9, // D63_PRED
+ 9, // TM_PRED
+ 0, // NEARESTMV
+ 0, // NEARMV
+ 3, // ZEROMV
+ 1, // NEWMV
+};
+
+// There are 3^3 different combinations of 3 counts that can each be 0, 1 or
+// 2. However, at most two neighbours contribute, so the flattened counter can
+// never exceed 18; sums that cannot occur are marked INVALID_CASE below.
+static const int counter_to_context[19] = {
+ BOTH_PREDICTED, // 0
+ NEW_PLUS_NON_INTRA, // 1
+ BOTH_NEW, // 2
+ ZERO_PLUS_PREDICTED, // 3
+ NEW_PLUS_NON_INTRA, // 4
+ INVALID_CASE, // 5
+ BOTH_ZERO, // 6
+ INVALID_CASE, // 7
+ INVALID_CASE, // 8
+ INTRA_PLUS_NON_INTRA, // 9
+ INTRA_PLUS_NON_INTRA, // 10
+ INVALID_CASE, // 11
+ INTRA_PLUS_NON_INTRA, // 12
+ INVALID_CASE, // 13
+ INVALID_CASE, // 14
+ INVALID_CASE, // 15
+ INVALID_CASE, // 16
+ INVALID_CASE, // 17
+ BOTH_INTRA // 18
+};
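
To make the flattening concrete: two spatial neighbours both coded with ZEROMV contribute 3 + 3 = 6, and counter_to_context[6] is BOTH_ZERO; a NEWMV neighbour (1) next to an intra neighbour (9) sums to 10, which maps to INTRA_PLUS_NON_INTRA. The sums that can never be produced by at most two neighbours (5, 7, 8, 11, 13-17) are exactly the INVALID_CASE entries.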
+
static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = {
// SB4X4
{{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
@@ -39,263 +98,212 @@ static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = {
// SB64X64
{{3, -1}, {-1, 3}, {4, -1}, {-1, 4}, {-1, -1}, {0, -1}, {-1, 0}, {6, -1}}
};
+
+static const int idx_n_column_to_subblock[4][2] = {
+ {1, 2},
+ {1, 3},
+ {3, 2},
+ {3, 3}
+};
+
// clamp_mv_ref
#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) {
- mv->as_mv.col = clamp(mv->as_mv.col, xd->mb_to_left_edge - MV_BORDER,
- xd->mb_to_right_edge + MV_BORDER);
- mv->as_mv.row = clamp(mv->as_mv.row, xd->mb_to_top_edge - MV_BORDER,
- xd->mb_to_bottom_edge + MV_BORDER);
-}
-
-// Gets a candidate reference motion vector from the given mode info
-// structure if one exists that matches the given reference frame.
-static int get_matching_candidate(const MODE_INFO *candidate_mi,
- MV_REFERENCE_FRAME ref_frame,
- int_mv *c_mv, int block_idx) {
- if (ref_frame == candidate_mi->mbmi.ref_frame[0]) {
- if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
- c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[0].as_int;
- else
- c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
- } else if (ref_frame == candidate_mi->mbmi.ref_frame[1]) {
- if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
- c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[1].as_int;
- else
- c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
- } else {
- return 0;
- }
-
- return 1;
+ clamp_mv(&mv->as_mv, xd->mb_to_left_edge - MV_BORDER,
+ xd->mb_to_right_edge + MV_BORDER,
+ xd->mb_to_top_edge - MV_BORDER,
+ xd->mb_to_bottom_edge + MV_BORDER);
}
-// Gets candidate reference motion vector(s) from the given mode info
-// structure if they exists and do NOT match the given reference frame.
-static void get_non_matching_candidates(const MODE_INFO *candidate_mi,
- MV_REFERENCE_FRAME ref_frame,
- MV_REFERENCE_FRAME *c_ref_frame,
- int_mv *c_mv,
- MV_REFERENCE_FRAME *c2_ref_frame,
- int_mv *c2_mv) {
-
- c_mv->as_int = 0;
- c2_mv->as_int = 0;
- *c_ref_frame = INTRA_FRAME;
- *c2_ref_frame = INTRA_FRAME;
-
- // If first candidate not valid neither will be.
- if (candidate_mi->mbmi.ref_frame[0] > INTRA_FRAME) {
- // First candidate
- if (candidate_mi->mbmi.ref_frame[0] != ref_frame) {
- *c_ref_frame = candidate_mi->mbmi.ref_frame[0];
- c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
- }
-
- // Second candidate
- if ((candidate_mi->mbmi.ref_frame[1] > INTRA_FRAME) &&
- (candidate_mi->mbmi.ref_frame[1] != ref_frame) &&
- (candidate_mi->mbmi.mv[1].as_int != candidate_mi->mbmi.mv[0].as_int)) {
- *c2_ref_frame = candidate_mi->mbmi.ref_frame[1];
- c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
- }
- }
+// This function returns either the appropriate sub-block mv or the whole
+// block's mv, depending on whether block_size < 8x8 and check_sub_blocks is set.
+static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate,
+ int check_sub_blocks, int which_mv,
+ int search_col, int block_idx) {
+ return (check_sub_blocks && candidate->mbmi.sb_type < BLOCK_SIZE_SB8X8
+ ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
+ .as_mv[which_mv]
+ : candidate->mbmi.mv[which_mv]);
}
// Performs mv sign inversion if indicated by the reference frame combination.
-static void scale_mv(MACROBLOCKD *xd, MV_REFERENCE_FRAME this_ref_frame,
- MV_REFERENCE_FRAME candidate_ref_frame,
- int_mv *candidate_mv, int *ref_sign_bias) {
+static INLINE int_mv scale_mv(const MODE_INFO *candidate, const int which_mv,
+ const MV_REFERENCE_FRAME this_ref_frame,
+ const int *ref_sign_bias) {
+ int_mv return_mv = candidate->mbmi.mv[which_mv];
// Sign inversion where appropriate.
- if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
- candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
- candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
+ if (ref_sign_bias[candidate->mbmi.ref_frame[which_mv]] !=
+ ref_sign_bias[this_ref_frame]) {
+ return_mv.as_mv.row *= -1;
+ return_mv.as_mv.col *= -1;
}
+ return return_mv;
}
-// Add a candidate mv.
-// Discard if it has already been seen.
-static void add_candidate_mv(int_mv *mv_list, int *mv_scores,
- int *candidate_count, int_mv candidate_mv,
- int weight) {
- if (*candidate_count == 0) {
- mv_list[0].as_int = candidate_mv.as_int;
- mv_scores[0] = weight;
- *candidate_count += 1;
- } else if ((*candidate_count == 1) &&
- (candidate_mv.as_int != mv_list[0].as_int)) {
- mv_list[1].as_int = candidate_mv.as_int;
- mv_scores[1] = weight;
- *candidate_count += 1;
+// This macro is used to add a motion vector to the mv_ref list if it isn't
+// already in the list. If it's the second motion vector added it will also
+// skip all additional processing and jump to Done!
+#define ADD_MV_REF_LIST(MV) \
+ if (refmv_count) { \
+ if ((MV).as_int != mv_ref_list[0].as_int) { \
+ mv_ref_list[refmv_count] = (MV); \
+ goto Done; \
+ } \
+ } else { \
+ mv_ref_list[refmv_count++] = (MV); \
+ }
+
+// If either of the candidate's reference frames differs from ref_frame and is
+// not INTRA (and, for the second one, its mv differs from the first), scale
+// the mv and add it to our list.
+#define IF_DIFF_REF_FRAME_ADD_MV(CANDIDATE) \
+ if ((CANDIDATE)->mbmi.ref_frame[0] != ref_frame) { \
+ ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \
+ } \
+ if ((CANDIDATE)->mbmi.ref_frame[1] != ref_frame && \
+ (CANDIDATE)->mbmi.ref_frame[1] > INTRA_FRAME && \
+ (CANDIDATE)->mbmi.mv[1].as_int != (CANDIDATE)->mbmi.mv[0].as_int) { \
+ ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \
}
+
+// Checks that the given mi_row, mi_col and search point
+// are inside the borders of the tile.
+static INLINE int is_inside(const int mi_col, const int mi_row,
+ const int cur_tile_mi_col_start,
+ const int cur_tile_mi_col_end, const int mi_rows,
+ const int (*mv_ref_search)[2], int idx) {
+ int mi_search_col;
+ const int mi_search_row = mi_row + mv_ref_search[idx][1];
+
+ // Check that the candidate is within the border. We only need to check the
+ // top and left edges because every positive right-side offset is used only
+ // by blocks large enough to keep that offset within their own border.
+ if (mi_search_row < 0)
+ return 0;
+
+ mi_search_col = mi_col + mv_ref_search[idx][0];
+ if (mi_search_col < cur_tile_mi_col_start)
+ return 0;
+
+ return 1;
}
// This function searches the neighbourhood of a given MB/SB
// to try and find candidate reference vectors.
-//
void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
- MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame,
- int_mv *mv_ref_list, int *ref_sign_bias,
- int block_idx) {
- int i;
- MODE_INFO *candidate_mi;
- MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
- int_mv c_refmv;
- int_mv c2_refmv;
- MV_REFERENCE_FRAME c_ref_frame;
- MV_REFERENCE_FRAME c2_ref_frame;
- int candidate_scores[MAX_MV_REF_CANDIDATES] = { 0 };
+ const MODE_INFO *lf_here,
+ const MV_REFERENCE_FRAME ref_frame,
+ int_mv *mv_ref_list, const int *ref_sign_bias,
+ const int block_idx,
+ const int mi_row, const int mi_col) {
+ int idx;
+ MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
int refmv_count = 0;
const int (*mv_ref_search)[2] = mv_ref_blocks[mbmi->sb_type];
- const int mi_col = get_mi_col(xd);
- const int mi_row = get_mi_row(xd);
- int intra_count = 0;
- int zero_count = 0;
- int newmv_count = 0;
- int x_idx = 0, y_idx = 0;
-
- // Blank the reference vector lists and other local structures.
- vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
-
- if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
- x_idx = block_idx & 1;
- y_idx = block_idx >> 1;
- }
-
- // We first scan for candidate vectors that match the current reference frame
- // Look at nearest neigbours
- for (i = 0; i < 2; ++i) {
- const int mi_search_col = mi_col + mv_ref_search[i][0];
- const int mi_search_row = mi_row + mv_ref_search[i][1];
- if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
- (mi_search_col < cm->cur_tile_mi_col_end) &&
- (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
- int b;
-
- candidate_mi = here + mv_ref_search[i][0] +
- (mv_ref_search[i][1] * xd->mode_info_stride);
-
- if (block_idx >= 0) {
- if (mv_ref_search[i][0])
- b = 1 + y_idx * 2;
- else
- b = 2 + x_idx;
- } else {
- b = -1;
- }
- if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, b)) {
- add_candidate_mv(mv_ref_list, candidate_scores,
- &refmv_count, c_refmv, 16);
+ const MODE_INFO *candidate;
+ const int check_sub_blocks = block_idx >= 0;
+ int different_ref_found = 0;
+ int context_counter = 0;
+
+ // Blank the reference vector list
+ vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+ // The nearest 2 blocks are treated differently:
+ // if the size < 8x8 we get the mv from the bmi substructure,
+ // and we also need to keep a mode count.
+ for (idx = 0; idx < 2; ++idx) {
+ if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start,
+ cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx))
+ continue;
+
+ candidate = here + mv_ref_search[idx][0]
+ + mv_ref_search[idx][1] * xd->mode_info_stride;
+
+ // Keep counts for entropy encoding.
+ context_counter += mode_2_counter[candidate->mbmi.mode];
+
+ // Check if the candidate comes from the same reference frame.
+ if (candidate->mbmi.ref_frame[0] == ref_frame) {
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 0,
+ mv_ref_search[idx][0], block_idx));
+ different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame;
+ } else {
+ different_ref_found = 1;
+ if (candidate->mbmi.ref_frame[1] == ref_frame) {
+ // Add second motion vector if it has the same ref_frame.
+ ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 1,
+ mv_ref_search[idx][0], block_idx));
}
-
- // Count number of neihgbours coded intra and zeromv
- intra_count += (candidate_mi->mbmi.mode < NEARESTMV);
- zero_count += (candidate_mi->mbmi.mode == ZEROMV);
- newmv_count += (candidate_mi->mbmi.mode >= NEWMV);
}
}
- // More distant neigbours
- for (i = 2; (i < MVREF_NEIGHBOURS) &&
- (refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
- const int mi_search_col = mi_col + mv_ref_search[i][0];
- const int mi_search_row = mi_row + mv_ref_search[i][1];
- if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
- (mi_search_col < cm->cur_tile_mi_col_end) &&
- (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
- candidate_mi = here + mv_ref_search[i][0] +
- (mv_ref_search[i][1] * xd->mode_info_stride);
-
- if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
- add_candidate_mv(mv_ref_list, candidate_scores,
- &refmv_count, c_refmv, 16);
+ // Check the rest of the neighbors in much the same way
+ // as before except we don't need to keep track of sub blocks or
+ // mode counts.
+ for (; idx < MVREF_NEIGHBOURS; ++idx) {
+ if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start,
+ cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx))
+ continue;
+
+ candidate = here + mv_ref_search[idx][0]
+ + mv_ref_search[idx][1] * xd->mode_info_stride;
+
+ if (candidate->mbmi.ref_frame[0] == ref_frame) {
+ ADD_MV_REF_LIST(candidate->mbmi.mv[0]);
+ different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame;
+ } else {
+ different_ref_found = 1;
+ if (candidate->mbmi.ref_frame[1] == ref_frame) {
+ ADD_MV_REF_LIST(candidate->mbmi.mv[1]);
}
}
}
- // Look in the last frame if it exists
- if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
- candidate_mi = lf_here;
- if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
- add_candidate_mv(mv_ref_list, candidate_scores,
- &refmv_count, c_refmv, 16);
+ // Check the last frame's mode and mv info.
+ if (lf_here != NULL) {
+ if (lf_here->mbmi.ref_frame[0] == ref_frame) {
+ ADD_MV_REF_LIST(lf_here->mbmi.mv[0]);
+ } else if (lf_here->mbmi.ref_frame[1] == ref_frame) {
+ ADD_MV_REF_LIST(lf_here->mbmi.mv[1]);
}
}
- // If we have not found enough candidates consider ones where the
- // reference frame does not match. Break out when we have
- // MAX_MV_REF_CANDIDATES candidates.
- // Look first at spatial neighbours
- for (i = 0; (i < MVREF_NEIGHBOURS) &&
- (refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
- const int mi_search_col = mi_col + mv_ref_search[i][0];
- const int mi_search_row = mi_row + mv_ref_search[i][1];
- if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
- (mi_search_col < cm->cur_tile_mi_col_end) &&
- (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
- candidate_mi = here + mv_ref_search[i][0] +
- (mv_ref_search[i][1] * xd->mode_info_stride);
-
- get_non_matching_candidates(candidate_mi, ref_frame,
- &c_ref_frame, &c_refmv,
- &c2_ref_frame, &c2_refmv);
-
- if (c_ref_frame != INTRA_FRAME) {
- scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
- add_candidate_mv(mv_ref_list, candidate_scores,
- &refmv_count, c_refmv, 1);
- }
+ // Since we couldn't find 2 mvs from the same reference frame,
+ // go back through the neighbors and find motion vectors from
+ // different reference frames.
+ if (different_ref_found) {
+ for (idx = 0; idx < MVREF_NEIGHBOURS; ++idx) {
+ if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start,
+ cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx))
+ continue;
- if (c2_ref_frame != INTRA_FRAME) {
- scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
- add_candidate_mv(mv_ref_list, candidate_scores,
- &refmv_count, c2_refmv, 1);
- }
- }
- }
+ candidate = here + mv_ref_search[idx][0]
+ + mv_ref_search[idx][1] * xd->mode_info_stride;
- // Look at the last frame if it exists
- if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
- candidate_mi = lf_here;
- get_non_matching_candidates(candidate_mi, ref_frame,
- &c_ref_frame, &c_refmv,
- &c2_ref_frame, &c2_refmv);
-
- if (c_ref_frame != INTRA_FRAME) {
- scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
- add_candidate_mv(mv_ref_list, candidate_scores,
- &refmv_count, c_refmv, 1);
- }
+ // If the candidate is INTRA we don't want to consider its mv.
+ if (candidate->mbmi.ref_frame[0] == INTRA_FRAME)
+ continue;
- if (c2_ref_frame != INTRA_FRAME) {
- scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
- add_candidate_mv(mv_ref_list, candidate_scores,
- &refmv_count, c2_refmv, 1);
+ IF_DIFF_REF_FRAME_ADD_MV(candidate);
}
}
- if (!intra_count) {
- if (!newmv_count) {
- // 0 = both zero mv
- // 1 = one zero mv + one a predicted mv
- // 2 = two predicted mvs
- mbmi->mb_mode_context[ref_frame] = 2 - zero_count;
- } else {
- // 3 = one predicted/zero and one new mv
- // 4 = two new mvs
- mbmi->mb_mode_context[ref_frame] = 2 + newmv_count;
- }
- } else {
- // 5 = one intra neighbour + x
- // 6 = two intra neighbours
- mbmi->mb_mode_context[ref_frame] = 4 + intra_count;
+ // Since we still don't have 2 candidates we'll also try the last frame.
+ if (lf_here != NULL && lf_here->mbmi.ref_frame[0] != INTRA_FRAME) {
+ IF_DIFF_REF_FRAME_ADD_MV(lf_here);
}
+ Done:
+
+ mbmi->mb_mode_context[ref_frame] = counter_to_context[context_counter];
+
// Clamp vectors
- for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
- clamp_mv_ref(xd, &mv_ref_list[i]);
+ for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) {
+ clamp_mv_ref(xd, &mv_ref_list[idx]);
}
}
+
+#undef ADD_MV_REF_LIST
+#undef IF_DIFF_REF_FRAME_ADD_MV
diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h
index 7290f10ab..c5f89eb57 100644
--- a/libvpx/vp9/common/vp9_mvref_common.h
+++ b/libvpx/vp9/common/vp9_mvref_common.h
@@ -17,11 +17,13 @@
void vp9_find_mv_refs_idx(VP9_COMMON *cm,
MACROBLOCKD *xd,
MODE_INFO *here,
- MODE_INFO *lf_here,
- MV_REFERENCE_FRAME ref_frame,
+ const MODE_INFO *lf_here,
+ const MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
- int *ref_sign_bias,
- int block_idx);
+ const int *ref_sign_bias,
+ const int block_idx,
+ const int mi_row,
+ const int mi_col);
static INLINE void vp9_find_mv_refs(VP9_COMMON *cm,
MACROBLOCKD *xd,
@@ -29,9 +31,10 @@ static INLINE void vp9_find_mv_refs(VP9_COMMON *cm,
MODE_INFO *lf_here,
MV_REFERENCE_FRAME ref_frame,
int_mv *mv_ref_list,
- int *ref_sign_bias) {
+ int *ref_sign_bias,
+ int mi_row, int mi_col) {
vp9_find_mv_refs_idx(cm, xd, here, lf_here, ref_frame,
- mv_ref_list, ref_sign_bias, -1);
+ mv_ref_list, ref_sign_bias, -1, mi_row, mi_col);
}
#endif // VP9_COMMON_VP9_MVREF_COMMON_H_
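
Callers now pass the block position explicitly rather than having it recovered from the edge offsets; a hedged sketch of the adjusted call shape (the loop variables and mbmi are the caller's own):

  int_mv mv_ref_list[MAX_MV_REF_CANDIDATES];
  // mi_row / mi_col are the current block's position in mode-info units,
  // typically the row/column counters of the tile decode loop.
  vp9_find_mv_refs(cm, xd, xd->mode_info_context, xd->prev_mode_info_context,
                   mbmi->ref_frame[0], mv_ref_list, cm->ref_frame_sign_bias,
                   mi_row, mi_col);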
diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h
index f31f24b26..152a93293 100644
--- a/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/libvpx/vp9/common/vp9_onyxc_int.h
@@ -42,7 +42,7 @@ typedef struct frame_contexts {
vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
[PARTITION_TYPES - 1];
- vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
+ vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
[VP9_SWITCHABLE_FILTERS - 1];
vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
@@ -59,12 +59,12 @@ typedef struct {
unsigned int y_mode[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES];
unsigned int uv_mode[VP9_INTRA_MODES][VP9_INTRA_MODES];
unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
- vp9_coeff_count_model coef[TX_SIZE_MAX_SB][BLOCK_TYPES];
- unsigned int eob_branch[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES]
+ vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES];
+ unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES]
[COEF_BANDS][PREV_COEF_CONTEXTS];
unsigned int switchable_interp[VP9_SWITCHABLE_FILTERS + 1]
[VP9_SWITCHABLE_FILTERS];
- unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
+ unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES];
unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
unsigned int single_ref[REF_CONTEXTS][2][2];
@@ -240,8 +240,7 @@ static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd,
xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
}
-static int check_bsize_coverage(VP9_COMMON *cm, MACROBLOCKD *xd,
- int mi_row, int mi_col,
+static int check_bsize_coverage(VP9_COMMON *cm, int mi_row, int mi_col,
BLOCK_SIZE_TYPE bsize) {
int bsl = mi_width_log2(bsize), bs = 1 << bsl;
int ms = bs / 2;
@@ -278,14 +277,6 @@ static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end);
}
-static int get_mi_row(const MACROBLOCKD *xd) {
- return ((-xd->mb_to_top_edge) >> (3 + LOG2_MI_SIZE));
-}
-
-static int get_mi_col(const MACROBLOCKD *xd) {
- return ((-xd->mb_to_left_edge) >> (3 + LOG2_MI_SIZE));
-}
-
static int get_token_alloc(int mb_rows, int mb_cols) {
return mb_rows * mb_cols * (48 * 16 + 4);
}
diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c
index e8bcdea82..795962a71 100644
--- a/libvpx/vp9/common/vp9_pred_common.c
+++ b/libvpx/vp9/common/vp9_pred_common.c
@@ -55,34 +55,28 @@ unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
}
// Returns a context number for the given MB prediction signal
unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
- int pred_context;
const MODE_INFO *const mi = xd->mode_info_context;
const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
- // Note:
- // The mode info data structure has a one element border above and to the
- // left of the entries correpsonding to real macroblocks.
- // The prediction flags in these dummy entries are initialised to 0.
- if (above_in_image && left_in_image) { // both edges available
- if (left_mbmi->ref_frame[0] == INTRA_FRAME &&
- above_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/intra (3)
- pred_context = 3;
- } else { // intra/inter (1) or inter/inter (0)
- pred_context = left_mbmi->ref_frame[0] == INTRA_FRAME ||
- above_mbmi->ref_frame[0] == INTRA_FRAME;
- }
- } else if (above_in_image || left_in_image) { // one edge available
- const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+ const int left_intra = !is_inter_block(left_mbmi);
+ const int above_intra = !is_inter_block(above_mbmi);
- // inter: 0, intra: 2
- pred_context = 2 * (edge_mbmi->ref_frame[0] == INTRA_FRAME);
- } else {
- pred_context = 0;
- }
- assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS);
- return pred_context;
+ // The mode info data structure has a one element border above and to the
+ // left of the entries corresponding to real macroblocks.
+ // The prediction flags in these dummy entries are initialized to 0.
+ // 0 - inter/inter, inter/--, --/inter, --/--
+ // 1 - intra/inter, inter/intra
+ // 2 - intra/--, --/intra
+ // 3 - intra/intra
+ if (above_in_image && left_in_image) // both edges available
+ return left_intra && above_intra ? 3
+ : left_intra || above_intra;
+ else if (above_in_image || left_in_image) // one edge available
+ return 2 * (above_in_image ? above_intra : left_intra);
+ else
+ return 0;
}
// Returns a context number for the given MB prediction signal
unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h
index e4b6575e3..238290b41 100644
--- a/libvpx/vp9/common/vp9_pred_common.h
+++ b/libvpx/vp9/common/vp9_pred_common.h
@@ -110,9 +110,9 @@ unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
const struct tx_probs *tx_probs) {
- if (bsize < BLOCK_SIZE_MB16X16)
+ if (bsize < BLOCK_16X16)
return tx_probs->p8x8[context];
- else if (bsize < BLOCK_SIZE_SB32X32)
+ else if (bsize < BLOCK_32X32)
return tx_probs->p16x16[context];
else
return tx_probs->p32x32[context];
@@ -127,9 +127,9 @@ static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context,
TX_SIZE tx_size, struct tx_counts *tx_counts) {
- if (bsize >= BLOCK_SIZE_SB32X32)
+ if (bsize >= BLOCK_32X32)
tx_counts->p32x32[context][tx_size]++;
- else if (bsize >= BLOCK_SIZE_MB16X16)
+ else if (bsize >= BLOCK_16X16)
tx_counts->p16x16[context][tx_size]++;
else
tx_counts->p8x8[context][tx_size]++;
diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c
index 63e5646ad..0b65e0610 100644
--- a/libvpx/vp9/common/vp9_reconinter.c
+++ b/libvpx/vp9/common/vp9_reconinter.c
@@ -197,14 +197,14 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
- const int_mv *src_mv,
+ const MV *src_mv,
const struct scale_factors *scale,
int w, int h, int weight,
const struct subpix_fn_table *subpix,
enum mv_precision precision) {
const MV32 mv = precision == MV_PRECISION_Q4
- ? scale->scale_mv_q4(&src_mv->as_mv, scale)
- : scale->scale_mv_q3_to_q4(&src_mv->as_mv, scale);
+ ? scale->scale_mv_q4(src_mv, scale)
+ : scale->scale_mv_q3_to_q4(src_mv, scale);
const int subpel_x = mv.col & 15;
const int subpel_y = mv.row & 15;
@@ -220,45 +220,44 @@ static INLINE int round_mv_comp_q4(int value) {
return (value < 0 ? value - 2 : value + 2) / 4;
}
-static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int idx) {
- const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.row +
- mb->mode_info_context->bmi[1].as_mv[idx].as_mv.row +
- mb->mode_info_context->bmi[2].as_mv[idx].as_mv.row +
- mb->mode_info_context->bmi[3].as_mv[idx].as_mv.row;
- return round_mv_comp_q4(temp);
+static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
+ MV res = { round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.row +
+ mi->bmi[1].as_mv[idx].as_mv.row +
+ mi->bmi[2].as_mv[idx].as_mv.row +
+ mi->bmi[3].as_mv[idx].as_mv.row),
+ round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.col +
+ mi->bmi[1].as_mv[idx].as_mv.col +
+ mi->bmi[2].as_mv[idx].as_mv.col +
+ mi->bmi[3].as_mv[idx].as_mv.col) };
+ return res;
}
-static int mi_mv_pred_col_q4(MACROBLOCKD *mb, int idx) {
- const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.col +
- mb->mode_info_context->bmi[1].as_mv[idx].as_mv.col +
- mb->mode_info_context->bmi[2].as_mv[idx].as_mv.col +
- mb->mode_info_context->bmi[3].as_mv[idx].as_mv.col;
- return round_mv_comp_q4(temp);
-}
+
// TODO(jkoleszar): yet another mv clamping function :-(
MV clamp_mv_to_umv_border_sb(const MV *src_mv,
int bwl, int bhl, int ss_x, int ss_y,
int mb_to_left_edge, int mb_to_top_edge,
int mb_to_right_edge, int mb_to_bottom_edge) {
- /* If the MV points so far into the UMV border that no visible pixels
- * are used for reconstruction, the subpel part of the MV can be
- * discarded and the MV limited to 16 pixels with equivalent results.
- */
+ // If the MV points so far into the UMV border that no visible pixels
+ // are used for reconstruction, the subpel part of the MV can be
+ // discarded and the MV limited to 16 pixels with equivalent results.
const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4;
const int spel_right = spel_left - (1 << 4);
const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4;
const int spel_bottom = spel_top - (1 << 4);
- MV clamped_mv;
-
+ MV clamped_mv = {
+ src_mv->row << (1 - ss_y),
+ src_mv->col << (1 - ss_x)
+ };
assert(ss_x <= 1);
assert(ss_y <= 1);
- clamped_mv.col = clamp(src_mv->col << (1 - ss_x),
- (mb_to_left_edge << (1 - ss_x)) - spel_left,
- (mb_to_right_edge << (1 - ss_x)) + spel_right);
- clamped_mv.row = clamp(src_mv->row << (1 - ss_y),
- (mb_to_top_edge << (1 - ss_y)) - spel_top,
- (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
+
+ clamp_mv(&clamped_mv, (mb_to_left_edge << (1 - ss_x)) - spel_left,
+ (mb_to_right_edge << (1 - ss_x)) + spel_right,
+ (mb_to_top_edge << (1 - ss_y)) - spel_top,
+ (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
+
return clamped_mv;
}
@@ -280,15 +279,14 @@ static void build_inter_predictors(int plane, int block,
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
- const int use_second_ref = xd->mode_info_context->mbmi.ref_frame[1] > 0;
+ const MODE_INFO *const mi = xd->mode_info_context;
+ const int use_second_ref = mi->mbmi.ref_frame[1] > 0;
int which_mv;
assert(x < (4 << bwl));
assert(y < (4 << bhl));
- assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
- 4 << pred_w == (4 << bwl));
- assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
- 4 << pred_h == (4 << bhl));
+ assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_w == (4 << bwl));
+ assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_h == (4 << bhl));
for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
// source
@@ -301,44 +299,30 @@ static void build_inter_predictors(int plane, int block,
// dest
uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;
- // motion vector
- const MV *mv;
- MV split_chroma_mv;
- int_mv clamped_mv;
-
- if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
- if (plane == 0) {
- mv = &xd->mode_info_context->bmi[block].as_mv[which_mv].as_mv;
- } else {
- // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
- // same MV (the average of the 4 luma MVs) but we could do something
- // smarter for non-4:2:0. Just punt for now, pending the changes to get
- // rid of SPLITMV mode entirely.
- split_chroma_mv.row = mi_mv_pred_row_q4(xd, which_mv);
- split_chroma_mv.col = mi_mv_pred_col_q4(xd, which_mv);
- mv = &split_chroma_mv;
- }
- } else {
- mv = &xd->mode_info_context->mbmi.mv[which_mv].as_mv;
- }
-
- /* TODO(jkoleszar): This clamping is done in the incorrect place for the
- * scaling case. It needs to be done on the scaled MV, not the pre-scaling
- * MV. Note however that it performs the subsampling aware scaling so
- * that the result is always q4.
- */
- clamped_mv.as_mv = clamp_mv_to_umv_border_sb(mv, bwl, bhl,
- xd->plane[plane].subsampling_x,
- xd->plane[plane].subsampling_y,
- xd->mb_to_left_edge,
- xd->mb_to_top_edge,
- xd->mb_to_right_edge,
- xd->mb_to_bottom_edge);
+ // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
+ // same MV (the average of the 4 luma MVs) but we could do something
+ // smarter for non-4:2:0. Just punt for now, pending the changes to get
+ // rid of SPLITMV mode entirely.
+ const MV mv = mi->mbmi.sb_type < BLOCK_SIZE_SB8X8
+ ? (plane == 0 ? mi->bmi[block].as_mv[which_mv].as_mv
+ : mi_mv_pred_q4(mi, which_mv))
+ : mi->mbmi.mv[which_mv].as_mv;
+
+ // TODO(jkoleszar): This clamping is done in the incorrect place for the
+ // scaling case. It needs to be done on the scaled MV, not the pre-scaling
+ // MV. Note however that it performs the subsampling aware scaling so
+ // that the result is always q4.
+ const MV res_mv = clamp_mv_to_umv_border_sb(&mv, bwl, bhl,
+ xd->plane[plane].subsampling_x,
+ xd->plane[plane].subsampling_y,
+ xd->mb_to_left_edge,
+ xd->mb_to_top_edge,
+ xd->mb_to_right_edge,
+ xd->mb_to_bottom_edge);
scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
-
vp9_build_inter_predictor(pre, pre_stride,
dst, arg->dst_stride[plane],
- &clamped_mv, &xd->scale_factor[which_mv],
+ &res_mv, &xd->scale_factor[which_mv],
4 << pred_w, 4 << pred_h, which_mv,
&xd->subpix, MV_PRECISION_Q4);
}
@@ -400,7 +384,7 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
const int ref = cm->active_ref_idx[i];
struct scale_factors *const sf = &cm->active_ref_scale[i];
if (ref >= NUM_YV12_BUFFERS) {
- memset(sf, 0, sizeof(*sf));
+ vp9_zero(*sf);
} else {
YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref];
vp9_setup_scale_factors_for_frame(sf,
diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h
index e37750dea..6ec7323e1 100644
--- a/libvpx/vp9/common/vp9_reconinter.h
+++ b/libvpx/vp9/common/vp9_reconinter.h
@@ -39,7 +39,7 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
- const int_mv *mv_q3,
+ const MV *mv_q3,
const struct scale_factors *scale,
int w, int h, int do_avg,
const struct subpix_fn_table *subpix,
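
With the signature change, callers hand in the bare MV rather than the int_mv wrapper; a hedged sketch of an adjusted call site (the local names are illustrative):

  int_mv best_mv;
  // ... best_mv selected elsewhere ...
  vp9_build_inter_predictor(pre, pre_stride, dst, dst_stride,
                            &best_mv.as_mv,   // previously &best_mv
                            &xd->scale_factor[0],
                            4 << pred_w, 4 << pred_h, 0 /* do_avg */,
                            &xd->subpix, MV_PRECISION_Q4);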
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh
index c357ef62a..6bb3cb888 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.sh
+++ b/libvpx/vp9/common/vp9_rtcd_defs.sh
@@ -7,9 +7,7 @@ cat <<EOF
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_enums.h"
-struct loop_filter_info;
struct macroblockd;
-struct loop_filter_info;
/* Encoder forward decls */
struct macroblock;
@@ -22,7 +20,11 @@ EOF
}
forward_decls vp9_common_forward_decls
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
+# x86inc.asm doesn't work if PIC is enabled on 32-bit platforms, so no assembly is used there.
+[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse2_x86inc=sse2 && ssse3_x86inc=ssse3
+
+# This variable is for functions that are 64-bit only.
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3
#
# Dequant
@@ -47,7 +49,7 @@ prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, ui
specialize vp9_d27_predictor_4x4
prototype void vp9_d45_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
-specialize vp9_d45_predictor_4x4
+specialize vp9_d45_predictor_4x4 ssse3
prototype void vp9_d63_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_d63_predictor_4x4
@@ -86,7 +88,7 @@ prototype void vp9_d27_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, ui
specialize vp9_d27_predictor_8x8
prototype void vp9_d45_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
-specialize vp9_d45_predictor_8x8
+specialize vp9_d45_predictor_8x8 ssse3
prototype void vp9_d63_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_d63_predictor_8x8
@@ -125,7 +127,7 @@ prototype void vp9_d27_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride,
specialize vp9_d27_predictor_16x16
prototype void vp9_d45_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
-specialize vp9_d45_predictor_16x16
+specialize vp9_d45_predictor_16x16 ssse3
prototype void vp9_d63_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_d63_predictor_16x16
@@ -164,7 +166,7 @@ prototype void vp9_d27_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride,
specialize vp9_d27_predictor_32x32
prototype void vp9_d45_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
-specialize vp9_d45_predictor_32x32
+specialize vp9_d45_predictor_32x32 ssse3
prototype void vp9_d63_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_d63_predictor_32x32
@@ -214,7 +216,7 @@ fi
# Loopfilter
#
prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
-specialize vp9_mb_lpf_vertical_edge_w sse2
+specialize vp9_mb_lpf_vertical_edge_w sse2 neon
prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_vertical_edge sse2 neon
@@ -223,7 +225,7 @@ prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8
specialize vp9_loop_filter_vertical_edge mmx neon
prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w sse2
+specialize vp9_mb_lpf_horizontal_edge_w sse2 neon
prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
specialize vp9_mbloop_filter_horizontal_edge sse2 neon
@@ -265,10 +267,10 @@ specialize vp9_blend_b
# Sub Pixel Filters
#
prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_copy sse2
+specialize vp9_convolve_copy $sse2_x86inc
prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_avg sse2
+specialize vp9_convolve_avg $sse2_x86inc
prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8 ssse3 neon
@@ -297,14 +299,17 @@ specialize vp9_short_idct4x4_1_add sse2
prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct4x4_add sse2
+prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct8x8_1_add sse2
+
prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct8x8_add sse2 neon
prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_8x8_add sse2
-prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_8x8
+prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_1_add sse2
prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct16x16_add sse2
@@ -312,18 +317,12 @@ specialize vp9_short_idct16x16_add sse2
prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_16x16_add sse2
-prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_16x16
-
prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct32x32_add sse2
prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_32x32
-prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct10_32x32_add
-
prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_short_iht4x4_add sse2
@@ -702,12 +701,10 @@ specialize vp9_get_mb_ss mmx sse2
# ENCODEMB INVOKE
prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
-specialize vp9_block_error sse2
+specialize vp9_block_error $sse2_x86inc
prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
-specialize vp9_subtract_block sse2
-
-[ $arch = "x86_64" ] && ssse3_x86_64=ssse3
+specialize vp9_subtract_block $sse2_x86inc
prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
specialize vp9_quantize_b $ssse3_x86_64
@@ -719,13 +716,11 @@ specialize vp9_quantize_b_32x32 $ssse3_x86_64
# Structured Similarity (SSIM)
#
if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
- [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
-
prototype void vp9_ssim_parms_8x8 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
- specialize vp9_ssim_parms_8x8 $sse2_on_x86_64
+ specialize vp9_ssim_parms_8x8 $sse2_x86_64
prototype void vp9_ssim_parms_16x16 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
- specialize vp9_ssim_parms_16x16 $sse2_on_x86_64
+ specialize vp9_ssim_parms_16x16 $sse2_x86_64
fi
# fdct functions
diff --git a/libvpx/vp9/common/vp9_treecoder.h b/libvpx/vp9/common/vp9_treecoder.h
index ebcd4116f..31182c35c 100644
--- a/libvpx/vp9/common/vp9_treecoder.h
+++ b/libvpx/vp9/common/vp9_treecoder.h
@@ -79,4 +79,22 @@ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
}
+static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob,
+ const unsigned int ct[2],
+ unsigned int count_sat,
+ unsigned int max_update_factor) {
+ const unsigned int count = MIN(ct[0] + ct[1], count_sat);
+ const unsigned int factor = max_update_factor * count / count_sat;
+ return weighted_prob(pre_prob, prob, factor);
+}
+
+static INLINE vp9_prob merge_probs2(vp9_prob pre_prob,
+ const unsigned int ct[2],
+ unsigned int count_sat,
+ unsigned int max_update_factor) {
+ return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat,
+ max_update_factor);
+}
+
+
#endif // VP9_COMMON_VP9_TREECODER_H_
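
Worked example of the merge_probs() arithmetic above, with illustrative numbers (pre_prob = 128, ct = {12, 4}, a freshly estimated prob of 192, and the MV adaptation constants assumed to be count_sat = 20 and max_update_factor = 128):

  count  = MIN(12 + 4, 20)                                     = 16
  factor = 128 * 16 / 20                                       = 102
  merged = ROUND_POWER_OF_TWO(128 * (256 - 102) + 192 * 102, 8)
         = (19712 + 19584 + 128) >> 8                          = 154

so the adapted probability moves from 128 toward the new estimate in proportion to how many events were seen, saturating once count_sat events have accumulated.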
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index a1e14b482..8f740f412 100644
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -523,9 +523,9 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
{ \
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
d0 = _mm_unpacklo_epi8(d0, zero); \
- in_x = _mm_add_epi16(in_x, d0); \
- in_x = _mm_packus_epi16(in_x, in_x); \
- _mm_storel_epi64((__m128i *)(dest), in_x); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ _mm_storel_epi64((__m128i *)(dest), d0); \
dest += stride; \
}
@@ -597,6 +597,27 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
RECON_AND_STORE(dest, in7);
}
+void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 5);
+
+ dc_value = _mm_set1_epi16(a);
+
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+}
+
// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
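The new vp9_short_idct8x8_1_add_sse2 above handles the common DC-only case: it reconstructs the single DC coefficient and splats it across the 8x8 block. A scalar sketch of the same arithmetic, assuming the usual VP9 constants (cospi_16_64 = 11585, DCT_CONST_BITS = 14) and with invented helper names, would look like this:

#include <stdint.h>

/* (input + 2^13) >> 14, i.e. ROUND_POWER_OF_TWO(input, DCT_CONST_BITS) */
static int dct_const_round_shift_sketch(int64_t input) {
  return (int)((input + (1 << 13)) >> 14);
}

static void idct8x8_1_add_sketch(const int16_t *input, uint8_t *dest,
                                 int stride) {
  int i, j;
  int a = dct_const_round_shift_sketch(input[0] * 11585LL);  /* row pass */
  a = dct_const_round_shift_sketch(a * 11585LL);             /* column pass */
  a = (a + 16) >> 5;                 /* ROUND_POWER_OF_TWO(a, 5) */
  for (i = 0; i < 8; ++i, dest += stride) {
    for (j = 0; j < 8; ++j) {
      const int v = dest[j] + a;     /* add DC and clip, as RECON_AND_STORE does */
      dest[j] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}

The 16x16 variant added further down is the same idea, except that it rounds by 6 bits and covers 16 rows.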
@@ -1449,6 +1470,38 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
}
}
+void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+ __m128i dc_value;
+ const __m128i zero = _mm_setzero_si128();
+ int a, i;
+
+ a = dct_const_round_shift(input[0] * cospi_16_64);
+ a = dct_const_round_shift(a * cospi_16_64);
+ a = ROUND_POWER_OF_TWO(a, 6);
+
+ dc_value = _mm_set1_epi16(a);
+
+ for (i = 0; i < 2; ++i) {
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ RECON_AND_STORE(dest, dc_value);
+ dest += 8 - (stride * 16);
+ }
+}
+
static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
__m128i tbuf[8];
array_transpose_8x8(res0, res0);
@@ -2760,6 +2813,12 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
}
}
+#define LOAD_DQCOEFF(reg, input) \
+ { \
+ reg = _mm_load_si128((__m128i *) input); \
+ input += 8; \
+ } \
+
void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -2827,48 +2886,126 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
stp2_30, stp2_31;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i, j;
+ int i, j, i32;
+ __m128i zero_idx[16];
+ int zero_flag[2];
// We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct.
// We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct.
for (i = 0; i < 8; i++) {
+ i32 = (i << 5);
if (i < 4) {
// First 1-D idct
// Load input data.
- in0 = _mm_load_si128((__m128i *)input);
- in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
- in16 = _mm_load_si128((__m128i *)(input + 8 * 2));
- in24 = _mm_load_si128((__m128i *)(input + 8 * 3));
- in1 = _mm_load_si128((__m128i *)(input + 8 * 4));
- in9 = _mm_load_si128((__m128i *)(input + 8 * 5));
- in17 = _mm_load_si128((__m128i *)(input + 8 * 6));
- in25 = _mm_load_si128((__m128i *)(input + 8 * 7));
- in2 = _mm_load_si128((__m128i *)(input + 8 * 8));
- in10 = _mm_load_si128((__m128i *)(input + 8 * 9));
- in18 = _mm_load_si128((__m128i *)(input + 8 * 10));
- in26 = _mm_load_si128((__m128i *)(input + 8 * 11));
- in3 = _mm_load_si128((__m128i *)(input + 8 * 12));
- in11 = _mm_load_si128((__m128i *)(input + 8 * 13));
- in19 = _mm_load_si128((__m128i *)(input + 8 * 14));
- in27 = _mm_load_si128((__m128i *)(input + 8 * 15));
-
- in4 = _mm_load_si128((__m128i *)(input + 8 * 16));
- in12 = _mm_load_si128((__m128i *)(input + 8 * 17));
- in20 = _mm_load_si128((__m128i *)(input + 8 * 18));
- in28 = _mm_load_si128((__m128i *)(input + 8 * 19));
- in5 = _mm_load_si128((__m128i *)(input + 8 * 20));
- in13 = _mm_load_si128((__m128i *)(input + 8 * 21));
- in21 = _mm_load_si128((__m128i *)(input + 8 * 22));
- in29 = _mm_load_si128((__m128i *)(input + 8 * 23));
- in6 = _mm_load_si128((__m128i *)(input + 8 * 24));
- in14 = _mm_load_si128((__m128i *)(input + 8 * 25));
- in22 = _mm_load_si128((__m128i *)(input + 8 * 26));
- in30 = _mm_load_si128((__m128i *)(input + 8 * 27));
- in7 = _mm_load_si128((__m128i *)(input + 8 * 28));
- in15 = _mm_load_si128((__m128i *)(input + 8 * 29));
- in23 = _mm_load_si128((__m128i *)(input + 8 * 30));
- in31 = _mm_load_si128((__m128i *)(input + 8 * 31));
-
- input += 256;
+ LOAD_DQCOEFF(in0, input);
+ LOAD_DQCOEFF(in8, input);
+ LOAD_DQCOEFF(in16, input);
+ LOAD_DQCOEFF(in24, input);
+ LOAD_DQCOEFF(in1, input);
+ LOAD_DQCOEFF(in9, input);
+ LOAD_DQCOEFF(in17, input);
+ LOAD_DQCOEFF(in25, input);
+ LOAD_DQCOEFF(in2, input);
+ LOAD_DQCOEFF(in10, input);
+ LOAD_DQCOEFF(in18, input);
+ LOAD_DQCOEFF(in26, input);
+ LOAD_DQCOEFF(in3, input);
+ LOAD_DQCOEFF(in11, input);
+ LOAD_DQCOEFF(in19, input);
+ LOAD_DQCOEFF(in27, input);
+
+ LOAD_DQCOEFF(in4, input);
+ LOAD_DQCOEFF(in12, input);
+ LOAD_DQCOEFF(in20, input);
+ LOAD_DQCOEFF(in28, input);
+ LOAD_DQCOEFF(in5, input);
+ LOAD_DQCOEFF(in13, input);
+ LOAD_DQCOEFF(in21, input);
+ LOAD_DQCOEFF(in29, input);
+ LOAD_DQCOEFF(in6, input);
+ LOAD_DQCOEFF(in14, input);
+ LOAD_DQCOEFF(in22, input);
+ LOAD_DQCOEFF(in30, input);
+ LOAD_DQCOEFF(in7, input);
+ LOAD_DQCOEFF(in15, input);
+ LOAD_DQCOEFF(in23, input);
+ LOAD_DQCOEFF(in31, input);
+
+ // Check whether all input entries in this 8x32 block are zero.
+ zero_idx[0] = _mm_or_si128(in0, in1);
+ zero_idx[1] = _mm_or_si128(in2, in3);
+ zero_idx[2] = _mm_or_si128(in4, in5);
+ zero_idx[3] = _mm_or_si128(in6, in7);
+ zero_idx[4] = _mm_or_si128(in8, in9);
+ zero_idx[5] = _mm_or_si128(in10, in11);
+ zero_idx[6] = _mm_or_si128(in12, in13);
+ zero_idx[7] = _mm_or_si128(in14, in15);
+ zero_idx[8] = _mm_or_si128(in16, in17);
+ zero_idx[9] = _mm_or_si128(in18, in19);
+ zero_idx[10] = _mm_or_si128(in20, in21);
+ zero_idx[11] = _mm_or_si128(in22, in23);
+ zero_idx[12] = _mm_or_si128(in24, in25);
+ zero_idx[13] = _mm_or_si128(in26, in27);
+ zero_idx[14] = _mm_or_si128(in28, in29);
+ zero_idx[15] = _mm_or_si128(in30, in31);
+
+ zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+ zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+ zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+ zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+ zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+ zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+ zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+ zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+ zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+ zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
+ zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
+ zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
+ zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
+ zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
+
+ if (!zero_flag[0] && !zero_flag[1]) {
+ col[i32 + 0] = _mm_setzero_si128();
+ col[i32 + 1] = _mm_setzero_si128();
+ col[i32 + 2] = _mm_setzero_si128();
+ col[i32 + 3] = _mm_setzero_si128();
+ col[i32 + 4] = _mm_setzero_si128();
+ col[i32 + 5] = _mm_setzero_si128();
+ col[i32 + 6] = _mm_setzero_si128();
+ col[i32 + 7] = _mm_setzero_si128();
+ col[i32 + 8] = _mm_setzero_si128();
+ col[i32 + 9] = _mm_setzero_si128();
+ col[i32 + 10] = _mm_setzero_si128();
+ col[i32 + 11] = _mm_setzero_si128();
+ col[i32 + 12] = _mm_setzero_si128();
+ col[i32 + 13] = _mm_setzero_si128();
+ col[i32 + 14] = _mm_setzero_si128();
+ col[i32 + 15] = _mm_setzero_si128();
+ col[i32 + 16] = _mm_setzero_si128();
+ col[i32 + 17] = _mm_setzero_si128();
+ col[i32 + 18] = _mm_setzero_si128();
+ col[i32 + 19] = _mm_setzero_si128();
+ col[i32 + 20] = _mm_setzero_si128();
+ col[i32 + 21] = _mm_setzero_si128();
+ col[i32 + 22] = _mm_setzero_si128();
+ col[i32 + 23] = _mm_setzero_si128();
+ col[i32 + 24] = _mm_setzero_si128();
+ col[i32 + 25] = _mm_setzero_si128();
+ col[i32 + 26] = _mm_setzero_si128();
+ col[i32 + 27] = _mm_setzero_si128();
+ col[i32 + 28] = _mm_setzero_si128();
+ col[i32 + 29] = _mm_setzero_si128();
+ col[i32 + 30] = _mm_setzero_si128();
+ col[i32 + 31] = _mm_setzero_si128();
+ continue;
+ }
// Transpose 32x8 block to 8x32 block
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
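The zero_idx/zero_flag sequence above ORs all 32 input vectors together and, when nothing is set, skips the first 1-D pass and writes zero columns instead. A sketch of the same test in a more compact form (not the form used in the patch, which keeps the reduction entirely in xmm registers) could be:

#include <emmintrin.h>

/* Returns 1 when every 16-bit coefficient in the n loaded vectors is zero. */
static int block_is_all_zero(const __m128i *in, int n) {
  __m128i acc = _mm_setzero_si128();
  int i;
  for (i = 0; i < n; ++i)
    acc = _mm_or_si128(acc, in[i]);                 /* OR-reduce all vectors */
  acc = _mm_cmpeq_epi8(acc, _mm_setzero_si128());   /* 0xFF where byte == 0 */
  return _mm_movemask_epi8(acc) == 0xffff;          /* all 16 bytes were zero */
}

Skipping the transform for all-zero 8x32 slices is what makes the early-out worthwhile for sparse 32x32 blocks.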
@@ -3239,38 +3376,38 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
// final stage
if (i < 4) {
// 1-D: Store 32 intermediate results for each 8x32 block.
- col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
- col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
- col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
- col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
- col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
- col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
- col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
- col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
- col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
- col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
- col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
- col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
- col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
- col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
- col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
- col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
- col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
} else {
const __m128i zero = _mm_setzero_si128();
diff --git a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
index bc8ed5c1f..8ba26f310 100644
--- a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
+++ b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
@@ -10,6 +10,31 @@
%include "third_party/x86inc/x86inc.asm"
+SECTION_RODATA
+
+pb_1: times 16 db 1
+pw_2: times 8 dw 2
+pb_7m1: times 8 db 7, -1
+pb_15: times 16 db 15
+
+sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7
+sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7
+sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0
+sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b2w01234577: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 7, -1, 7, -1
+sh_b2w12345677: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1
+sh_b2w23456777: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1, 7, -1
+sh_b2w01234567: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1
+sh_b2w12345678: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
+sh_b2w23456789: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1
+sh_b2w89abcdef: db 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1
+sh_b2w9abcdeff: db 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1
+sh_b2wabcdefff: db 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1, 15, -1
+sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+
SECTION .text
INIT_MMX ssse3
@@ -85,3 +110,182 @@ cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
inc lineq
jnz .loop
REP_RET
+
+INIT_MMX ssse3
+cglobal d45_predictor_4x4, 3, 3, 4, dst, stride, above
+ movq m0, [aboveq]
+ pshufb m2, m0, [sh_b23456777]
+ pshufb m1, m0, [sh_b01234577]
+ pshufb m0, [sh_b12345677]
+ pavgb m3, m2, m1
+ pxor m2, m1
+ pand m2, [pb_1]
+ psubb m3, m2
+ pavgb m0, m3
+
+ ; store 4 lines
+ movd [dstq ], m0
+ psrlq m0, 8
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ psrlq m0, 8
+ movd [dstq ], m0
+ psrlq m0, 8
+ movd [dstq+strideq], m0
+ RET
+
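The pavgb/pxor/pand/psubb sequence used by the d45 predictors computes the exact 3-tap value (a + 2*b + c + 2) >> 2 without widening bytes to words: pavgb rounds up, and subtracting (a ^ c) & 1 turns the first average into a floor so the second average rounds correctly. A scalar sketch of the trick (the avg3 name is ours, not the library's):

/* Exact (a + 2*b + c + 2) >> 2 using only byte-wide averages. */
static unsigned char avg3(unsigned char a, unsigned char b, unsigned char c) {
  /* (a + c + 1) >> 1 rounds up; subtract (a ^ c) & 1 to get floor((a + c) / 2) */
  const unsigned char ac = (unsigned char)(((a + c + 1) >> 1) - ((a ^ c) & 1));
  return (unsigned char)((ac + b + 1) >> 1);
}

Each output row of the d45 predictor is this filter applied to the above row, shifted one sample per line, which is why the stores are a movd/psrlq chain for 4x4 and a pshufb chain for the larger sizes.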
+INIT_MMX ssse3
+cglobal d45_predictor_8x8, 3, 3, 4, dst, stride, above
+ movq m0, [aboveq]
+ mova m1, [sh_b12345677]
+ DEFINE_ARGS dst, stride, stride3, line
+ lea stride3q, [strideq*3]
+ pshufb m2, m0, [sh_b23456777]
+ pavgb m3, m2, m0
+ pxor m2, m0
+ pshufb m0, m1
+ pand m2, [pb_1]
+ psubb m3, m2
+ pavgb m0, m3
+
+ ; store 4 lines
+ movq [dstq ], m0
+ pshufb m0, m1
+ movq [dstq+strideq ], m0
+ pshufb m0, m1
+ movq [dstq+strideq*2], m0
+ pshufb m0, m1
+ movq [dstq+stride3q ], m0
+ pshufb m0, m1
+ lea dstq, [dstq+strideq*4]
+
+ ; store next 4 lines
+ movq [dstq ], m0
+ pshufb m0, m1
+ movq [dstq+strideq ], m0
+ pshufb m0, m1
+ movq [dstq+strideq*2], m0
+ pshufb m0, m1
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM ssse3
+cglobal d45_predictor_16x16, 3, 5, 4, dst, stride, above, dst8, line
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, dst8, line
+ lea stride3q, [strideq*3]
+ lea dst8q, [dstq+strideq*8]
+ mova m1, [sh_b123456789abcdeff]
+ pshufb m2, m0, [sh_b23456789abcdefff]
+ pavgb m3, m2, m0
+ pxor m2, m0
+ pshufb m0, m1
+ pand m2, [pb_1]
+ psubb m3, m2
+ pavgb m0, m3
+
+ ; first 4 lines and first half of 3rd 4 lines
+ mov lined, 2
+.loop:
+ mova [dstq ], m0
+ movhps [dst8q ], m0
+ pshufb m0, m1
+ mova [dstq +strideq ], m0
+ movhps [dst8q+strideq ], m0
+ pshufb m0, m1
+ mova [dstq +strideq*2 ], m0
+ movhps [dst8q+strideq*2 ], m0
+ pshufb m0, m1
+ mova [dstq +stride3q ], m0
+ movhps [dst8q+stride3q ], m0
+ pshufb m0, m1
+ lea dstq, [dstq +strideq*4]
+ lea dst8q, [dst8q+strideq*4]
+ dec lined
+ jnz .loop
+
+ ; bottom-right 8x8 block
+ movhps [dstq +8], m0
+ movhps [dstq+strideq +8], m0
+ movhps [dstq+strideq*2+8], m0
+ movhps [dstq+stride3q +8], m0
+ lea dstq, [dstq+strideq*4]
+ movhps [dstq +8], m0
+ movhps [dstq+strideq +8], m0
+ movhps [dstq+strideq*2+8], m0
+ movhps [dstq+stride3q +8], m0
+ RET
+
+INIT_XMM ssse3
+cglobal d45_predictor_32x32, 3, 5, 7, dst, stride, above, dst16, line
+ mova m0, [aboveq]
+ mova m4, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, dst16, line
+ lea stride3q, [strideq*3]
+ lea dst16q, [dstq +strideq*8]
+ lea dst16q, [dst16q+strideq*8]
+ mova m1, [sh_b123456789abcdeff]
+ pshufb m2, m4, [sh_b23456789abcdefff]
+ pavgb m3, m2, m4
+ pxor m2, m4
+ palignr m5, m4, m0, 1
+ palignr m6, m4, m0, 2
+ pshufb m4, m1
+ pand m2, [pb_1]
+ psubb m3, m2
+ pavgb m4, m3
+ pavgb m3, m0, m6
+ pxor m0, m6
+ pand m0, [pb_1]
+ psubb m3, m0
+ pavgb m5, m3
+
+ ; write 4x4 lines (and the first half of the second 4x4 lines)
+ mov lined, 4
+.loop:
+ mova [dstq ], m5
+ mova [dstq +16], m4
+ mova [dst16q ], m4
+ palignr m3, m4, m5, 1
+ pshufb m4, m1
+ mova [dstq +strideq ], m3
+ mova [dstq +strideq +16], m4
+ mova [dst16q+strideq ], m4
+ palignr m5, m4, m3, 1
+ pshufb m4, m1
+ mova [dstq +strideq*2 ], m5
+ mova [dstq +strideq*2+16], m4
+ mova [dst16q+strideq*2 ], m4
+ palignr m3, m4, m5, 1
+ pshufb m4, m1
+ mova [dstq +stride3q ], m3
+ mova [dstq +stride3q +16], m4
+ mova [dst16q+stride3q ], m4
+ palignr m5, m4, m3, 1
+ pshufb m4, m1
+ lea dstq, [dstq +strideq*4]
+ lea dst16q, [dst16q+strideq*4]
+ dec lined
+ jnz .loop
+
+ ; write second half of second 4x4 lines
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+ lea dstq, [dstq +strideq*4]
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+ lea dstq, [dstq +strideq*4]
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+ lea dstq, [dstq +strideq*4]
+ mova [dstq +16], m4
+ mova [dstq +strideq +16], m4
+ mova [dstq +strideq*2+16], m4
+ mova [dstq +stride3q +16], m4
+ RET