Roll latest libvpx into Android.android-4.4_r0.8 android-4.4_r0.7

The latest libvpx adds initial multithreaded VP9 decoding support and more NEON optimizations. The checkout is from the master branch (hash: 33afddadb9af6569bd8296ef1d48d0511b651e9d). Change-Id: I54be2f48bc033c00876b6b1d0a3ff1eccb92a2fa
author: hkuang <hkuang@google.com> 2013-08-06 11:07:19 -0700
committer: Hangyu Kuang <hkuang@google.com> 2013-08-06 18:31:37 +0000
commit: f3bed9137f66ef693bd406e43b17e9a1114f1e14 (patch)
tree: cd1bea0cd923c6d125cb5b3e7b3404d7c2f70208 /libvpx/vp9/common
parent: a8b927ab4f06e2fc0d16d9606b57672df9899ac1 (diff)
download: libvpx-f3bed9137f66ef693bd406e43b17e9a1114f1e14.tar.gz
31 files changed, 1706 insertions, 850 deletions
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
index 15039e267..110a56cdd 100644
--- a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm
@@ -52,15 +52,15 @@
 ; sp[]int h
 
 |vp9_convolve8_avg_horiz_neon| PROC
+    ldr             r12, [sp, #4]           ; x_step_q4
+    cmp             r12, #16
+    bne             vp9_convolve8_avg_horiz_c
+
     push            {r4-r10, lr}
 
     sub             r0, r0, #3              ; adjust for taps
 
-    ldr             r4, [sp, #36]           ; x_step_q4
     ldr             r5, [sp, #32]           ; filter_x
-    cmp             r4, #16
-    bne             call_horiz_c_convolve   ; x_step_q4 != 16
-
     ldr             r6, [sp, #48]           ; w
     ldr             r7, [sp, #52]           ; h
 
@@ -82,22 +82,22 @@
     mov             r10, r6                 ; w loop counter
 
 loop_horiz
-    vld4.u8         {d24[0], d25[0], d26[0], d27[0]}, [r0]!
-    vld4.u8         {d24[4], d25[4], d26[4], d27[4]}, [r0]!
+    vld1.8          {d24}, [r0]!
     vld3.u8         {d28[0], d29[0], d30[0]}, [r0], r9
 
-    vld4.u8         {d24[1], d25[1], d26[1], d27[1]}, [r0]!
-    vld4.u8         {d24[5], d25[5], d26[5], d27[5]}, [r0]!
+    vld1.8          {d25}, [r0]!
     vld3.u8         {d28[1], d29[1], d30[1]}, [r0], r9
 
-    vld4.u8         {d24[2], d25[2], d26[2], d27[2]}, [r0]!
-    vld4.u8         {d24[6], d25[6], d26[6], d27[6]}, [r0]!
+    vld1.8          {d26}, [r0]!
     vld3.u8         {d28[2], d29[2], d30[2]}, [r0], r9
 
-    vld4.u8         {d24[3], d25[3], d26[3], d27[3]}, [r0]!
-    vld4.u8         {d24[7], d25[7], d26[7], d27[7]}, [r0]!
+    vld1.8          {d27}, [r0]!
     vld3.u8         {d28[3], d29[3], d30[3]}, [r0], r8
 
+    vtrn.16         q12, q13
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
+
     ; extract to s16
     vmovl.u8        q8, d24
     vmovl.u8        q9, d25
@@ -128,8 +128,8 @@ loop_horiz
     vqrshrun.s32    d5, q15, #7
 
     ; saturate
-    vqshrn.u16      d2, q1, #0
-    vqshrn.u16      d3, q2, #0
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
 
     ; transpose
     vtrn.16         d2, d3
@@ -137,10 +137,7 @@ loop_horiz
     vtrn.8          d2, d3
     
     ; average the new value and the dst value
-    vaddl.u8        q8, d2, d6
-    vaddl.u8        q9, d3, d7
-    vqrshrn.u16     d2, q8, #1
-    vqrshrn.u16     d3, q9, #1
+    vrhadd.u8       q1, q1, q3
 
     vst1.u32        {d2[0]}, [r2], r3
     vst1.u32        {d3[0]}, [r2], r3
@@ -159,26 +156,20 @@ loop_horiz
 
     pop             {r4-r10, pc}
 
-call_horiz_c_convolve
-    pop             {r4-r10, lr}
-    add             r0, r0, #3              ; un-adjust for taps
-    b               vp9_convolve8_avg_horiz_c
-
-
     ENDP
 
 |vp9_convolve8_avg_vert_neon| PROC
+    ldr             r12, [sp, #12]
+    cmp             r12, #16
+    bne             vp9_convolve8_avg_vert_c
+
     push            {r4-r10, lr}
 
     ; adjust for taps
     sub             r0, r0, r1
     sub             r0, r0, r1, lsl #1
 
-    ldr             r6, [sp, #44]           ; y_step_q4
     ldr             r7, [sp, #40]           ; filter_y
-    cmp             r6, #16
-    bne             call_vert_c_convolve    ; y_step_q4 != 16
-
     ldr             r8, [sp, #48]           ; w
     ldr             r9, [sp, #52]           ; h
 
@@ -240,14 +231,11 @@ loop_vert
     vqrshrun.s32    d5, q15, #7
 
     ; saturate
-    vqshrn.u16      d2, q1, #0
-    vqshrn.u16      d3, q2, #0
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
 
     ; average the new value and the dst value
-    vaddl.u8        q8, d2, d6
-    vaddl.u8        q9, d3, d7
-    vqrshrn.u16     d2, q8, #1
-    vqrshrn.u16     d3, q9, #1
+    vrhadd.u8       q1, q1, q3
 
     vst1.u32        {d2[0]}, [r2], r3
     vst1.u32        {d2[1]}, [r2], r3
@@ -266,12 +254,5 @@ loop_vert
 
     pop             {r4-r10, pc}
 
-call_vert_c_convolve
-    pop             {r4-r10, lr}
-    ; un-adjust for taps
-    add             r0, r0, r1
-    add             r0, r0, r1, lsl #1
-    b               vp9_convolve8_avg_vert_c
-
     ENDP
     END
diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
index 842c73c90..845e4a866 100644
--- a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm
@@ -52,15 +52,15 @@
 ; sp[]int h
 
 |vp9_convolve8_horiz_neon| PROC
+    ldr             r12, [sp, #4]           ; x_step_q4
+    cmp             r12, #16
+    bne             vp9_convolve8_horiz_c
+
     push            {r4-r10, lr}
 
     sub             r0, r0, #3              ; adjust for taps
 
-    ldr             r4, [sp, #36]           ; x_step_q4
     ldr             r5, [sp, #32]           ; filter_x
-    cmp             r4, #16
-    bne             call_horiz_c_convolve   ; x_step_q4 != 16
-
     ldr             r6, [sp, #48]           ; w
     ldr             r7, [sp, #52]           ; h
 
@@ -82,22 +82,22 @@
     mov             r10, r6                 ; w loop counter
 
 loop_horiz
-    vld4.u8         {d24[0], d25[0], d26[0], d27[0]}, [r0]!
-    vld4.u8         {d24[4], d25[4], d26[4], d27[4]}, [r0]!
+    vld1.8          {d24}, [r0]!
     vld3.u8         {d28[0], d29[0], d30[0]}, [r0], r9
 
-    vld4.u8         {d24[1], d25[1], d26[1], d27[1]}, [r0]!
-    vld4.u8         {d24[5], d25[5], d26[5], d27[5]}, [r0]!
+    vld1.8          {d25}, [r0]!
     vld3.u8         {d28[1], d29[1], d30[1]}, [r0], r9
 
-    vld4.u8         {d24[2], d25[2], d26[2], d27[2]}, [r0]!
-    vld4.u8         {d24[6], d25[6], d26[6], d27[6]}, [r0]!
+    vld1.8          {d26}, [r0]!
     vld3.u8         {d28[2], d29[2], d30[2]}, [r0], r9
 
-    vld4.u8         {d24[3], d25[3], d26[3], d27[3]}, [r0]!
-    vld4.u8         {d24[7], d25[7], d26[7], d27[7]}, [r0]!
+    vld1.8          {d27}, [r0]!
     vld3.u8         {d28[3], d29[3], d30[3]}, [r0], r8
 
+    vtrn.16         q12, q13
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
+
     ; extract to s16
     vmovl.u8        q8, d24
     vmovl.u8        q9, d25
@@ -120,8 +120,8 @@ loop_horiz
     vqrshrun.s32    d5, q15, #7
 
     ; saturate
-    vqshrn.u16      d2, q1, #0
-    vqshrn.u16      d3, q2, #0
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
 
     ; transpose
     vtrn.16         d2, d3
@@ -145,26 +145,20 @@ loop_horiz
 
     pop             {r4-r10, pc}
 
-call_horiz_c_convolve
-    pop             {r4-r10, lr}
-    add             r0, r0, #3              ; un-adjust for taps
-    b               vp9_convolve8_horiz_c
-
-
     ENDP
 
 |vp9_convolve8_vert_neon| PROC
+    ldr             r12, [sp, #12]
+    cmp             r12, #16
+    bne             vp9_convolve8_vert_c
+
     push            {r4-r10, lr}
 
     ; adjust for taps
     sub             r0, r0, r1
     sub             r0, r0, r1, lsl #1
 
-    ldr             r6, [sp, #44]           ; y_step_q4
     ldr             r7, [sp, #40]           ; filter_y
-    cmp             r6, #16
-    bne             call_vert_c_convolve    ; y_step_q4 != 16
-
     ldr             r8, [sp, #48]           ; w
     ldr             r9, [sp, #52]           ; h
 
@@ -219,8 +213,8 @@ loop_vert
     vqrshrun.s32    d5, q15, #7
 
     ; saturate
-    vqshrn.u16      d2, q1, #0
-    vqshrn.u16      d3, q2, #0
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
 
     vst1.u32        {d2[0]}, [r2], r3
     vst1.u32        {d2[1]}, [r2], r3
@@ -239,12 +233,5 @@ loop_vert
 
     pop             {r4-r10, pc}
 
-call_vert_c_convolve
-    pop             {r4-r10, lr}
-    ; un-adjust for taps
-    add             r0, r0, r1
-    add             r0, r0, r1, lsl #1
-    b               vp9_convolve8_vert_c
-
     ENDP
     END
diff --git a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
new file mode 100644
index 000000000..edf5786e3
--- /dev/null
+++ b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm
@@ -0,0 +1,618 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_mb_lpf_horizontal_edge_w_neon|
+    EXPORT  |vp9_mb_lpf_vertical_edge_w_neon|
+    ARM
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int p,
+;                                        const uint8_t *blimit,
+;                                        const uint8_t *limit,
+;                                        const uint8_t *thresh,
+;                                        int count)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh, then int count at sp + 4
+|vp9_mb_lpf_horizontal_edge_w_neon| PROC
+    push        {r4-r8, lr}                ; save core regs (24 bytes)
+    vpush       {d8-d15}                   ; save d8-d15 (64 bytes)
+    ldr         r4, [sp, #88]              ; load thresh (first stack arg)
+    ldr         r12, [sp, #92]             ; load count (second stack arg)
+
+h_count
+    vld1.8      {d16[]}, [r2]              ; load *blimit into every lane
+    vld1.8      {d17[]}, [r3]              ; load *limit into every lane
+    vld1.8      {d18[]}, [r4]              ; load *thresh into every lane
+
+    sub         r8, r0, r1, lsl #3         ; r8 = s - 8 * p (the p7 row)
+
+    vld1.u8     {d0}, [r8@64], r1          ; p7
+    vld1.u8     {d1}, [r8@64], r1          ; p6
+    vld1.u8     {d2}, [r8@64], r1          ; p5
+    vld1.u8     {d3}, [r8@64], r1          ; p4
+    vld1.u8     {d4}, [r8@64], r1          ; p3
+    vld1.u8     {d5}, [r8@64], r1          ; p2
+    vld1.u8     {d6}, [r8@64], r1          ; p1
+    vld1.u8     {d7}, [r8@64], r1          ; p0
+    vld1.u8     {d8}, [r8@64], r1          ; q0
+    vld1.u8     {d9}, [r8@64], r1          ; q1
+    vld1.u8     {d10}, [r8@64], r1         ; q2
+    vld1.u8     {d11}, [r8@64], r1         ; q3
+    vld1.u8     {d12}, [r8@64], r1         ; q4
+    vld1.u8     {d13}, [r8@64], r1         ; q5
+    vld1.u8     {d14}, [r8@64], r1         ; q6
+    vld1.u8     {d15}, [r8@64], r1         ; q7
+
+    bl          vp9_wide_mbfilter_neon     ; filter; returns flags in r7
+
+    tst         r7, #1                     ; bit 0 set: only filter() applies
+    beq         h_mbfilter
+
+    ; flat && mask were not set for any of the channels. Just store the values
+    ; from filter.
+    sub         r8, r0, r1, lsl #1         ; r8 = s - 2 * p (the p1 row)
+
+    vst1.u8     {d25}, [r8@64], r1         ; store op1
+    vst1.u8     {d24}, [r8@64], r1         ; store op0
+    vst1.u8     {d23}, [r8@64], r1         ; store oq0
+    vst1.u8     {d26}, [r8@64], r1         ; store oq1
+
+    b           h_next
+
+h_mbfilter
+    tst         r7, #2                     ; bit 1 set: no wide filter needed
+    beq         h_wide_mbfilter
+
+    ; flat2 was not set for any of the channels. Just store the values from
+    ; mbfilter.
+    sub         r8, r0, r1, lsl #1         ; r8 = s - 2 * p
+    sub         r8, r8, r1                 ; r8 = s - 3 * p (the p2 row)
+
+    vst1.u8     {d18}, [r8@64], r1         ; store op2
+    vst1.u8     {d19}, [r8@64], r1         ; store op1
+    vst1.u8     {d20}, [r8@64], r1         ; store op0
+    vst1.u8     {d21}, [r8@64], r1         ; store oq0
+    vst1.u8     {d22}, [r8@64], r1         ; store oq1
+    vst1.u8     {d23}, [r8@64], r1         ; store oq2
+
+    b           h_next
+
+h_wide_mbfilter
+    sub         r8, r0, r1, lsl #3         ; r8 = s - 8 * p
+    add         r8, r8, r1                 ; r8 = s - 7 * p (the p6 row)
+
+    vst1.u8     {d16}, [r8@64], r1         ; store op6
+    vst1.u8     {d24}, [r8@64], r1         ; store op5
+    vst1.u8     {d25}, [r8@64], r1         ; store op4
+    vst1.u8     {d26}, [r8@64], r1         ; store op3
+    vst1.u8     {d27}, [r8@64], r1         ; store op2
+    vst1.u8     {d18}, [r8@64], r1         ; store op1
+    vst1.u8     {d19}, [r8@64], r1         ; store op0
+    vst1.u8     {d20}, [r8@64], r1         ; store oq0
+    vst1.u8     {d21}, [r8@64], r1         ; store oq1
+    vst1.u8     {d22}, [r8@64], r1         ; store oq2
+    vst1.u8     {d23}, [r8@64], r1         ; store oq3
+    vst1.u8     {d1}, [r8@64], r1          ; store oq4
+    vst1.u8     {d2}, [r8@64], r1          ; store oq5
+    vst1.u8     {d3}, [r8@64], r1          ; store oq6
+
+h_next
+    add         r0, r0, #8                 ; advance s by 8 pixels
+    subs        r12, r12, #1               ; count -= 1 (tested after the body)
+    bne         h_count
+
+    vpop        {d8-d15}                   ; restore d8-d15
+    pop         {r4-r8, pc}                ; restore core regs and return
+
+    ENDP        ; |vp9_mb_lpf_horizontal_edge_w_neon|
+
+; void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int p,
+;                                        const uint8_t *blimit,
+;                                        const uint8_t *limit,
+;                                        const uint8_t *thresh)
+; r0    uint8_t *s,
+; r1    int p, /* pitch */
+; r2    const uint8_t *blimit,
+; r3    const uint8_t *limit,
+; sp    const uint8_t *thresh,
+|vp9_mb_lpf_vertical_edge_w_neon| PROC
+    push        {r4-r8, lr}                ; save core regs (24 bytes)
+    vpush       {d8-d15}                   ; save d8-d15 (64 bytes)
+    ldr         r4, [sp, #88]              ; load thresh
+
+    vld1.8      {d16[]}, [r2]              ; load *blimit
+    vld1.8      {d17[]}, [r3]              ; load *limit
+    vld1.8      {d18[]}, [r4]              ; load *thresh
+
+    sub         r8, r0, #8                 ; r8 = s - 8 (left of the edge)
+
+    vld1.8      {d0}, [r8@64], r1          ; row 0, left 8 bytes
+    vld1.8      {d8}, [r0@64], r1          ; row 0, right 8 bytes
+    vld1.8      {d1}, [r8@64], r1          ; row 1, left 8 bytes
+    vld1.8      {d9}, [r0@64], r1          ; row 1, right 8 bytes
+    vld1.8      {d2}, [r8@64], r1          ; row 2, left 8 bytes
+    vld1.8      {d10}, [r0@64], r1         ; row 2, right 8 bytes
+    vld1.8      {d3}, [r8@64], r1          ; row 3, left 8 bytes
+    vld1.8      {d11}, [r0@64], r1         ; row 3, right 8 bytes
+    vld1.8      {d4}, [r8@64], r1          ; row 4, left 8 bytes
+    vld1.8      {d12}, [r0@64], r1         ; row 4, right 8 bytes
+    vld1.8      {d5}, [r8@64], r1          ; row 5, left 8 bytes
+    vld1.8      {d13}, [r0@64], r1         ; row 5, right 8 bytes
+    vld1.8      {d6}, [r8@64], r1          ; row 6, left 8 bytes
+    vld1.8      {d14}, [r0@64], r1         ; row 6, right 8 bytes
+    vld1.8      {d7}, [r8@64], r1          ; row 7, left 8 bytes
+    vld1.8      {d15}, [r0@64], r1         ; row 7, right 8 bytes
+
+    sub         r0, r0, r1, lsl #3         ; rewind r0 to row 0
+
+    vtrn.32     q0, q2                     ; transpose both 8x8 halves: 32-bit
+    vtrn.32     q1, q3
+    vtrn.32     q4, q6
+    vtrn.32     q5, q7
+
+    vtrn.16     q0, q1                     ; 16-bit step
+    vtrn.16     q2, q3
+    vtrn.16     q4, q5
+    vtrn.16     q6, q7
+
+    vtrn.8      d0, d1                     ; 8-bit step: d0-d7 = p7..p0
+    vtrn.8      d2, d3
+    vtrn.8      d4, d5
+    vtrn.8      d6, d7
+
+    vtrn.8      d8, d9                     ; 8-bit step: d8-d15 = q0..q7
+    vtrn.8      d10, d11
+    vtrn.8      d12, d13
+    vtrn.8      d14, d15
+
+    bl          vp9_wide_mbfilter_neon     ; filter; returns flags in r7
+
+    tst         r7, #1                     ; bit 0 set: only filter() applies
+    beq         v_mbfilter
+
+    ; flat && mask were not set for any of the channels. Just store the values
+    ; from filter.
+    sub         r8, r0, #2                 ; r8 = s - 2 (the p1 column)
+
+    vswp        d23, d25                   ; d23-d26 = op1, op0, oq0, oq1
+
+    vst4.8      {d23[0], d24[0], d25[0], d26[0]}, [r8], r1  ; row 0
+    vst4.8      {d23[1], d24[1], d25[1], d26[1]}, [r8], r1
+    vst4.8      {d23[2], d24[2], d25[2], d26[2]}, [r8], r1
+    vst4.8      {d23[3], d24[3], d25[3], d26[3]}, [r8], r1
+    vst4.8      {d23[4], d24[4], d25[4], d26[4]}, [r8], r1
+    vst4.8      {d23[5], d24[5], d25[5], d26[5]}, [r8], r1
+    vst4.8      {d23[6], d24[6], d25[6], d26[6]}, [r8], r1
+    vst4.8      {d23[7], d24[7], d25[7], d26[7]}, [r8], r1
+
+    b           v_end
+
+v_mbfilter
+    tst         r7, #2                     ; bit 1 set: no wide filter needed
+    beq         v_wide_mbfilter
+
+    ; flat2 was not set for any of the channels. Just store the values from
+    ; mbfilter.
+    sub         r8, r0, #3                 ; r8 = s - 3 (the p2 column)
+
+    vst3.8      {d18[0], d19[0], d20[0]}, [r8], r1  ; row 0: op2 op1 op0
+    vst3.8      {d21[0], d22[0], d23[0]}, [r0], r1  ; row 0: oq0 oq1 oq2
+    vst3.8      {d18[1], d19[1], d20[1]}, [r8], r1
+    vst3.8      {d21[1], d22[1], d23[1]}, [r0], r1
+    vst3.8      {d18[2], d19[2], d20[2]}, [r8], r1
+    vst3.8      {d21[2], d22[2], d23[2]}, [r0], r1
+    vst3.8      {d18[3], d19[3], d20[3]}, [r8], r1
+    vst3.8      {d21[3], d22[3], d23[3]}, [r0], r1
+    vst3.8      {d18[4], d19[4], d20[4]}, [r8], r1
+    vst3.8      {d21[4], d22[4], d23[4]}, [r0], r1
+    vst3.8      {d18[5], d19[5], d20[5]}, [r8], r1
+    vst3.8      {d21[5], d22[5], d23[5]}, [r0], r1
+    vst3.8      {d18[6], d19[6], d20[6]}, [r8], r1
+    vst3.8      {d21[6], d22[6], d23[6]}, [r0], r1
+    vst3.8      {d18[7], d19[7], d20[7]}, [r8], r1
+    vst3.8      {d21[7], d22[7], d23[7]}, [r0], r1
+
+    b           v_end
+
+v_wide_mbfilter
+    sub         r8, r0, #8                 ; r8 = s - 8 (the p7 column)
+
+    vtrn.32     d0,  d26                   ; transpose p7, op6..op0 to rows
+    vtrn.32     d16, d27
+    vtrn.32     d24, d18
+    vtrn.32     d25, d19
+
+    vtrn.16     d0,  d24
+    vtrn.16     d16, d25
+    vtrn.16     d26, d18
+    vtrn.16     d27, d19
+
+    vtrn.8      d0,  d16
+    vtrn.8      d24, d25
+    vtrn.8      d26, d27
+    vtrn.8      d18, d19
+
+    vtrn.32     d20, d1                    ; transpose oq0..oq6, q7 to rows
+    vtrn.32     d21, d2
+    vtrn.32     d22, d3
+    vtrn.32     d23, d15
+
+    vtrn.16     d20, d22
+    vtrn.16     d21, d23
+    vtrn.16     d1,  d3
+    vtrn.16     d2,  d15
+
+    vtrn.8      d20, d21
+    vtrn.8      d22, d23
+    vtrn.8      d1,  d2
+    vtrn.8      d3,  d15
+
+    vst1.8      {d0}, [r8@64], r1          ; row 0, left 8 bytes
+    vst1.8      {d20}, [r0@64], r1         ; row 0, right 8 bytes
+    vst1.8      {d16}, [r8@64], r1         ; row 1, left 8 bytes
+    vst1.8      {d21}, [r0@64], r1         ; row 1, right 8 bytes
+    vst1.8      {d24}, [r8@64], r1         ; row 2, left 8 bytes
+    vst1.8      {d22}, [r0@64], r1         ; row 2, right 8 bytes
+    vst1.8      {d25}, [r8@64], r1         ; row 3, left 8 bytes
+    vst1.8      {d23}, [r0@64], r1         ; row 3, right 8 bytes
+    vst1.8      {d26}, [r8@64], r1         ; row 4, left 8 bytes
+    vst1.8      {d1}, [r0@64], r1          ; row 4, right 8 bytes
+    vst1.8      {d27}, [r8@64], r1         ; row 5, left 8 bytes
+    vst1.8      {d2}, [r0@64], r1          ; row 5, right 8 bytes
+    vst1.8      {d18}, [r8@64], r1         ; row 6, left 8 bytes
+    vst1.8      {d3}, [r0@64], r1          ; row 6, right 8 bytes
+    vst1.8      {d19}, [r8@64], r1         ; row 7, left 8 bytes
+    vst1.8      {d15}, [r0@64], r1         ; row 7, right 8 bytes
+
+v_end
+    vpop        {d8-d15}                   ; restore d8-d15
+    pop         {r4-r8, pc}                ; restore core regs and return
+
+    ENDP        ; |vp9_mb_lpf_vertical_edge_w_neon|
+
+; void vp9_wide_mbfilter_neon();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store.
+;
+; r0-r3 PRESERVE
+; d16    blimit
+; d17    limit
+; d18    thresh
+; d0    p7
+; d1    p6
+; d2    p5
+; d3    p4
+; d4    p3
+; d5    p2
+; d6    p1
+; d7    p0
+; d8    q0
+; d9    q1
+; d10   q2
+; d11   q3
+; d12   q4
+; d13   q5
+; d14   q6
+; d15   q7
+|vp9_wide_mbfilter_neon| PROC
+    mov         r7, #0                     ; r7 = result flags for the caller
+
+    ; filter_mask
+    vabd.u8     d19, d4, d5                ; abs(p3 - p2)
+    vabd.u8     d20, d5, d6                ; abs(p2 - p1)
+    vabd.u8     d21, d6, d7                ; abs(p1 - p0)
+    vabd.u8     d22, d9, d8                ; abs(q1 - q0)
+    vabd.u8     d23, d10, d9               ; abs(q2 - q1)
+    vabd.u8     d24, d11, d10              ; abs(q3 - q2)
+
+    ; only compare the largest value to limit
+    vmax.u8     d19, d19, d20              ; max(abs(p3 - p2), abs(p2 - p1))
+    vmax.u8     d20, d21, d22              ; max(abs(p1 - p0), abs(q1 - q0))
+    vmax.u8     d23, d23, d24              ; max(abs(q2 - q1), abs(q3 - q2))
+    vmax.u8     d19, d19, d20
+
+    vabd.u8     d24, d7, d8                ; abs(p0 - q0)
+
+    vmax.u8     d19, d19, d23
+
+    vabd.u8     d23, d6, d9                ; a = abs(p1 - q1)
+    vqadd.u8    d24, d24, d24              ; b = abs(p0 - q0) * 2
+
+    ; limit >= largest neighbour difference
+    vcge.u8     d19, d17, d19
+
+    ; flatmask4
+    vabd.u8     d25, d7, d5                ; abs(p0 - p2)
+    vabd.u8     d26, d8, d10               ; abs(q0 - q2)
+    vabd.u8     d27, d4, d7                ; abs(p3 - p0)
+    vabd.u8     d28, d11, d8               ; abs(q3 - q0)
+
+    ; only compare the largest value to thresh
+    vmax.u8     d25, d25, d26              ; max(abs(p0 - p2), abs(q0 - q2))
+    vmax.u8     d26, d27, d28              ; max(abs(p3 - p0), abs(q3 - q0))
+    vmax.u8     d25, d25, d26
+    vmax.u8     d20, d20, d25              ; also abs(p1 - p0), abs(q1 - q0)
+
+    vshr.u8     d23, d23, #1               ; a = a / 2
+    vqadd.u8    d24, d24, d23              ; a = b + a
+
+    vmov.u8     d30, #1                    ; flat / flat2 threshold
+    vcge.u8     d24, d16, d24              ; blimit >= a
+
+    vcge.u8     d20, d30, d20              ; flat
+
+    vand        d19, d19, d24              ; mask
+
+    ; hevmask
+    vcgt.u8     d21, d21, d18              ; (abs(p1 - p0) > thresh)*-1
+    vcgt.u8     d22, d22, d18              ; (abs(q1 - q0) > thresh)*-1
+    vorr        d21, d21, d22              ; hev
+
+    vand        d16, d20, d19              ; flat && mask
+    vmov        r5, r6, d16                ; copy all 8 lanes to r5:r6
+    orrs        r5, r5, r6                 ; Check for 0
+    orreq       r7, r7, #1                 ; Only do filter branch
+
+    ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7)
+    vabd.u8     d22, d3, d7                ; abs(p4 - p0)
+    vabd.u8     d23, d12, d8               ; abs(q4 - q0)
+    vabd.u8     d24, d7, d2                ; abs(p0 - p5)
+    vabd.u8     d25, d8, d13               ; abs(q0 - q5)
+    vabd.u8     d26, d1, d7                ; abs(p6 - p0)
+    vabd.u8     d27, d14, d8               ; abs(q6 - q0)
+    vabd.u8     d28, d0, d7                ; abs(p7 - p0)
+    vabd.u8     d29, d15, d8               ; abs(q7 - q0)
+
+    ; only compare the largest value to thresh
+    vmax.u8     d22, d22, d23              ; max(abs(p4 - p0), abs(q4 - q0))
+    vmax.u8     d23, d24, d25              ; max(abs(p0 - p5), abs(q0 - q5))
+    vmax.u8     d24, d26, d27              ; max(abs(p6 - p0), abs(q6 - q0))
+    vmax.u8     d25, d28, d29              ; max(abs(p7 - p0), abs(q7 - q0))
+
+    vmax.u8     d26, d22, d23
+    vmax.u8     d27, d24, d25
+    vmax.u8     d23, d26, d27
+
+    vcge.u8     d18, d30, d23              ; flat2
+
+    vmov.u8     d22, #0x80                 ; sign-bit flip constant
+
+    vand        d17, d18, d16              ; flat2 && flat && mask
+    vmov        r5, r6, d17                ; copy all 8 lanes to r5:r6
+    orrs        r5, r5, r6                 ; Check for 0
+    orreq       r7, r7, #2                 ; Only do mbfilter branch
+
+    ; mbfilter() function
+
+    ; filter() function
+    ; convert to signed
+    veor        d23, d8, d22               ; qs0
+    veor        d24, d7, d22               ; ps0
+    veor        d25, d6, d22               ; ps1
+    veor        d26, d9, d22               ; qs1
+
+    vmov.u8     d27, #3
+
+    vsub.s8     d28, d23, d24              ; ( qs0 - ps0)
+
+    vqsub.s8    d29, d25, d26              ; filter = clamp(ps1-qs1)
+
+    vmull.s8    q15, d28, d27              ; 3 * ( qs0 - ps0)
+
+    vand        d29, d29, d21              ; filter &= hev
+
+    vaddw.s8    q15, q15, d29              ; filter + 3 * (qs0 - ps0)
+
+    vmov.u8     d29, #4
+
+    ; filter = clamp(filter + 3 * ( qs0 - ps0))
+    vqmovn.s16  d28, q15
+
+    vand        d28, d28, d19              ; filter &= mask
+
+    vqadd.s8    d30, d28, d27              ; filter2 = clamp(filter+3)
+    vqadd.s8    d29, d28, d29              ; filter1 = clamp(filter+4)
+    vshr.s8     d30, d30, #3               ; filter2 >>= 3
+    vshr.s8     d29, d29, #3               ; filter1 >>= 3
+
+
+    vqadd.s8    d24, d24, d30              ; op0 = clamp(ps0 + filter2)
+    vqsub.s8    d23, d23, d29              ; oq0 = clamp(qs0 - filter1)
+
+    ; outer tap adjustments: ++filter1 >> 1
+    vrshr.s8    d29, d29, #1
+    vbic        d29, d29, d21              ; filter &= ~hev
+
+    vqadd.s8    d25, d25, d29              ; op1 = clamp(ps1 + filter)
+    vqsub.s8    d26, d26, d29              ; oq1 = clamp(qs1 - filter)
+
+    veor        d24, d24, d22              ; *f_op0 = u^0x80
+    veor        d23, d23, d22              ; *f_oq0 = u^0x80
+    veor        d25, d25, d22              ; *f_op1 = u^0x80
+    veor        d26, d26, d22              ; *f_oq1 = u^0x80
+
+    tst         r7, #1                     ; flat && mask zero everywhere?
+    bxne        lr                         ; yes: d23-d26 hold the result
+
+    ; mbfilter flat && mask branch
+    ; TODO(fgalligan): Can I decrease the cycles shifting to consecutive d's
+    ; and using vbit on the q's?
+    vmov.u8     d29, #2
+    vaddl.u8    q15, d7, d8                ; op2 = p0 + q0
+    vmlal.u8    q15, d4, d27               ; op2 = p0 + q0 + p3 * 3
+    vmlal.u8    q15, d5, d29               ; op2 = p0 + q0 + p3 * 3 + p2 * 2
+    vaddw.u8    q15, d6                    ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2
+    vqrshrn.u16 d18, q15, #3               ; r_op2
+
+    vsubw.u8    q15, d4                    ; op1 = op2 - p3
+    vsubw.u8    q15, d5                    ; op1 -= p2
+    vaddw.u8    q15, d6                    ; op1 += p1
+    vaddw.u8    q15, d9                    ; op1 += q1
+    vqrshrn.u16 d19, q15, #3               ; r_op1
+
+    vsubw.u8    q15, d4                    ; op0 = op1 - p3
+    vsubw.u8    q15, d6                    ; op0 -= p1
+    vaddw.u8    q15, d7                    ; op0 += p0
+    vaddw.u8    q15, d10                   ; op0 += q2
+    vqrshrn.u16 d20, q15, #3               ; r_op0
+
+    vsubw.u8    q15, d4                    ; oq0 = op0 - p3
+    vsubw.u8    q15, d7                    ; oq0 -= p0
+    vaddw.u8    q15, d8                    ; oq0 += q0
+    vaddw.u8    q15, d11                   ; oq0 += q3
+    vqrshrn.u16 d21, q15, #3               ; r_oq0
+
+    vsubw.u8    q15, d5                    ; oq1 = oq0 - p2
+    vsubw.u8    q15, d8                    ; oq1 -= q0
+    vaddw.u8    q15, d9                    ; oq1 += q1
+    vaddw.u8    q15, d11                   ; oq1 += q3
+    vqrshrn.u16 d22, q15, #3               ; r_oq1
+
+    vsubw.u8    q15, d6                    ; oq2 = oq1 - p1
+    vsubw.u8    q15, d9                    ; oq2 -= q1
+    vaddw.u8    q15, d10                   ; oq2 += q2
+    vaddw.u8    q15, d11                   ; oq2 += q3
+    vqrshrn.u16 d27, q15, #3               ; r_oq2
+
+    ; Filter does not set op2 or oq2, so use p2 and q2.
+    vbif        d18, d5, d16               ; t_op2 |= p2 & ~(flat & mask)
+    vbif        d19, d25, d16              ; t_op1 |= f_op1 & ~(flat & mask)
+    vbif        d20, d24, d16              ; t_op0 |= f_op0 & ~(flat & mask)
+    vbif        d21, d23, d16              ; t_oq0 |= f_oq0 & ~(flat & mask)
+    vbif        d22, d26, d16              ; t_oq1 |= f_oq1 & ~(flat & mask)
+
+    vbit        d23, d27, d16              ; t_oq2 |= r_oq2 & (flat & mask)
+    vbif        d23, d10, d16              ; t_oq2 |= q2 & ~(flat & mask)
+
+    tst         r7, #2                     ; flat2 && flat && mask all zero?
+    bxne        lr                         ; yes: d18-d23 hold the result
+
+    ; wide_mbfilter flat2 && flat && mask branch
+    vmov.u8     d16, #7                    ; multiplier for p7
+    vaddl.u8    q15, d7, d8                ; op6 = p0 + q0
+    vmlal.u8    q15, d0, d16               ; op6 += p7 * 7
+    vmlal.u8    q15, d1, d29               ; op6 += p6 * 2
+    vaddw.u8    q15, d2                    ; op6 += p5
+    vaddw.u8    q15, d3                    ; op6 += p4
+    vaddw.u8    q15, d4                    ; op6 += p3
+    vaddw.u8    q15, d5                    ; op6 += p2
+    vaddw.u8    q15, d6                    ; op6 += p1
+    vqrshrn.u16 d16, q15, #4               ; w_op6
+
+    vsubw.u8    q15, d0                    ; op5 = op6 - p7
+    vsubw.u8    q15, d1                    ; op5 -= p6
+    vaddw.u8    q15, d2                    ; op5 += p5
+    vaddw.u8    q15, d9                    ; op5 += q1
+    vqrshrn.u16 d24, q15, #4               ; w_op5
+
+    vsubw.u8    q15, d0                    ; op4 = op5 - p7
+    vsubw.u8    q15, d2                    ; op4 -= p5
+    vaddw.u8    q15, d3                    ; op4 += p4
+    vaddw.u8    q15, d10                   ; op4 += q2
+    vqrshrn.u16 d25, q15, #4               ; w_op4
+
+    vsubw.u8    q15, d0                    ; op3 = op4 - p7
+    vsubw.u8    q15, d3                    ; op3 -= p4
+    vaddw.u8    q15, d4                    ; op3 += p3
+    vaddw.u8    q15, d11                   ; op3 += q3
+    vqrshrn.u16 d26, q15, #4               ; w_op3
+
+    vsubw.u8    q15, d0                    ; op2 = op3 - p7
+    vsubw.u8    q15, d4                    ; op2 -= p3
+    vaddw.u8    q15, d5                    ; op2 += p2
+    vaddw.u8    q15, d12                   ; op2 += q4
+    vqrshrn.u16 d27, q15, #4               ; w_op2
+
+    vbif        d27, d18, d17              ; op2 |= t_op2 & ~(f2 & f & m)
+
+    vsubw.u8    q15, d0                    ; op1 = op2 - p7
+    vsubw.u8    q15, d5                    ; op1 -= p2
+    vaddw.u8    q15, d6                    ; op1 += p1
+    vaddw.u8    q15, d13                   ; op1 += q5
+    vqrshrn.u16 d18, q15, #4               ; w_op1
+
+    vbif        d18, d19, d17              ; op1 |= t_op1 & ~(f2 & f & m)
+
+    vsubw.u8    q15, d0                    ; op0 = op1 - p7
+    vsubw.u8    q15, d6                    ; op0 -= p1
+    vaddw.u8    q15, d7                    ; op0 += p0
+    vaddw.u8    q15, d14                   ; op0 += q6
+    vqrshrn.u16 d19, q15, #4               ; w_op0
+
+    vbif        d19, d20, d17              ; op0 |= t_op0 & ~(f2 & f & m)
+
+    vsubw.u8    q15, d0                    ; oq0 = op0 - p7
+    vsubw.u8    q15, d7                    ; oq0 -= p0
+    vaddw.u8    q15, d8                    ; oq0 += q0
+    vaddw.u8    q15, d15                   ; oq0 += q7
+    vqrshrn.u16 d20, q15, #4               ; w_oq0
+
+    vbif        d20, d21, d17              ; oq0 |= t_oq0 & ~(f2 & f & m)
+
+    vsubw.u8    q15, d1                    ; oq1 = oq0 - p6
+    vsubw.u8    q15, d8                    ; oq1 -= q0
+    vaddw.u8    q15, d9                    ; oq1 += q1
+    vaddw.u8    q15, d15                   ; oq1 += q7
+    vqrshrn.u16 d21, q15, #4               ; w_oq1
+
+    vbif        d21, d22, d17              ; oq1 |= t_oq1 & ~(f2 & f & m)
+
+    vsubw.u8    q15, d2                    ; oq2 = oq1 - p5
+    vsubw.u8    q15, d9                    ; oq2 -= q1
+    vaddw.u8    q15, d10                   ; oq2 += q2
+    vaddw.u8    q15, d15                   ; oq2 += q7
+    vqrshrn.u16 d22, q15, #4               ; w_oq2
+
+    vbif        d22, d23, d17              ; oq2 |= t_oq2 & ~(f2 & f & m)
+
+    vsubw.u8    q15, d3                    ; oq3 = oq2 - p4
+    vsubw.u8    q15, d10                   ; oq3 -= q2
+    vaddw.u8    q15, d11                   ; oq3 += q3
+    vaddw.u8    q15, d15                   ; oq3 += q7
+    vqrshrn.u16 d23, q15, #4               ; w_oq3
+
+    vbif        d16, d1, d17               ; op6 |= p6 & ~(f2 & f & m)
+
+    vsubw.u8    q15, d4                    ; oq4 = oq3 - p3
+    vsubw.u8    q15, d11                   ; oq4 -= q3
+    vaddw.u8    q15, d12                   ; oq4 += q4
+    vaddw.u8    q15, d15                   ; oq4 += q7
+    vqrshrn.u16 d1, q15, #4                ; w_oq4
+
+    vbif        d24, d2, d17               ; op5 |= p5 & ~(f2 & f & m)
+
+    vsubw.u8    q15, d5                    ; oq5 = oq4 - p2
+    vsubw.u8    q15, d12                   ; oq5 -= q4
+    vaddw.u8    q15, d13                   ; oq5 += q5
+    vaddw.u8    q15, d15                   ; oq5 += q7
+    vqrshrn.u16 d2, q15, #4                ; w_oq5
+
+    vbif        d25, d3, d17               ; op4 |= p4 & ~(f2 & f & m)
+
+    vsubw.u8    q15, d6                    ; oq6 = oq5 - p1
+    vsubw.u8    q15, d13                   ; oq6 -= q5
+    vaddw.u8    q15, d14                   ; oq6 += q6
+    vaddw.u8    q15, d15                   ; oq6 += q7
+    vqrshrn.u16 d3, q15, #4                ; w_oq6
+
+    vbif        d26, d4, d17               ; op3 |= p3 & ~(f2 & f & m)
+    vbif        d23, d11, d17              ; oq3 |= q3 & ~(f2 & f & m)
+    vbif        d1, d12, d17               ; oq4 |= q4 & ~(f2 & f & m)
+    vbif        d2, d13, d17               ; oq5 |= q5 & ~(f2 & f & m)
+    vbif        d3, d14, d17               ; oq6 |= q6 & ~(f2 & f & m)
+
+    bx          lr
+    ENDP        ; |vp9_wide_mbfilter_neon|
+
+    END
diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
index 8e4aadac2..f82966577 100644
--- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
+++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm
@@ -22,8 +22,8 @@
     MACRO
     IDCT8x8_1D
     ; stage 1
-    vdup.16         d0, r3;                   ; duplicate cospi_28_64
-    vdup.16         d1, r4;                   ; duplicate cospi_4_64
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r4                    ; duplicate cospi_4_64
 
     ; input[1] * cospi_28_64
     vmull.s16       q2, d18, d0
@@ -57,8 +57,8 @@
     vqrshrn.s32     d14, q2, #14              ; >> 14
     vqrshrn.s32     d15, q3, #14              ; >> 14
 
-    vdup.16         d0, r5;                   ; duplicate cospi_12_64
-    vdup.16         d1, r6;                   ; duplicate cospi_20_64
+    vdup.16         d0, r5                    ; duplicate cospi_12_64
+    vdup.16         d1, r6                    ; duplicate cospi_20_64
 
     ; input[5] * cospi_12_64
     vmull.s16       q2, d26, d0
@@ -93,7 +93,7 @@
     vqrshrn.s32     d13, q1, #14              ; >> 14
 
     ; stage 2 & stage 3 - even half
-    vdup.16         d0, r7;                   ; duplicate cospi_16_64
+    vdup.16         d0, r7                    ; duplicate cospi_16_64
 
     ; input[0] * cospi_16_64
     vmull.s16       q2, d16, d0
@@ -128,8 +128,8 @@
     vqrshrn.s32     d23, q3, #14              ; >> 14
 
     ; input[1] * cospi_24_64 - input[3] * cospi_8_64
-    vdup.16         d0, r8;                   ; duplicate cospi_24_64
-    vdup.16         d1, r9;                   ; duplicate cospi_8_64
+    vdup.16         d0, r8                    ; duplicate cospi_24_64
+    vdup.16         d1, r9                    ; duplicate cospi_8_64
 
     ; input[1] * cospi_24_64
     vmull.s16       q2, d20, d0
@@ -176,7 +176,7 @@
     vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
 
     ; stage 3 -odd half
-    vdup.16         d16, r7;                   ; duplicate cospi_16_64
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
 
     ; step2[6] * cospi_16_64
     vmull.s16       q9, d28, d16
@@ -211,14 +211,14 @@
     vqrshrn.s32     d13, q10, #14             ; >> 14
 
     ; stage 4
-    vadd.s16        q8, q0, q7;               ; output[0] = step1[0] + step1[7];
-    vadd.s16        q9, q1, q6;               ; output[1] = step1[1] + step1[6];
-    vadd.s16        q10, q2, q5;              ; output[2] = step1[2] + step1[5];
-    vadd.s16        q11, q3, q4;              ; output[3] = step1[3] + step1[4];
-    vsub.s16        q12, q3, q4;              ; output[4] = step1[3] - step1[4];
-    vsub.s16        q13, q2, q5;              ; output[5] = step1[2] - step1[5];
-    vsub.s16        q14, q1, q6;              ; output[6] = step1[1] - step1[6];
-    vsub.s16        q15, q0, q7;              ; output[7] = step1[0] - step1[7];
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
     MEND
 
     ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
@@ -310,14 +310,14 @@
     mov             r0, r1
 
     ; load destination data
-    vld1.u8         {d0}, [r1], r2
-    vld1.u8         {d1}, [r1], r2
-    vld1.s16        {d2}, [r1], r2
-    vld1.s16        {d3}, [r1], r2
-    vld1.s16        {d4}, [r1], r2
-    vld1.s16        {d5}, [r1], r2
-    vld1.s16        {d6}, [r1], r2
-    vld1.s16        {d7}, [r1]
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
 
     ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
     vaddw.u8        q8, q8, d0
diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h
index 129711412..f68c5c6ea 100644
--- a/libvpx/vp9/common/vp9_blockd.h
+++ b/libvpx/vp9/common/vp9_blockd.h
@@ -26,9 +26,6 @@
 #include "vp9/common/vp9_treecoder.h"
 
 #define BLOCK_SIZE_GROUPS   4
-
-#define PREDICTION_PROBS 3
-
 #define MBSKIP_CONTEXTS 3
 
 /* Segment Feature Masks */
@@ -164,6 +161,11 @@ typedef struct {
   union b_mode_info bmi[4];
 } MODE_INFO;
 
+static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
+  return mbmi->ref_frame[0] > INTRA_FRAME;  // first reference is not intra
+}
+
+
 enum mv_precision {
   MV_PRECISION_Q3,
   MV_PRECISION_Q4
@@ -286,22 +288,22 @@ typedef struct macroblockd {
 
 static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
   switch (subsize) {
-    case BLOCK_SIZE_SB64X64:
-    case BLOCK_SIZE_SB64X32:
-    case BLOCK_SIZE_SB32X64:
-    case BLOCK_SIZE_SB32X32:
+    case BLOCK_64X64:
+    case BLOCK_64X32:
+    case BLOCK_32X64:
+    case BLOCK_32X32:
       return &xd->sb_index;
-    case BLOCK_SIZE_SB32X16:
-    case BLOCK_SIZE_SB16X32:
-    case BLOCK_SIZE_MB16X16:
+    case BLOCK_32X16:
+    case BLOCK_16X32:
+    case BLOCK_16X16:
       return &xd->mb_index;
-    case BLOCK_SIZE_SB16X8:
-    case BLOCK_SIZE_SB8X16:
-    case BLOCK_SIZE_SB8X8:
+    case BLOCK_16X8:
+    case BLOCK_8X16:
+    case BLOCK_8X8:
       return &xd->b_index;
-    case BLOCK_SIZE_SB8X4:
-    case BLOCK_SIZE_SB4X8:
-    case BLOCK_SIZE_AB4X4:
+    case BLOCK_8X4:
+    case BLOCK_4X8:
+    case BLOCK_4X4:
       return &xd->ab_index;
     default:
       assert(0);
@@ -315,7 +317,7 @@ static INLINE void update_partition_context(MACROBLOCKD *xd,
   const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
   const int bwl = b_width_log2(sb_type);
   const int bhl = b_height_log2(sb_type);
-  const int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+  const int boffset = b_width_log2(BLOCK_64X64) - bsl;
   const char pcval0 = ~(0xe << boffset);
   const char pcval1 = ~(0xf << boffset);
   const char pcvalue[2] = {pcval0, pcval1};
@@ -333,7 +335,7 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd,
                                           BLOCK_SIZE_TYPE sb_type) {
   int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
   int above = 0, left = 0, i;
-  int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl;
+  int boffset = mi_width_log2(BLOCK_64X64) - bsl;
 
   assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
   assert(bsl >= 0);
@@ -366,10 +368,10 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
 
   if (plane_type != PLANE_TYPE_Y_WITH_DC ||
       xd->lossless ||
-      mbmi->ref_frame[0] != INTRA_FRAME)
+      is_inter_block(mbmi))
     return DCT_DCT;
 
-  return mode2txfm_map[mbmi->sb_type < BLOCK_SIZE_SB8X8 ?
+  return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ?
                        mi->bmi[ib].as_mode : mbmi->mode];
 }
 
@@ -496,16 +498,16 @@ static INLINE void foreach_transformed_block_in_plane(
     // it to 4x4 block sizes.
     if (xd->mb_to_right_edge < 0)
       max_blocks_wide +=
-          + (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));
+          (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x));
 
     if (xd->mb_to_bottom_edge < 0)
       max_blocks_high +=
-          + (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));
+          (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y));
 
     i = 0;
     // Unlike the normal case - in here we have to keep track of the
     // row and column of the blocks we use so that we know if we are in
-    // the unrestricted motion border..
+    // the unrestricted motion border.
     for (r = 0; r < (1 << sh); r += (1 << tx_size)) {
       for (c = 0; c < (1 << sw); c += (1 << tx_size)) {
         if (r < max_blocks_high && c < max_blocks_wide)
@@ -563,8 +565,8 @@ static INLINE void foreach_predicted_block_in_plane(
   // size of the predictor to use.
   int pred_w, pred_h;
 
-  if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
-    assert(bsize == BLOCK_SIZE_SB8X8);
+  if (xd->mode_info_context->mbmi.sb_type < BLOCK_8X8) {
+    assert(bsize == BLOCK_8X8);
     pred_w = 0;
     pred_h = 0;
   } else {
@@ -689,46 +691,39 @@ static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block,
   }
 }
 static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize,
-                                   int plane, int ss_tx_size, int eob, int aoff,
-                                   int loff, ENTROPY_CONTEXT *A,
-                                   ENTROPY_CONTEXT *L) {
-  const int bw = b_width_log2(bsize), bh = b_height_log2(bsize);
-  const int sw = bw - xd->plane[plane].subsampling_x;
-  const int sh = bh - xd->plane[plane].subsampling_y;
-  int mi_blocks_wide = 1 << sw;
-  int mi_blocks_high = 1 << sh;
-  int tx_size_in_blocks = (1 << ss_tx_size);
+                                   int plane, int tx_size_in_blocks,
+                                   int eob, int aoff, int loff,
+                                   ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
+  struct macroblockd_plane *pd = &xd->plane[plane];
   int above_contexts = tx_size_in_blocks;
   int left_contexts = tx_size_in_blocks;
+  int mi_blocks_wide = 1 << plane_block_width_log2by4(bsize, pd);
+  int mi_blocks_high = 1 << plane_block_height_log2by4(bsize, pd);
   int pt;
 
   // xd->mb_to_right_edge is in units of pixels * 8.  This converts
   // it to 4x4 block sizes.
-  if (xd->mb_to_right_edge < 0) {
-    mi_blocks_wide += (xd->mb_to_right_edge
-        >> (5 + xd->plane[plane].subsampling_x));
-  }
+  if (xd->mb_to_right_edge < 0)
+    mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
 
   // this code attempts to avoid copying into contexts that are outside
   // our border.  Any blocks that do are set to 0...
   if (above_contexts + aoff > mi_blocks_wide)
     above_contexts = mi_blocks_wide - aoff;
 
-  if (xd->mb_to_bottom_edge < 0) {
-    mi_blocks_high += (xd->mb_to_bottom_edge
-        >> (5 + xd->plane[plane].subsampling_y));
-  }
-  if (left_contexts + loff > mi_blocks_high) {
+  if (xd->mb_to_bottom_edge < 0)
+    mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+  if (left_contexts + loff > mi_blocks_high)
     left_contexts = mi_blocks_high - loff;
-  }
 
   for (pt = 0; pt < above_contexts; pt++)
     A[pt] = eob > 0;
-  for (pt = above_contexts; pt < (1 << ss_tx_size); pt++)
+  for (pt = above_contexts; pt < tx_size_in_blocks; pt++)
     A[pt] = 0;
   for (pt = 0; pt < left_contexts; pt++)
     L[pt] = eob > 0;
-  for (pt = left_contexts; pt < (1 << ss_tx_size); pt++)
+  for (pt = left_contexts; pt < tx_size_in_blocks; pt++)
     L[pt] = 0;
 }
 
diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c
index dee44ec63..fdf37e46a 100644
--- a/libvpx/vp9/common/vp9_common_data.c
+++ b/libvpx/vp9/common/vp9_common_data.c
@@ -31,6 +31,14 @@ const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] =
 const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] =
   {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
 
+// MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize)))
+const int size_group_lookup[BLOCK_SIZE_TYPES] =
+  {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};
+
+const int num_pels_log2_lookup[BLOCK_SIZE_TYPES] =
+  {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};  // log2(width * height)
+
+
 const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
   {  // 4X4
     // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
@@ -40,25 +48,25 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
     PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
     PARTITION_INVALID
   }, {  // 8X8
-  // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
     PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
     PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
     PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
     PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID
   }, {  // 16X16
-  // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
     PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
     PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
     PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
     PARTITION_INVALID, PARTITION_INVALID
   }, {  // 32X32
-  // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
     PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
     PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT,
     PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
     PARTITION_INVALID, PARTITION_INVALID
   }, {  // 64X64
-  // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
+    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
     PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
     PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
     PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ,
@@ -68,29 +76,29 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = {
 
 const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = {
   {     // PARTITION_NONE
-    BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB8X4,
-    BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB16X8,
-    BLOCK_SIZE_MB16X16, BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB32X16,
-    BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64, BLOCK_SIZE_SB64X32,
-    BLOCK_SIZE_SB64X64,
+    BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+    BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
+    BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
+    BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+    BLOCK_64X64,
   }, {  // PARTITION_HORZ
     BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_SB8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_SB16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_SB32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_SB64X32,
+    BLOCK_8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_64X32,
   }, {  // PARTITION_VERT
     BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_SB4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_SB8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_SB16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_SB32X64,
+    BLOCK_4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_32X64,
   }, {  // PARTITION_SPLIT
     BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_AB4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_SB8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_MB16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
-    BLOCK_SIZE_SB32X32,
+    BLOCK_4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES,
+    BLOCK_32X32,
   }
 };
 
@@ -108,14 +116,9 @@ const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = {
 };
 
 const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = {
-  {BLOCK_SIZE_AB4X4,   BLOCK_SIZE_SB4X8,   BLOCK_SIZE_SB4X8,
-    BLOCK_SIZE_SB4X8,   BLOCK_SIZE_SB4X8},
-  {BLOCK_SIZE_SB8X4,   BLOCK_SIZE_SB8X8,   BLOCK_SIZE_SB8X16,
-    BLOCK_SIZE_SB8X16,  BLOCK_SIZE_SB8X16},
-  {BLOCK_SIZE_SB16X8,  BLOCK_SIZE_SB16X8,  BLOCK_SIZE_MB16X16,
-    BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB16X32},
-  {BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16,
-    BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64},
-  {BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32,
-    BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X64}
+  { BLOCK_4X4,   BLOCK_4X8,   BLOCK_4X8,   BLOCK_4X8,   BLOCK_4X8 },
+  { BLOCK_8X4,   BLOCK_8X8,   BLOCK_8X16,  BLOCK_8X16,  BLOCK_8X16 },
+  { BLOCK_16X8,  BLOCK_16X8,  BLOCK_16X16, BLOCK_16X32, BLOCK_16X32 },
+  { BLOCK_32X16, BLOCK_32X16, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64 },
+  { BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X64 }
 };
diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h
index 8b0f8a500..bc8c01a77 100644
--- a/libvpx/vp9/common/vp9_common_data.h
+++ b/libvpx/vp9/common/vp9_common_data.h
@@ -21,10 +21,9 @@ extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES];
 extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES];
 extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES];
 extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES];
-extern const PARTITION_TYPE
-  partition_lookup[][BLOCK_SIZE_TYPES];
-
-
+extern const int size_group_lookup[BLOCK_SIZE_TYPES];
+extern const int num_pels_log2_lookup[BLOCK_SIZE_TYPES];
+extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES];
 extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES];
 extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES];
 extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES];
diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c
index 0ad0dbccd..df3a9fed5 100644
--- a/libvpx/vp9/common/vp9_entropy.c
+++ b/libvpx/vp9/common/vp9_entropy.c
@@ -73,7 +73,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
   13, 11, 14, 15,
 };
 
-DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
   0,  8,  1, 16,  9,  2, 17, 24,
   10,  3, 18, 25, 32, 11,  4, 26,
   33, 19, 40, 12, 34, 27,  5, 41,
@@ -419,7 +419,7 @@ static void init_bit_trees() {
   init_bit_tree(cat6, 14);
 }
 
-vp9_extra_bit vp9_extra_bits[12] = {
+const vp9_extra_bit vp9_extra_bits[12] = {
   { 0, 0, 0, 0},
   { 0, 0, 0, 1},
   { 0, 0, 0, 2},
@@ -437,14 +437,10 @@ vp9_extra_bit vp9_extra_bits[12] = {
 #include "vp9/common/vp9_default_coef_probs.h"
 
 void vp9_default_coef_probs(VP9_COMMON *pc) {
-  vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4,
-             sizeof(pc->fc.coef_probs[TX_4X4]));
-  vpx_memcpy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8,
-             sizeof(pc->fc.coef_probs[TX_8X8]));
-  vpx_memcpy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16,
-             sizeof(pc->fc.coef_probs[TX_16X16]));
-  vpx_memcpy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32,
-             sizeof(pc->fc.coef_probs[TX_32X32]));
+  vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
+  vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
+  vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16);
+  vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
 }
 
 // Neighborhood 5-tuples for various scans and blocksizes,
@@ -613,17 +609,17 @@ void vp9_coef_tree_initialize() {
 #define COEF_COUNT_SAT_AFTER_KEY 24
 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
 
-static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size,
-                             int count_sat, int update_factor) {
+static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
+                             unsigned int count_sat,
+                             unsigned int update_factor) {
   FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
 
-  vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size];
-  vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[txfm_size];
-  vp9_coeff_count_model *coef_counts = cm->counts.coef[txfm_size];
+  vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size];
+  vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
+  vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size];
   unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
-      cm->counts.eob_branch[txfm_size];
-  int t, i, j, k, l, count;
-  int factor;
+      cm->counts.eob_branch[tx_size];
+  int t, i, j, k, l;
   unsigned int branch_ct[UNCONSTRAINED_NODES][2];
   vp9_prob coef_probs[UNCONSTRAINED_NODES];
   int entropy_nodes_adapt = UNCONSTRAINED_NODES;
@@ -634,29 +630,23 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size,
         for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
           if (l >= 3 && k == 0)
             continue;
-          vp9_tree_probs_from_distribution(
-              vp9_coefmodel_tree,
-              coef_probs, branch_ct,
-              coef_counts[i][j][k][l], 0);
+          vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs,
+                                           branch_ct, coef_counts[i][j][k][l],
+                                           0);
           branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
           coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
-          for (t = 0; t < entropy_nodes_adapt; ++t) {
-            count = branch_ct[t][0] + branch_ct[t][1];
-            count = count > count_sat ? count_sat : count;
-            factor = (update_factor * count / count_sat);
-            dst_coef_probs[i][j][k][l][t] =
-                weighted_prob(pre_coef_probs[i][j][k][l][t],
-                              coef_probs[t], factor);
-          }
+          for (t = 0; t < entropy_nodes_adapt; ++t)
+            dst_coef_probs[i][j][k][l][t] = merge_probs(
+                pre_coef_probs[i][j][k][l][t], coef_probs[t],
+                branch_ct[t], count_sat, update_factor);
         }
 }
 
 void vp9_adapt_coef_probs(VP9_COMMON *cm) {
   TX_SIZE t;
-  int count_sat;
-  int update_factor; /* denominator 256 */
+  unsigned int count_sat, update_factor;
 
-  if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+  if (cm->frame_type == KEY_FRAME || cm->intra_only) {
     update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
     count_sat = COEF_COUNT_SAT_KEY;
   } else if (cm->last_frame_type == KEY_FRAME) {
diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h
index 4ea727ff4..861c0786c 100644
--- a/libvpx/vp9/common/vp9_entropy.h
+++ b/libvpx/vp9/common/vp9_entropy.h
@@ -50,7 +50,7 @@ typedef struct {
   int base_val;
 } vp9_extra_bit;
 
-extern vp9_extra_bit vp9_extra_bits[12];    /* indexed by token value */
+extern const vp9_extra_bit vp9_extra_bits[12];    /* indexed by token value */
 
 #define MAX_PROB                255
 #define DCT_MAX_VALUE           16384
@@ -80,7 +80,6 @@ extern vp9_extra_bit vp9_extra_bits[12];    /* indexed by token value */
    coefficient band (and since zigzag positions 0, 1, and 2 are in
    distinct bands). */
 
-/*# define DC_TOKEN_CONTEXTS        3*/ /* 00, 0!0, !0!0 */
 #define PREV_COEF_CONTEXTS          6
 
 // #define ENTROPY_STATS
@@ -102,7 +101,7 @@ extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
 extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
 extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
 
-extern DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]);
+extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
 
 extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
 extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
@@ -119,7 +118,7 @@ extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
 extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
 extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
 
-extern DECLARE_ALIGNED(64, int16_t, vp9_default_iscan_8x8[64]);
+extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
 
 extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
 extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c
index ca188e438..768e5f523 100644
--- a/libvpx/vp9/common/vp9_entropymode.c
+++ b/libvpx/vp9/common/vp9_entropymode.c
@@ -356,53 +356,15 @@ void vp9_entropy_mode_init() {
                               vp9_inter_mode_tree, NEARESTMV);
 }
 
-void vp9_accum_mv_refs(VP9_COMMON *pc,
-                       MB_PREDICTION_MODE m,
-                       const int context) {
-  unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] =
-      pc->counts.inter_mode;
-
-  if (m == ZEROMV) {
-    ++inter_mode_counts[context][0][0];
-  } else {
-    ++inter_mode_counts[context][0][1];
-    if (m == NEARESTMV) {
-      ++inter_mode_counts[context][1][0];
-    } else {
-      ++inter_mode_counts[context][1][1];
-      if (m == NEARMV) {
-        ++inter_mode_counts[context][2][0];
-      } else {
-        ++inter_mode_counts[context][2][1];
-      }
-    }
-  }
-}
-
 #define COUNT_SAT 20
 #define MAX_UPDATE_FACTOR 128
 
-static int update_ct(vp9_prob pre_prob, vp9_prob prob,
-                          unsigned int ct[2]) {
-  const int count = MIN(ct[0] + ct[1], COUNT_SAT);
-  const int factor = MAX_UPDATE_FACTOR * count / COUNT_SAT;
-  return weighted_prob(pre_prob, prob, factor);
+static int update_ct(vp9_prob pre_prob, vp9_prob prob, unsigned int ct[2]) {
+  return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
 }
 
 static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) {
-  return update_ct(pre_prob, get_binary_prob(ct[0], ct[1]), ct);
-}
-
-void vp9_adapt_mode_context(VP9_COMMON *pc) {
-  int i, j;
-  FRAME_CONTEXT *const fc = &pc->fc;
-  FRAME_CONTEXT *const pre_fc = &pc->frame_contexts[pc->frame_context_idx];
-  FRAME_COUNTS  *const counts = &pc->counts;
-
-  for (j = 0; j < INTER_MODE_CONTEXTS; j++)
-    for (i = 0; i < VP9_INTER_MODES - 1; i++)
-      fc->inter_mode_probs[j][i] = update_ct2(pre_fc->inter_mode_probs[j][i],
-                                              counts->inter_mode[j][i]);
+  return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
 }
 
 static void update_mode_probs(int n_modes,
@@ -440,6 +402,11 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
       fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j],
                                              counts->single_ref[i][j]);
 
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+    update_mode_probs(VP9_INTER_MODES, vp9_inter_mode_tree,
+                      counts->inter_mode[i], pre_fc->inter_mode_probs[i],
+                      fc->inter_mode_probs[i], NEARESTMV);
+
   for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
     update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree,
                       counts->y_mode[i], pre_fc->y_mode_prob[i],
@@ -466,25 +433,25 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
 
   if (cm->tx_mode == TX_MODE_SELECT) {
     int j;
-    unsigned int branch_ct_8x8p[TX_SIZE_MAX_SB - 3][2];
-    unsigned int branch_ct_16x16p[TX_SIZE_MAX_SB - 2][2];
-    unsigned int branch_ct_32x32p[TX_SIZE_MAX_SB - 1][2];
+    unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
+    unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
+    unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
 
     for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
       tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
-      for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j)
+      for (j = 0; j < TX_SIZES - 3; ++j)
         fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
                                              branch_ct_8x8p[j]);
 
       tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i],
                                        branch_ct_16x16p);
-      for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j)
+      for (j = 0; j < TX_SIZES - 2; ++j)
         fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
                                                branch_ct_16x16p[j]);
 
       tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i],
                                        branch_ct_32x32p);
-      for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j)
+      for (j = 0; j < TX_SIZES - 1; ++j)
         fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
                                                branch_ct_32x32p[j]);
     }
@@ -495,22 +462,24 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
                                      counts->mbskip[i]);
 }
 
-static void set_default_lf_deltas(MACROBLOCKD *xd) {
-  xd->lf.mode_ref_delta_enabled = 1;
-  xd->lf.mode_ref_delta_update = 1;
+static void set_default_lf_deltas(struct loopfilter *lf) {
+  lf->mode_ref_delta_enabled = 1;
+  lf->mode_ref_delta_update = 1;
 
-  xd->lf.ref_deltas[INTRA_FRAME] = 1;
-  xd->lf.ref_deltas[LAST_FRAME] = 0;
-  xd->lf.ref_deltas[GOLDEN_FRAME] = -1;
-  xd->lf.ref_deltas[ALTREF_FRAME] = -1;
+  lf->ref_deltas[INTRA_FRAME] = 1;
+  lf->ref_deltas[LAST_FRAME] = 0;
+  lf->ref_deltas[GOLDEN_FRAME] = -1;
+  lf->ref_deltas[ALTREF_FRAME] = -1;
 
-  xd->lf.mode_deltas[0] = 0;
-  xd->lf.mode_deltas[1] = 0;
+  lf->mode_deltas[0] = 0;
+  lf->mode_deltas[1] = 0;
 }
 
 void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
   // Reset the segment feature data to the default stats:
   // Features disabled, 0, with delta coding (Default state).
+  struct loopfilter *const lf = &xd->lf;
+
   int i;
   vp9_clearall_segfeatures(&xd->seg);
   xd->seg.abs_delta = SEGMENT_DELTADATA;
@@ -518,12 +487,12 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) {
     vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
 
   // Reset the mode ref deltas for loop filter
-  vp9_zero(xd->lf.last_ref_deltas);
-  vp9_zero(xd->lf.last_mode_deltas);
-  set_default_lf_deltas(xd);
+  vp9_zero(lf->last_ref_deltas);
+  vp9_zero(lf->last_mode_deltas);
+  set_default_lf_deltas(lf);
 
   // To force update of the sharpness
-  xd->lf.last_sharpness_level = -1;
+  lf->last_sharpness_level = -1;
 
   vp9_default_coef_probs(cm);
   vp9_init_mbmode_probs(cm);
diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h
index 8c14e7e17..17a7c2634 100644
--- a/libvpx/vp9/common/vp9_entropymode.h
+++ b/libvpx/vp9/common/vp9_entropymode.h
@@ -24,15 +24,15 @@
 struct VP9Common;
 
 struct tx_probs {
-  vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
-  vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
-  vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3];
+  vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
+  vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
+  vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
 };
 
 struct tx_counts {
-  unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB];
-  unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1];
-  unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2];
+  unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
+  unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
+  unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
 };
 
 extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
@@ -61,18 +61,12 @@ extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS];
 
 void vp9_entropy_mode_init();
 
-int vp9_mv_cont(const int_mv *l, const int_mv *a);
-
 void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd);
 
 void vp9_init_mbmode_probs(struct VP9Common *x);
 
-void vp9_adapt_mode_context(struct VP9Common *pc);
-
 void vp9_adapt_mode_probs(struct VP9Common *);
 
-void vp9_accum_mv_refs(struct VP9Common *pc, MB_PREDICTION_MODE m, int context);
-
 void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p,
                                       unsigned int (*ct_32x32p)[2]);
 void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p,
diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c
index 343b6241d..6cfc34697 100644
--- a/libvpx/vp9/common/vp9_entropymv.c
+++ b/libvpx/vp9/common/vp9_entropymv.c
@@ -16,7 +16,7 @@
 #define MV_MAX_UPDATE_FACTOR 128
 
 /* Integer pel reference mv threshold for use of high-precision 1/8 mv */
-#define COMPANDED_MVREF_THRESH    8
+#define COMPANDED_MVREF_THRESH 8
 
 const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = {
   -MV_JOINT_ZERO, 2,
@@ -107,12 +107,6 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
   return mv_class_base(c) + offset;
 }
 
-static void inc_mv_component_count(int v, nmv_component_counts *comp_counts,
-                                   int incr) {
-  assert (v != 0);
-  comp_counts->mvcount[MV_MAX + v] += incr;
-}
-
 static void inc_mv_component(int v, nmv_component_counts *comp_counts,
                              int incr, int usehp) {
   int s, z, c, o, d, e, f;
@@ -164,25 +158,19 @@ static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
   }
 }
 
-void vp9_inc_mv(const MV *mv,  nmv_context_counts *mvctx) {
+void vp9_inc_mv(const MV *mv,  nmv_context_counts *counts) {
   const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
-  mvctx->joints[j]++;
+  ++counts->joints[j];
+
   if (mv_joint_vertical(j))
-    inc_mv_component_count(mv->row, &mvctx->comps[0], 1);
+    ++counts->comps[0].mvcount[MV_MAX + mv->row];
 
   if (mv_joint_horizontal(j))
-    inc_mv_component_count(mv->col, &mvctx->comps[1], 1);
+    ++counts->comps[1].mvcount[MV_MAX + mv->col];
 }
 
-static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) {
-  const int count = MIN(ct[0] + ct[1], MV_COUNT_SAT);
-  if (count) {
-    const vp9_prob newp = get_binary_prob(ct[0], ct[1]);
-    const int factor = MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT;
-    *dest = weighted_prob(prep, newp, factor);
-  } else {
-    *dest = prep;
-  }
+static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
+  return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
 }
 
 void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) {
@@ -195,31 +183,22 @@ static unsigned int adapt_probs(unsigned int i,
                                 vp9_prob this_probs[],
                                 const vp9_prob last_probs[],
                                 const unsigned int num_events[]) {
-  vp9_prob this_prob;
 
-  const uint32_t left = tree[i] <= 0
+
+  const unsigned int left = tree[i] <= 0
           ? num_events[-tree[i]]
           : adapt_probs(tree[i], tree, this_probs, last_probs, num_events);
 
-  const uint32_t right = tree[i + 1] <= 0
+  const unsigned int right = tree[i + 1] <= 0
           ? num_events[-tree[i + 1]]
           : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events);
-
-  uint32_t weight = left + right;
-  if (weight) {
-    this_prob = get_binary_prob(left, right);
-    weight = weight > MV_COUNT_SAT ? MV_COUNT_SAT : weight;
-    this_prob = weighted_prob(last_probs[i >> 1], this_prob,
-                              MV_MAX_UPDATE_FACTOR * weight / MV_COUNT_SAT);
-  } else {
-    this_prob = last_probs[i >> 1];
-  }
-  this_probs[i >> 1] = this_prob;
+  const unsigned int ct[2] = { left, right };
+  this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct);
   return left + right;
 }
 
 
-void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) {
+void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
   int i, j;
 
   FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
@@ -228,36 +207,32 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) {
   nmv_context *pre_ctx = &pre_fc->nmvc;
   nmv_context_counts *cts = &cm->counts.mv;
 
-  vp9_counts_process(cts, usehp);
+  vp9_counts_process(cts, allow_hp);
 
   adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
 
   for (i = 0; i < 2; ++i) {
-    adapt_prob(&ctx->comps[i].sign, pre_ctx->comps[i].sign, cts->comps[i].sign);
+    ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign);
     adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes,
                 pre_ctx->comps[i].classes, cts->comps[i].classes);
     adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0,
                 pre_ctx->comps[i].class0, cts->comps[i].class0);
 
     for (j = 0; j < MV_OFFSET_BITS; ++j)
-      adapt_prob(&ctx->comps[i].bits[j], pre_ctx->comps[i].bits[j],
-                 cts->comps[i].bits[j]);
-  }
+        ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j],
+                                           cts->comps[i].bits[j]);
 
-  for (i = 0; i < 2; ++i) {
     for (j = 0; j < CLASS0_SIZE; ++j)
       adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j],
                   pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]);
 
     adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp,
                 cts->comps[i].fp);
-  }
 
-  if (usehp) {
-    for (i = 0; i < 2; ++i) {
-      adapt_prob(&ctx->comps[i].class0_hp, pre_ctx->comps[i].class0_hp,
-                 cts->comps[i].class0_hp);
-      adapt_prob(&ctx->comps[i].hp, pre_ctx->comps[i].hp, cts->comps[i].hp);
+    if (allow_hp) {
+      ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp,
+                                           cts->comps[i].class0_hp);
+      ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp);
     }
   }
 }
diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h
index 86f0d0bfd..3208b7270 100644
--- a/libvpx/vp9/common/vp9_enums.h
+++ b/libvpx/vp9/common/vp9_enums.h
@@ -54,7 +54,7 @@ typedef enum {
   TX_8X8 = 1,                      // 8x8 dct transform
   TX_16X16 = 2,                    // 16x16 dct transform
   TX_32X32 = 3,                    // 32x32 dct transform
-  TX_SIZE_MAX_SB,                  // Number of transforms available to SBs
+  TX_SIZES
 } TX_SIZE;
 
 typedef enum {
@@ -63,7 +63,7 @@ typedef enum {
   ALLOW_16X16         = 2,
   ALLOW_32X32         = 3,
   TX_MODE_SELECT      = 4,
-  NB_TXFM_MODES       = 5,
+  TX_MODES            = 5,
 } TX_MODE;
 
 typedef enum {
diff --git a/libvpx/vp9/common/vp9_extend.c b/libvpx/vp9/common/vp9_extend.c
index 95ec59061..d8496c4f2 100644
--- a/libvpx/vp9/common/vp9_extend.c
+++ b/libvpx/vp9/common/vp9_extend.c
@@ -8,9 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/vp9_extend.h"
 #include "vpx_mem/vpx_mem.h"
 
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_extend.h"
+
 static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
                                   uint8_t *dst, int dst_pitch,
                                   int w, int h,
@@ -107,14 +109,14 @@ void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
   const int src_y_offset = srcy * src->y_stride + srcx;
   const int dst_y_offset = srcy * dst->y_stride + srcx;
 
-  const int et_uv = (et_y + 1) >> 1;
-  const int el_uv = (el_y + 1) >> 1;
-  const int eb_uv = (eb_y + 1) >> 1;
-  const int er_uv = (er_y + 1) >> 1;
+  const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
+  const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
+  const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
+  const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
   const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
   const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
-  const int srch_uv = (srch + 1) >> 1;
-  const int srcw_uv = (srcw + 1) >> 1;
+  const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
+  const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
 
   copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
                         dst->y_buffer + dst_y_offset, dst->y_stride,
diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c
index 643b229a6..3af8b8d21 100644
--- a/libvpx/vp9/common/vp9_findnearmv.c
+++ b/libvpx/vp9/common/vp9_findnearmv.c
@@ -14,8 +14,9 @@
 #include "vp9/common/vp9_mvref_common.h"
 #include "vp9/common/vp9_sadmxn.h"
 
-static void lower_mv_precision(int_mv *mv, int usehp) {
-  if (!usehp || !vp9_use_mv_hp(&mv->as_mv)) {
+static void lower_mv_precision(int_mv *mv, int allow_hp) {
+  const int use_hp = allow_hp && vp9_use_mv_hp(&mv->as_mv);
+  if (!use_hp) {
     if (mv->as_mv.row & 1)
       mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
     if (mv->as_mv.col & 1)
@@ -32,7 +33,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
   // Make sure all the candidates are properly clamped etc
   for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
     lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv);
-    clamp_mv2(&mvlist[i], xd);
+    clamp_mv2(&mvlist[i].as_mv, xd);
   }
   *nearest = mvlist[0];
   *near = mvlist[1];
@@ -41,7 +42,8 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
 void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
                                    int_mv *dst_nearest,
                                    int_mv *dst_near,
-                                   int block_idx, int ref_idx) {
+                                   int block_idx, int ref_idx,
+                                   int mi_row, int mi_col) {
   int_mv dst_list[MAX_MV_REF_CANDIDATES];
   int_mv mv_list[MAX_MV_REF_CANDIDATES];
   MODE_INFO *mi = xd->mode_info_context;
@@ -53,7 +55,8 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
   vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context,
                        xd->prev_mode_info_context,
                        mbmi->ref_frame[ref_idx],
-                       mv_list, cm->ref_frame_sign_bias, block_idx);
+                       mv_list, cm->ref_frame_sign_bias, block_idx,
+                       mi_row, mi_col);
 
   dst_list[1].as_int = 0;
   if (block_idx == 0) {
diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h
index b0fa505b5..e5221ed67 100644
--- a/libvpx/vp9/common/vp9_findnearmv.h
+++ b/libvpx/vp9/common/vp9_findnearmv.h
@@ -29,31 +29,19 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
                            int_mv *near);
 
 // TODO(jingning): this mv clamping function should be block size dependent.
-static void clamp_mv(int_mv *mv,
-                     int mb_to_left_edge,
-                     int mb_to_right_edge,
-                     int mb_to_top_edge,
-                     int mb_to_bottom_edge) {
-  mv->as_mv.col = clamp(mv->as_mv.col, mb_to_left_edge, mb_to_right_edge);
-  mv->as_mv.row = clamp(mv->as_mv.row, mb_to_top_edge, mb_to_bottom_edge);
-}
-
-static int clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) {
-  int_mv tmp_mv;
-  tmp_mv.as_int = mv->as_int;
-  clamp_mv(mv,
-           xd->mb_to_left_edge - LEFT_TOP_MARGIN,
-           xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
-           xd->mb_to_top_edge - LEFT_TOP_MARGIN,
-           xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
-  return tmp_mv.as_int != mv->as_int;
+static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+  clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN,
+               xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+               xd->mb_to_top_edge - LEFT_TOP_MARGIN,
+               xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
 void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc,
                                    MACROBLOCKD *xd,
                                    int_mv *dst_nearest,
                                    int_mv *dst_near,
-                                   int block_idx, int ref_idx);
+                                   int block_idx, int ref_idx,
+                                   int mi_row, int mi_col);
 
 static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
   // FIXME(rbultje, jingning): temporary hack because jenkins doesn't
@@ -62,7 +50,7 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) {
     /* On L edge, get from MB to left of us */
     --cur_mb;
 
-    if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
+    if (is_inter_block(&cur_mb->mbmi)) {
       return DC_PRED;
     } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
       return ((cur_mb->bmi + 1 + b)->as_mode);
@@ -80,7 +68,7 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb,
     /* On top edge, get from MB above us */
     cur_mb -= mi_stride;
 
-    if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) {
+    if (is_inter_block(&cur_mb->mbmi)) {
       return DC_PRED;
     } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
       return ((cur_mb->bmi + 2 + b)->as_mode);
diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c
index a95560a55..a2245259e 100644
--- a/libvpx/vp9/common/vp9_idct.c
+++ b/libvpx/vp9/common/vp9_idct.c
@@ -225,6 +225,19 @@ void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   }
 }
 
+void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
+  int i, j;
+  int a1;
+  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i)
+      dest[i] = clip_pixel(dest[i] + a1);
+    dest += dest_stride;
+  }
+}
+
 static void iadst4_1d(int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -433,12 +446,6 @@ void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
   }
 }
 
-void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
-  out = dct_const_round_shift(out * cospi_16_64);
-  output[0] = ROUND_POWER_OF_TWO(out, 5);
-}
-
 static void idct16_1d(int16_t *input, int16_t *output) {
   int16_t step1[16], step2[16];
   int temp1, temp2;
@@ -857,10 +864,18 @@ void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
   }
 }
 
-void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
+void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest,
+                                 int dest_stride) {
+  int i, j;
+  int a1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
-  output[0] = ROUND_POWER_OF_TWO(out, 6);
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  for (j = 0; j < 16; ++j) {
+    for (i = 0; i < 16; ++i)
+      dest[i] = clip_pixel(dest[i] + a1);
+    dest += dest_stride;
+  }
 }
 
 static void idct32_1d(int16_t *input, int16_t *output) {
@@ -1259,29 +1274,3 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
   out = dct_const_round_shift(out * cospi_16_64);
   output[0] = ROUND_POWER_OF_TWO(out, 6);
 }
-
-void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
-                                  int dest_stride) {
-  int16_t out[32 * 32] = { 0 };
-  int16_t *outptr = out;
-  int i, j;
-  int16_t temp_in[32], temp_out[32];
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 4x4 area, we only need to calculate first 4 rows here.
-  for (i = 0; i < 4; ++i) {
-    idct32_1d(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j)
-      temp_in[j] = out[j * 32 + i];
-    idct32_1d(temp_in, temp_out);
-    for (j = 0; j < 32; ++j)
-      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * dest_stride + i]);
-  }
-}
diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c
index 5498b1717..66df62753 100644
--- a/libvpx/vp9/common/vp9_loopfilter.c
+++ b/libvpx/vp9/common/vp9_loopfilter.c
@@ -16,6 +16,12 @@
 
 #include "vp9/common/vp9_seg_common.h"
 
+struct loop_filter_info {
+  const uint8_t *mblim;
+  const uint8_t *lim;
+  const uint8_t *hev_thr;
+};
+
 static void lf_init_lut(loop_filter_info_n *lfi) {
   lfi->mode_lf_lut[DC_PRED] = 0;
   lfi->mode_lf_lut[D45_PRED] = 0;
@@ -73,13 +79,14 @@ void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) {
 
 void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
                                 int default_filt_lvl) {
-  int seg;
+  int seg_id;
   // n_shift is the a multiplier for lf_deltas
   // the multiplier is 1 for when filter_lvl is between 0 and 31;
   // 2 when filter_lvl is between 32 and 63
   const int n_shift = default_filt_lvl >> 5;
   loop_filter_info_n *const lfi = &cm->lf_info;
-  struct loopfilter *lf = &xd->lf;
+  struct loopfilter *const lf = &xd->lf;
+  struct segmentation *const seg = &xd->seg;
 
   // update limits if sharpness has changed
   if (lf->last_sharpness_level != lf->sharpness_level) {
@@ -87,13 +94,13 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
     lf->last_sharpness_level = lf->sharpness_level;
   }
 
-  for (seg = 0; seg < MAX_SEGMENTS; seg++) {
+  for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
     int lvl_seg = default_filt_lvl, ref, mode, intra_lvl;
 
     // Set the baseline filter values for each segment
-    if (vp9_segfeature_active(&xd->seg, seg, SEG_LVL_ALT_LF)) {
-      const int data = vp9_get_segdata(&xd->seg, seg, SEG_LVL_ALT_LF);
-      lvl_seg = xd->seg.abs_delta == SEGMENT_ABSDATA
+    if (vp9_segfeature_active(&xd->seg, seg_id, SEG_LVL_ALT_LF)) {
+      const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
+      lvl_seg = seg->abs_delta == SEGMENT_ABSDATA
                   ? data
                   : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER);
     }
@@ -101,18 +108,18 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd,
     if (!lf->mode_ref_delta_enabled) {
       // we could get rid of this if we assume that deltas are set to
       // zero when not in use; encoder always uses deltas
-      vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4);
+      vpx_memset(lfi->lvl[seg_id][0], lvl_seg, 4 * 4);
       continue;
     }
 
     intra_lvl = lvl_seg + (lf->ref_deltas[INTRA_FRAME] << n_shift);
-    lfi->lvl[seg][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
+    lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
 
     for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref)
       for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
         const int inter_lvl = lvl_seg + (lf->ref_deltas[ref] << n_shift)
                                       + (lf->mode_deltas[mode] << n_shift);
-        lfi->lvl[seg][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+        lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
       }
   }
 }
@@ -256,7 +263,7 @@ static void filter_block_plane(VP9_COMMON *const cm,
     // Determine the vertical edges that need filtering
     for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
       const int skip_this = mi[c].mbmi.mb_skip_coeff
-                            && mi[c].mbmi.ref_frame[0] != INTRA_FRAME;
+                            && is_inter_block(&mi[c].mbmi);
       // left edge of current unit is block/partition edge -> no skip
       const int block_edge_left = b_width_log2(mi[c].mbmi.sb_type) ?
           !(c & ((1 << (b_width_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1;
@@ -376,3 +383,11 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd,
   vp9_loop_filter_rows(cm->frame_to_show, cm, xd,
                        0, cm->mi_rows, y_only);
 }
+
+int vp9_loop_filter_worker(void *arg1, void *arg2) {
+  LFWorkerData *const lf_data = (LFWorkerData*)arg1;
+  (void)arg2;
+  vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, &lf_data->xd,
+                       lf_data->start, lf_data->stop, lf_data->y_only);
+  return 1;
+}
diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h
index e59cc6485..5fc909495 100644
--- a/libvpx/vp9/common/vp9_loopfilter.h
+++ b/libvpx/vp9/common/vp9_loopfilter.h
@@ -35,13 +35,6 @@ typedef struct {
   uint8_t mode_lf_lut[MB_MODE_COUNT];
 } loop_filter_info_n;
 
-struct loop_filter_info {
-  const uint8_t *mblim;
-  const uint8_t *lim;
-  const uint8_t *hev_thr;
-};
-
-
 /* assorted loopfilter functions which get used elsewhere */
 struct VP9Common;
 struct macroblockd;
@@ -64,4 +57,18 @@ void vp9_loop_filter_frame(struct VP9Common *cm,
 void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
                           struct VP9Common *cm, struct macroblockd *xd,
                           int start, int stop, int y_only);
+
+typedef struct LoopFilterWorkerData {
+  const YV12_BUFFER_CONFIG *frame_buffer;
+  struct VP9Common *cm;
+  struct macroblockd xd;  // TODO(jzern): most of this is unnecessary to the
+                          // loopfilter. the planes are necessary as their state
+                          // is changed during decode.
+  int start;
+  int stop;
+  int y_only;
+} LFWorkerData;
+
+// Operates on the rows described by LFWorkerData passed as 'arg1'.
+int vp9_loop_filter_worker(void *arg1, void *arg2);
 #endif  // VP9_COMMON_VP9_LOOPFILTER_H_
diff --git a/libvpx/vp9/common/vp9_mv.h b/libvpx/vp9/common/vp9_mv.h
index a095258be..31a79b984 100644
--- a/libvpx/vp9/common/vp9_mv.h
+++ b/libvpx/vp9/common/vp9_mv.h
@@ -13,6 +13,8 @@
 
 #include "vpx/vpx_integer.h"
 
+#include "vp9/common/vp9_common.h"
+
 typedef struct {
   int16_t row;
   int16_t col;
@@ -28,4 +30,10 @@ typedef struct {
   int32_t col;
 } MV32;
 
+static void clamp_mv(MV *mv, int min_col, int max_col,
+                             int min_row, int max_row) {
+  mv->col = clamp(mv->col, min_col, max_col);
+  mv->row = clamp(mv->row, min_row, max_row);
+}
+
 #endif  // VP9_COMMON_VP9_MV_H_
diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c
index ae009b0ff..3b72f41c2 100644
--- a/libvpx/vp9/common/vp9_mvref_common.c
+++ b/libvpx/vp9/common/vp9_mvref_common.c
@@ -11,6 +11,65 @@
 #include "vp9/common/vp9_mvref_common.h"
 
 #define MVREF_NEIGHBOURS 8
+
+typedef enum {
+  BOTH_ZERO = 0,
+  ZERO_PLUS_PREDICTED = 1,
+  BOTH_PREDICTED = 2,
+  NEW_PLUS_NON_INTRA = 3,
+  BOTH_NEW = 4,
+  INTRA_PLUS_NON_INTRA = 5,
+  BOTH_INTRA = 6,
+  INVALID_CASE = 9
+} motion_vector_context;
+
+// This is used to figure out a context for the ref blocks. The code flattens
+// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by
+// adding 9 for each intra block, 3 for each zero mv and 1 for each new
+// motion vector. This single number is then converted into a context
+// with a single lookup ( counter_to_context ).
+static const int mode_2_counter[MB_MODE_COUNT] = {
+  9,  // DC_PRED
+  9,  // V_PRED
+  9,  // H_PRED
+  9,  // D45_PRED
+  9,  // D135_PRED
+  9,  // D117_PRED
+  9,  // D153_PRED
+  9,  // D27_PRED
+  9,  // D63_PRED
+  9,  // TM_PRED
+  0,  // NEARESTMV
+  0,  // NEARMV
+  3,  // ZEROMV
+  1,  // NEWMV
+};
+
+// There are 3^3 different combinations of 3 counts that can be either 0,1 or
+// 2. However the actual count can never be greater than 2 so the highest
+// counter we need is 18. 9 is an invalid counter that's never used.
+static const int counter_to_context[19] = {
+  BOTH_PREDICTED,  // 0
+  NEW_PLUS_NON_INTRA,  // 1
+  BOTH_NEW,  // 2
+  ZERO_PLUS_PREDICTED,  // 3
+  NEW_PLUS_NON_INTRA,  // 4
+  INVALID_CASE,  // 5
+  BOTH_ZERO,  // 6
+  INVALID_CASE,  // 7
+  INVALID_CASE,  // 8
+  INTRA_PLUS_NON_INTRA,  // 9
+  INTRA_PLUS_NON_INTRA,  // 10
+  INVALID_CASE,  // 11
+  INTRA_PLUS_NON_INTRA,  // 12
+  INVALID_CASE,  // 13
+  INVALID_CASE,  // 14
+  INVALID_CASE,  // 15
+  INVALID_CASE,  // 16
+  INVALID_CASE,  // 17
+  BOTH_INTRA  // 18
+};
+
 static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = {
   // SB4X4
   {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}},
@@ -39,263 +98,212 @@ static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = {
   // SB64X64
   {{3, -1}, {-1, 3}, {4, -1}, {-1, 4}, {-1, -1}, {0, -1}, {-1, 0}, {6, -1}}
 };
+
+static const int idx_n_column_to_subblock[4][2] = {
+  {1, 2},
+  {1, 3},
+  {3, 2},
+  {3, 3}
+};
+
 // clamp_mv_ref
 #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
 
 static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) {
-  mv->as_mv.col = clamp(mv->as_mv.col, xd->mb_to_left_edge - MV_BORDER,
-                                       xd->mb_to_right_edge + MV_BORDER);
-  mv->as_mv.row = clamp(mv->as_mv.row, xd->mb_to_top_edge - MV_BORDER,
-                                       xd->mb_to_bottom_edge + MV_BORDER);
-}
-
-// Gets a candidate reference motion vector from the given mode info
-// structure if one exists that matches the given reference frame.
-static int get_matching_candidate(const MODE_INFO *candidate_mi,
-                                  MV_REFERENCE_FRAME ref_frame,
-                                  int_mv *c_mv, int block_idx) {
-  if (ref_frame == candidate_mi->mbmi.ref_frame[0]) {
-    if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
-      c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[0].as_int;
-    else
-      c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-  } else if (ref_frame == candidate_mi->mbmi.ref_frame[1]) {
-    if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8)
-      c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[1].as_int;
-    else
-      c_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-  } else {
-    return 0;
-  }
-
-  return 1;
+  clamp_mv(&mv->as_mv, xd->mb_to_left_edge - MV_BORDER,
+                       xd->mb_to_right_edge + MV_BORDER,
+                       xd->mb_to_top_edge - MV_BORDER,
+                       xd->mb_to_bottom_edge + MV_BORDER);
 }
 
-// Gets candidate reference motion vector(s) from the given mode info
-// structure if they exists and do NOT match the given reference frame.
-static void get_non_matching_candidates(const MODE_INFO *candidate_mi,
-                                        MV_REFERENCE_FRAME ref_frame,
-                                        MV_REFERENCE_FRAME *c_ref_frame,
-                                        int_mv *c_mv,
-                                        MV_REFERENCE_FRAME *c2_ref_frame,
-                                        int_mv *c2_mv) {
-
-  c_mv->as_int = 0;
-  c2_mv->as_int = 0;
-  *c_ref_frame = INTRA_FRAME;
-  *c2_ref_frame = INTRA_FRAME;
-
-  // If first candidate not valid neither will be.
-  if (candidate_mi->mbmi.ref_frame[0] > INTRA_FRAME) {
-    // First candidate
-    if (candidate_mi->mbmi.ref_frame[0] != ref_frame) {
-      *c_ref_frame = candidate_mi->mbmi.ref_frame[0];
-      c_mv->as_int = candidate_mi->mbmi.mv[0].as_int;
-    }
-
-    // Second candidate
-    if ((candidate_mi->mbmi.ref_frame[1] > INTRA_FRAME) &&
-        (candidate_mi->mbmi.ref_frame[1] != ref_frame) &&
-        (candidate_mi->mbmi.mv[1].as_int != candidate_mi->mbmi.mv[0].as_int)) {
-      *c2_ref_frame = candidate_mi->mbmi.ref_frame[1];
-      c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int;
-    }
-  }
+// This function returns either the appropriate sub block's or the block's
+// mv, depending on whether block_size < 8x8 and check_sub_blocks is set.
+static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate,
+                                      int check_sub_blocks, int which_mv,
+                                      int search_col, int block_idx) {
+  return (check_sub_blocks && candidate->mbmi.sb_type < BLOCK_SIZE_SB8X8
+          ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
+              .as_mv[which_mv]
+          : candidate->mbmi.mv[which_mv]);
 }
 
 
 // Performs mv sign inversion if indicated by the reference frame combination.
-static void scale_mv(MACROBLOCKD *xd, MV_REFERENCE_FRAME this_ref_frame,
-                     MV_REFERENCE_FRAME candidate_ref_frame,
-                     int_mv *candidate_mv, int *ref_sign_bias) {
+static INLINE int_mv scale_mv(const MODE_INFO *candidate, const int which_mv,
+                              const MV_REFERENCE_FRAME this_ref_frame,
+                              const int *ref_sign_bias) {
+  int_mv return_mv = candidate->mbmi.mv[which_mv];
 
   // Sign inversion where appropriate.
-  if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
-    candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
-    candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
+  if (ref_sign_bias[candidate->mbmi.ref_frame[which_mv]] !=
+      ref_sign_bias[this_ref_frame]) {
+    return_mv.as_mv.row *= -1;
+    return_mv.as_mv.col *= -1;
   }
+  return return_mv;
 }
 
-// Add a candidate mv.
-// Discard if it has already been seen.
-static void add_candidate_mv(int_mv *mv_list,  int *mv_scores,
-                             int *candidate_count, int_mv candidate_mv,
-                             int weight) {
-  if (*candidate_count == 0) {
-    mv_list[0].as_int = candidate_mv.as_int;
-    mv_scores[0] = weight;
-    *candidate_count += 1;
-  } else if ((*candidate_count == 1) &&
-             (candidate_mv.as_int != mv_list[0].as_int)) {
-    mv_list[1].as_int = candidate_mv.as_int;
-    mv_scores[1] = weight;
-    *candidate_count += 1;
+// This macro is used to add a motion vector to the mv_ref list if it isn't
+// already in the list.  If it's the second motion vector it will also
+// skip all additional processing and jump to the Done label.
+#define ADD_MV_REF_LIST(MV) \
+  if (refmv_count) { \
+    if ((MV).as_int != mv_ref_list[0].as_int) { \
+      mv_ref_list[refmv_count] = (MV); \
+      goto Done; \
+    } \
+  } else { \
+    mv_ref_list[refmv_count++] = (MV); \
+  }
+
+// If either reference frame is different from ref_frame, not INTRA, and the
+// two mvs are different from each other, scale and add the mv to our list.
+#define IF_DIFF_REF_FRAME_ADD_MV(CANDIDATE) \
+  if ((CANDIDATE)->mbmi.ref_frame[0] != ref_frame) { \
+    ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \
+  } \
+  if ((CANDIDATE)->mbmi.ref_frame[1] != ref_frame && \
+      (CANDIDATE)->mbmi.ref_frame[1] > INTRA_FRAME && \
+      (CANDIDATE)->mbmi.mv[1].as_int != (CANDIDATE)->mbmi.mv[0].as_int) { \
+    ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \
   }
+
+// Checks that the given mi_row, mi_col and search point
+// are inside the borders of the tile.
+static INLINE int is_inside(const int mi_col, const int mi_row,
+                            const int cur_tile_mi_col_start,
+                            const int cur_tile_mi_col_end, const int mi_rows,
+                            const int (*mv_ref_search)[2], int idx) {
+  int mi_search_col;
+  const int mi_search_row = mi_row + mv_ref_search[idx][1];;
+
+  // Check that the candidate is within the border.  We only need to check
+  // the left side because all the positive right side ones are for blocks that
+  // are large enough to support the + value they have within their border.
+  if (mi_search_row < 0)
+    return 0;
+
+  mi_search_col = mi_col + mv_ref_search[idx][0];
+  if (mi_search_col < cur_tile_mi_col_start)
+    return 0;
+
+  return 1;
 }
 
 // This function searches the neighbourhood of a given MB/SB
 // to try and find candidate reference vectors.
-//
 void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
-                          MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame,
-                          int_mv *mv_ref_list, int *ref_sign_bias,
-                          int block_idx) {
-  int i;
-  MODE_INFO *candidate_mi;
-  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-  int_mv c_refmv;
-  int_mv c2_refmv;
-  MV_REFERENCE_FRAME c_ref_frame;
-  MV_REFERENCE_FRAME c2_ref_frame;
-  int candidate_scores[MAX_MV_REF_CANDIDATES] = { 0 };
+                          const MODE_INFO *lf_here,
+                          const MV_REFERENCE_FRAME ref_frame,
+                          int_mv *mv_ref_list, const int *ref_sign_bias,
+                          const int block_idx,
+                          const int mi_row, const int mi_col) {
+  int idx;
+  MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   int refmv_count = 0;
   const int (*mv_ref_search)[2] = mv_ref_blocks[mbmi->sb_type];
-  const int mi_col = get_mi_col(xd);
-  const int mi_row = get_mi_row(xd);
-  int intra_count = 0;
-  int zero_count = 0;
-  int newmv_count = 0;
-  int x_idx = 0, y_idx = 0;
-
-  // Blank the reference vector lists and other local structures.
-  vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
-
-  if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
-    x_idx = block_idx & 1;
-    y_idx = block_idx >> 1;
-  }
-
-  // We first scan for candidate vectors that match the current reference frame
-  // Look at nearest neigbours
-  for (i = 0; i < 2; ++i) {
-    const int mi_search_col = mi_col + mv_ref_search[i][0];
-    const int mi_search_row = mi_row + mv_ref_search[i][1];
-    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
-        (mi_search_col < cm->cur_tile_mi_col_end) &&
-        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
-      int b;
-
-      candidate_mi = here + mv_ref_search[i][0] +
-                     (mv_ref_search[i][1] * xd->mode_info_stride);
-
-      if (block_idx >= 0) {
-        if (mv_ref_search[i][0])
-          b = 1 + y_idx * 2;
-        else
-          b = 2 + x_idx;
-      } else {
-        b = -1;
-      }
-      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, b)) {
-        add_candidate_mv(mv_ref_list, candidate_scores,
-                         &refmv_count, c_refmv, 16);
+  const MODE_INFO *candidate;
+  const int check_sub_blocks = block_idx >= 0;
+  int different_ref_found = 0;
+  int context_counter = 0;
+
+  // Blank the reference vector list
+  vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  // The nearest 2 blocks are treated differently
+  // if the size < 8x8 we get the mv from the bmi substructure,
+  // and we also need to keep a mode count.
+  for (idx = 0; idx < 2; ++idx) {
+    if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start,
+                   cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx))
+      continue;
+
+    candidate = here + mv_ref_search[idx][0]
+                + mv_ref_search[idx][1] * xd->mode_info_stride;
+
+    // Keep counts for entropy encoding.
+    context_counter += mode_2_counter[candidate->mbmi.mode];
+
+    // Check if the candidate comes from the same reference frame.
+    if (candidate->mbmi.ref_frame[0] == ref_frame) {
+      ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 0,
+                                       mv_ref_search[idx][0], block_idx));
+      different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame;
+    } else {
+      different_ref_found = 1;
+      if (candidate->mbmi.ref_frame[1] == ref_frame) {
+        // Add second motion vector if it has the same ref_frame.
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 1,
+                                         mv_ref_search[idx][0], block_idx));
       }
-
-      // Count number of neihgbours coded intra and zeromv
-      intra_count += (candidate_mi->mbmi.mode < NEARESTMV);
-      zero_count += (candidate_mi->mbmi.mode == ZEROMV);
-      newmv_count += (candidate_mi->mbmi.mode >= NEWMV);
     }
   }
 
-  // More distant neigbours
-  for (i = 2; (i < MVREF_NEIGHBOURS) &&
-              (refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
-    const int mi_search_col = mi_col + mv_ref_search[i][0];
-    const int mi_search_row = mi_row + mv_ref_search[i][1];
-    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
-        (mi_search_col < cm->cur_tile_mi_col_end) &&
-        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
-      candidate_mi = here + mv_ref_search[i][0] +
-                     (mv_ref_search[i][1] * xd->mode_info_stride);
-
-      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
-        add_candidate_mv(mv_ref_list, candidate_scores,
-                         &refmv_count, c_refmv, 16);
+  // Check the rest of the neighbors in much the same way
+  // as before except we don't need to keep track of sub blocks or
+  // mode counts.
+  for (; idx < MVREF_NEIGHBOURS; ++idx) {
+    if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start,
+                   cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx))
+      continue;
+
+    candidate = here + mv_ref_search[idx][0]
+                + mv_ref_search[idx][1] * xd->mode_info_stride;
+
+    if (candidate->mbmi.ref_frame[0] == ref_frame) {
+      ADD_MV_REF_LIST(candidate->mbmi.mv[0]);
+      different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame;
+    } else {
+      different_ref_found = 1;
+      if (candidate->mbmi.ref_frame[1] == ref_frame) {
+        ADD_MV_REF_LIST(candidate->mbmi.mv[1]);
       }
     }
   }
 
-  // Look in the last frame if it exists
-  if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
-    candidate_mi = lf_here;
-    if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
-      add_candidate_mv(mv_ref_list, candidate_scores,
-                       &refmv_count, c_refmv, 16);
+  // Check the last frame's mode and mv info.
+  if (lf_here != NULL) {
+    if (lf_here->mbmi.ref_frame[0] == ref_frame) {
+      ADD_MV_REF_LIST(lf_here->mbmi.mv[0]);
+    } else if (lf_here->mbmi.ref_frame[1] == ref_frame) {
+      ADD_MV_REF_LIST(lf_here->mbmi.mv[1]);
     }
   }
 
-  // If we have not found enough candidates consider ones where the
-  // reference frame does not match. Break out when we have
-  // MAX_MV_REF_CANDIDATES candidates.
-  // Look first at spatial neighbours
-  for (i = 0; (i < MVREF_NEIGHBOURS) &&
-              (refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
-    const int mi_search_col = mi_col + mv_ref_search[i][0];
-    const int mi_search_row = mi_row + mv_ref_search[i][1];
-    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
-        (mi_search_col < cm->cur_tile_mi_col_end) &&
-        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
-      candidate_mi = here + mv_ref_search[i][0] +
-                     (mv_ref_search[i][1] * xd->mode_info_stride);
-
-      get_non_matching_candidates(candidate_mi, ref_frame,
-                                  &c_ref_frame, &c_refmv,
-                                  &c2_ref_frame, &c2_refmv);
-
-      if (c_ref_frame != INTRA_FRAME) {
-        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
-        add_candidate_mv(mv_ref_list, candidate_scores,
-                         &refmv_count, c_refmv, 1);
-      }
+  // Since we couldn't find 2 mvs from the same reference frame
+  // go back through the neighbors and find motion vectors from
+  // different reference frames.
+  if (different_ref_found) {
+    for (idx = 0; idx < MVREF_NEIGHBOURS; ++idx) {
+      if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start,
+                     cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx))
+        continue;
 
-      if (c2_ref_frame != INTRA_FRAME) {
-        scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
-        add_candidate_mv(mv_ref_list, candidate_scores,
-                         &refmv_count, c2_refmv, 1);
-      }
-    }
-  }
+      candidate = here + mv_ref_search[idx][0]
+                  + mv_ref_search[idx][1] * xd->mode_info_stride;
 
-  // Look at the last frame if it exists
-  if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
-    candidate_mi = lf_here;
-    get_non_matching_candidates(candidate_mi, ref_frame,
-                                &c_ref_frame, &c_refmv,
-                                &c2_ref_frame, &c2_refmv);
-
-    if (c_ref_frame != INTRA_FRAME) {
-      scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
-      add_candidate_mv(mv_ref_list, candidate_scores,
-                       &refmv_count, c_refmv, 1);
-    }
+      // If the candidate is INTRA we don't want to consider its mv.
+      if (candidate->mbmi.ref_frame[0] == INTRA_FRAME)
+        continue;
 
-    if (c2_ref_frame != INTRA_FRAME) {
-      scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
-      add_candidate_mv(mv_ref_list, candidate_scores,
-                       &refmv_count, c2_refmv, 1);
+      IF_DIFF_REF_FRAME_ADD_MV(candidate);
     }
   }
 
-  if (!intra_count) {
-    if (!newmv_count) {
-      // 0 = both zero mv
-      // 1 = one zero mv + one a predicted mv
-      // 2 = two predicted mvs
-      mbmi->mb_mode_context[ref_frame] = 2 - zero_count;
-    } else {
-      // 3 = one predicted/zero and one new mv
-      // 4 = two new mvs
-      mbmi->mb_mode_context[ref_frame] = 2 + newmv_count;
-    }
-  } else {
-    // 5 = one intra neighbour + x
-    // 6 = two intra neighbours
-    mbmi->mb_mode_context[ref_frame] = 4 + intra_count;
+  // Since we still don't have a candidate we'll try the last frame.
+  if (lf_here != NULL && lf_here->mbmi.ref_frame[0] != INTRA_FRAME) {
+    IF_DIFF_REF_FRAME_ADD_MV(lf_here);
   }
 
+ Done:
+
+  mbmi->mb_mode_context[ref_frame] = counter_to_context[context_counter];
+
   // Clamp vectors
-  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
-    clamp_mv_ref(xd, &mv_ref_list[i]);
+  for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) {
+    clamp_mv_ref(xd, &mv_ref_list[idx]);
   }
 }
+
+#undef ADD_MV_REF_LIST
+#undef IF_DIFF_REF_FRAME_ADD_MV
diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h
index 7290f10ab..c5f89eb57 100644
--- a/libvpx/vp9/common/vp9_mvref_common.h
+++ b/libvpx/vp9/common/vp9_mvref_common.h
@@ -17,11 +17,13 @@
 void vp9_find_mv_refs_idx(VP9_COMMON *cm,
                           MACROBLOCKD *xd,
                           MODE_INFO *here,
-                          MODE_INFO *lf_here,
-                          MV_REFERENCE_FRAME ref_frame,
+                          const MODE_INFO *lf_here,
+                          const MV_REFERENCE_FRAME ref_frame,
                           int_mv *mv_ref_list,
-                          int *ref_sign_bias,
-                          int block_idx);
+                          const int *ref_sign_bias,
+                          const int block_idx,
+                          const int mi_row,
+                          const int mi_col);
 
 static INLINE void vp9_find_mv_refs(VP9_COMMON *cm,
                                     MACROBLOCKD *xd,
@@ -29,9 +31,10 @@ static INLINE void vp9_find_mv_refs(VP9_COMMON *cm,
                                     MODE_INFO *lf_here,
                                     MV_REFERENCE_FRAME ref_frame,
                                     int_mv *mv_ref_list,
-                                    int *ref_sign_bias) {
+                                    int *ref_sign_bias,
+                                    int mi_row, int mi_col) {
   vp9_find_mv_refs_idx(cm, xd, here, lf_here, ref_frame,
-                       mv_ref_list, ref_sign_bias, -1);
+                       mv_ref_list, ref_sign_bias, -1, mi_row, mi_col);
 }
 
 #endif  // VP9_COMMON_VP9_MVREF_COMMON_H_
diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h
index f31f24b26..152a93293 100644
--- a/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/libvpx/vp9/common/vp9_onyxc_int.h
@@ -42,7 +42,7 @@ typedef struct frame_contexts {
   vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1];
   vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS]
                          [PARTITION_TYPES - 1];
-  vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES];
+  vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
   vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1]
                                  [VP9_SWITCHABLE_FILTERS - 1];
   vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1];
@@ -59,12 +59,12 @@ typedef struct {
   unsigned int y_mode[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES];
   unsigned int uv_mode[VP9_INTRA_MODES][VP9_INTRA_MODES];
   unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
-  vp9_coeff_count_model coef[TX_SIZE_MAX_SB][BLOCK_TYPES];
-  unsigned int eob_branch[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES]
+  vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES];
+  unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES]
                          [COEF_BANDS][PREV_COEF_CONTEXTS];
   unsigned int switchable_interp[VP9_SWITCHABLE_FILTERS + 1]
                                 [VP9_SWITCHABLE_FILTERS];
-  unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2];
+  unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES];
   unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
   unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
   unsigned int single_ref[REF_CONTEXTS][2][2];
@@ -240,8 +240,7 @@ static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd,
   xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
 }
 
-static int check_bsize_coverage(VP9_COMMON *cm, MACROBLOCKD *xd,
-                                int mi_row, int mi_col,
+static int check_bsize_coverage(VP9_COMMON *cm, int mi_row, int mi_col,
                                 BLOCK_SIZE_TYPE bsize) {
   int bsl = mi_width_log2(bsize), bs = 1 << bsl;
   int ms = bs / 2;
@@ -278,14 +277,6 @@ static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd,
   xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end);
 }
 
-static int get_mi_row(const MACROBLOCKD *xd) {
-  return ((-xd->mb_to_top_edge) >> (3 + LOG2_MI_SIZE));
-}
-
-static int get_mi_col(const MACROBLOCKD *xd) {
-  return ((-xd->mb_to_left_edge) >> (3 + LOG2_MI_SIZE));
-}
-
 static int get_token_alloc(int mb_rows, int mb_cols) {
   return mb_rows * mb_cols * (48 * 16 + 4);
 }
diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c
index e8bcdea82..795962a71 100644
--- a/libvpx/vp9/common/vp9_pred_common.c
+++ b/libvpx/vp9/common/vp9_pred_common.c
@@ -55,34 +55,28 @@ unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
 }
 // Returns a context number for the given MB prediction signal
 unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
-  int pred_context;
   const MODE_INFO *const mi = xd->mode_info_context;
   const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi;
   const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi;
   const int left_in_image = xd->left_available && left_mbmi->mb_in_image;
   const int above_in_image = xd->up_available && above_mbmi->mb_in_image;
-  // Note:
-  // The mode info data structure has a one element border above and to the
-  // left of the entries correpsonding to real macroblocks.
-  // The prediction flags in these dummy entries are initialised to 0.
-  if (above_in_image && left_in_image) {  // both edges available
-    if (left_mbmi->ref_frame[0] == INTRA_FRAME &&
-        above_mbmi->ref_frame[0] == INTRA_FRAME) {  // intra/intra (3)
-      pred_context = 3;
-    } else {  // intra/inter (1) or inter/inter (0)
-      pred_context = left_mbmi->ref_frame[0] == INTRA_FRAME ||
-                     above_mbmi->ref_frame[0] == INTRA_FRAME;
-    }
-  } else if (above_in_image || left_in_image) {  // one edge available
-    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+  const int left_intra = !is_inter_block(left_mbmi);
+  const int above_intra = !is_inter_block(above_mbmi);
 
-    // inter: 0, intra: 2
-    pred_context = 2 * (edge_mbmi->ref_frame[0] == INTRA_FRAME);
-  } else {
-    pred_context = 0;
-  }
-  assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS);
-  return pred_context;
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  // 0 - inter/inter, inter/--, --/inter, --/--
+  // 1 - intra/inter, inter/intra
+  // 2 - intra/--, --/intra
+  // 3 - intra/intra
+  if (above_in_image && left_in_image)  // both edges available
+    return left_intra && above_intra ? 3
+                                     : left_intra || above_intra;
+  else if (above_in_image || left_in_image)  // one edge available
+    return 2 * (above_in_image ? above_intra : left_intra);
+  else
+    return 0;
 }
 // Returns a context number for the given MB prediction signal
 unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h
index e4b6575e3..238290b41 100644
--- a/libvpx/vp9/common/vp9_pred_common.h
+++ b/libvpx/vp9/common/vp9_pred_common.h
@@ -110,9 +110,9 @@ unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
 
 static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context,
                                     const struct tx_probs *tx_probs) {
-  if (bsize < BLOCK_SIZE_MB16X16)
+  if (bsize < BLOCK_16X16)
     return tx_probs->p8x8[context];
-  else if (bsize < BLOCK_SIZE_SB32X32)
+  else if (bsize < BLOCK_32X32)
     return tx_probs->p16x16[context];
   else
     return tx_probs->p32x32[context];
@@ -127,9 +127,9 @@ static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
 
 static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context,
                              TX_SIZE tx_size, struct tx_counts *tx_counts) {
-  if (bsize >= BLOCK_SIZE_SB32X32)
+  if (bsize >= BLOCK_32X32)
     tx_counts->p32x32[context][tx_size]++;
-  else if (bsize >= BLOCK_SIZE_MB16X16)
+  else if (bsize >= BLOCK_16X16)
     tx_counts->p16x16[context][tx_size]++;
   else
     tx_counts->p8x8[context][tx_size]++;
diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c
index 63e5646ad..0b65e0610 100644
--- a/libvpx/vp9/common/vp9_reconinter.c
+++ b/libvpx/vp9/common/vp9_reconinter.c
@@ -197,14 +197,14 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
 
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
-                               const int_mv *src_mv,
+                               const MV *src_mv,
                                const struct scale_factors *scale,
                                int w, int h, int weight,
                                const struct subpix_fn_table *subpix,
                                enum mv_precision precision) {
   const MV32 mv = precision == MV_PRECISION_Q4
-                     ? scale->scale_mv_q4(&src_mv->as_mv, scale)
-                     : scale->scale_mv_q3_to_q4(&src_mv->as_mv, scale);
+                     ? scale->scale_mv_q4(src_mv, scale)
+                     : scale->scale_mv_q3_to_q4(src_mv, scale);
   const int subpel_x = mv.col & 15;
   const int subpel_y = mv.row & 15;
 
@@ -220,45 +220,44 @@ static INLINE int round_mv_comp_q4(int value) {
   return (value < 0 ? value - 2 : value + 2) / 4;
 }
 
-static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int idx) {
-  const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[1].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[2].as_mv[idx].as_mv.row +
-                   mb->mode_info_context->bmi[3].as_mv[idx].as_mv.row;
-  return round_mv_comp_q4(temp);
+static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) {
+  int i, row_sum = 0, col_sum = 0;
+  MV avg;
+  for (i = 0; i < 4; ++i) {  // accumulate the four bmi[] vectors
+    row_sum += mi->bmi[i].as_mv[idx].as_mv.row;
+    col_sum += mi->bmi[i].as_mv[idx].as_mv.col;
+  }
+  avg.row = round_mv_comp_q4(row_sum);  // rounded sum / 4
+  avg.col = round_mv_comp_q4(col_sum);
+  return avg;
 }
 
-static int mi_mv_pred_col_q4(MACROBLOCKD *mb, int idx) {
-  const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[1].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[2].as_mv[idx].as_mv.col +
-                   mb->mode_info_context->bmi[3].as_mv[idx].as_mv.col;
-  return round_mv_comp_q4(temp);
-}
+
 
 // TODO(jkoleszar): yet another mv clamping function :-(
 MV clamp_mv_to_umv_border_sb(const MV *src_mv,
     int bwl, int bhl, int ss_x, int ss_y,
     int mb_to_left_edge, int mb_to_top_edge,
     int mb_to_right_edge, int mb_to_bottom_edge) {
-  /* If the MV points so far into the UMV border that no visible pixels
-   * are used for reconstruction, the subpel part of the MV can be
-   * discarded and the MV limited to 16 pixels with equivalent results.
-   */
+  // If the MV points so far into the UMV border that no visible pixels
+  // are used for reconstruction, the subpel part of the MV can be
+  // discarded and the MV limited to 16 pixels with equivalent results.
   const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4;
   const int spel_right = spel_left - (1 << 4);
   const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4;
   const int spel_bottom = spel_top - (1 << 4);
-  MV clamped_mv;
-
+  MV clamped_mv = {
+    src_mv->row << (1 - ss_y),
+    src_mv->col << (1 - ss_x)
+  };
   assert(ss_x <= 1);
   assert(ss_y <= 1);
-  clamped_mv.col = clamp(src_mv->col << (1 - ss_x),
-                         (mb_to_left_edge << (1 - ss_x)) - spel_left,
-                         (mb_to_right_edge << (1 - ss_x)) + spel_right);
-  clamped_mv.row = clamp(src_mv->row << (1 - ss_y),
-                         (mb_to_top_edge << (1 - ss_y)) - spel_top,
-                         (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
+
+  clamp_mv(&clamped_mv, (mb_to_left_edge << (1 - ss_x)) - spel_left,
+                        (mb_to_right_edge << (1 - ss_x)) + spel_right,
+                        (mb_to_top_edge << (1 - ss_y)) - spel_top,
+                        (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
+
   return clamped_mv;
 }
 
@@ -280,15 +279,14 @@ static void build_inter_predictors(int plane, int block,
   const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
   const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
   const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
-  const int use_second_ref = xd->mode_info_context->mbmi.ref_frame[1] > 0;
+  const MODE_INFO *const mi = xd->mode_info_context;
+  const int use_second_ref = mi->mbmi.ref_frame[1] > 0;
   int which_mv;
 
   assert(x < (4 << bwl));
   assert(y < (4 << bhl));
-  assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
-         4 << pred_w == (4 << bwl));
-  assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
-         4 << pred_h == (4 << bhl));
+  assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_w == (4 << bwl));
+  assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_h == (4 << bhl));
 
   for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
     // source
@@ -301,44 +299,30 @@ static void build_inter_predictors(int plane, int block,
     // dest
     uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;
 
-    // motion vector
-    const MV *mv;
-    MV split_chroma_mv;
-    int_mv clamped_mv;
-
-    if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
-      if (plane == 0) {
-        mv = &xd->mode_info_context->bmi[block].as_mv[which_mv].as_mv;
-      } else {
-        // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
-        // same MV (the average of the 4 luma MVs) but we could do something
-        // smarter for non-4:2:0. Just punt for now, pending the changes to get
-        // rid of SPLITMV mode entirely.
-        split_chroma_mv.row = mi_mv_pred_row_q4(xd, which_mv);
-        split_chroma_mv.col = mi_mv_pred_col_q4(xd, which_mv);
-        mv = &split_chroma_mv;
-      }
-    } else {
-      mv = &xd->mode_info_context->mbmi.mv[which_mv].as_mv;
-    }
-
-    /* TODO(jkoleszar): This clamping is done in the incorrect place for the
-     * scaling case. It needs to be done on the scaled MV, not the pre-scaling
-     * MV. Note however that it performs the subsampling aware scaling so
-     * that the result is always q4.
-     */
-    clamped_mv.as_mv = clamp_mv_to_umv_border_sb(mv, bwl, bhl,
-                                                 xd->plane[plane].subsampling_x,
-                                                 xd->plane[plane].subsampling_y,
-                                                 xd->mb_to_left_edge,
-                                                 xd->mb_to_top_edge,
-                                                 xd->mb_to_right_edge,
-                                                 xd->mb_to_bottom_edge);
+    // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
+    // same MV (the average of the 4 luma MVs) but we could do something
+    // smarter for non-4:2:0. Just punt for now, pending the changes to get
+    // rid of SPLITMV mode entirely.
+    const MV mv = mi->mbmi.sb_type < BLOCK_SIZE_SB8X8
+               ? (plane == 0 ? mi->bmi[block].as_mv[which_mv].as_mv
+                             : mi_mv_pred_q4(mi, which_mv))
+               : mi->mbmi.mv[which_mv].as_mv;
+
+    // TODO(jkoleszar): This clamping is done in the incorrect place for the
+    // scaling case. It needs to be done on the scaled MV, not the pre-scaling
+    // MV. Note however that it performs the subsampling aware scaling so
+    // that the result is always q4.
+    const MV res_mv = clamp_mv_to_umv_border_sb(&mv, bwl, bhl,
+                                                xd->plane[plane].subsampling_x,
+                                                xd->plane[plane].subsampling_y,
+                                                xd->mb_to_left_edge,
+                                                xd->mb_to_top_edge,
+                                                xd->mb_to_right_edge,
+                                                xd->mb_to_bottom_edge);
     scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);
-
     vp9_build_inter_predictor(pre, pre_stride,
                               dst, arg->dst_stride[plane],
-                              &clamped_mv, &xd->scale_factor[which_mv],
+                              &res_mv, &xd->scale_factor[which_mv],
                               4 << pred_w, 4 << pred_h, which_mv,
                               &xd->subpix, MV_PRECISION_Q4);
   }
@@ -400,7 +384,7 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
   const int ref = cm->active_ref_idx[i];
   struct scale_factors *const sf = &cm->active_ref_scale[i];
   if (ref >= NUM_YV12_BUFFERS) {
-    memset(sf, 0, sizeof(*sf));
+    vp9_zero(*sf);
   } else {
     YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref];
     vp9_setup_scale_factors_for_frame(sf,
diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h
index e37750dea..6ec7323e1 100644
--- a/libvpx/vp9/common/vp9_reconinter.h
+++ b/libvpx/vp9/common/vp9_reconinter.h
@@ -39,7 +39,7 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale,
 
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
-                               const int_mv *mv_q3,
+                               const MV *mv_q3,
                                const struct scale_factors *scale,
                                int w, int h, int do_avg,
                                const struct subpix_fn_table *subpix,
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh
index c357ef62a..6bb3cb888 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.sh
+++ b/libvpx/vp9/common/vp9_rtcd_defs.sh
@@ -7,9 +7,7 @@ cat <<EOF
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_enums.h"
 
-struct loop_filter_info;
 struct macroblockd;
-struct loop_filter_info;
 
 /* Encoder forward decls */
 struct macroblock;
@@ -22,7 +20,11 @@ EOF
 }
 forward_decls vp9_common_forward_decls
 
-[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2
+# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly.
+[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse2_x86inc=sse2 && ssse3_x86inc=ssse3
+
+# These variables are for functions that are 64 bit only ($arch is quoted so an empty value is safe).
+[ "$arch" = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3
 
 #
 # Dequant
@@ -47,7 +49,7 @@ prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, ui
 specialize vp9_d27_predictor_4x4
 
 prototype void vp9_d45_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
-specialize vp9_d45_predictor_4x4
+specialize vp9_d45_predictor_4x4 ssse3
 
 prototype void vp9_d63_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
 specialize vp9_d63_predictor_4x4
@@ -86,7 +88,7 @@ prototype void vp9_d27_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, ui
 specialize vp9_d27_predictor_8x8
 
 prototype void vp9_d45_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
-specialize vp9_d45_predictor_8x8
+specialize vp9_d45_predictor_8x8 ssse3
 
 prototype void vp9_d63_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
 specialize vp9_d63_predictor_8x8
@@ -125,7 +127,7 @@ prototype void vp9_d27_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride,
 specialize vp9_d27_predictor_16x16
 
 prototype void vp9_d45_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
-specialize vp9_d45_predictor_16x16
+specialize vp9_d45_predictor_16x16 ssse3
 
 prototype void vp9_d63_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
 specialize vp9_d63_predictor_16x16
@@ -164,7 +166,7 @@ prototype void vp9_d27_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride,
 specialize vp9_d27_predictor_32x32
 
 prototype void vp9_d45_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
-specialize vp9_d45_predictor_32x32
+specialize vp9_d45_predictor_32x32 ssse3
 
 prototype void vp9_d63_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
 specialize vp9_d63_predictor_32x32
@@ -214,7 +216,7 @@ fi
 # Loopfilter
 #
 prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
-specialize vp9_mb_lpf_vertical_edge_w sse2
+specialize vp9_mb_lpf_vertical_edge_w sse2 neon
 
 prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_mbloop_filter_vertical_edge sse2 neon
@@ -223,7 +225,7 @@ prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8
 specialize vp9_loop_filter_vertical_edge mmx neon
 
 prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w sse2
+specialize vp9_mb_lpf_horizontal_edge_w sse2 neon
 
 prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_mbloop_filter_horizontal_edge sse2 neon
@@ -265,10 +267,10 @@ specialize vp9_blend_b
 # Sub Pixel Filters
 #
 prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_copy sse2
+specialize vp9_convolve_copy $sse2_x86inc
 
 prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
-specialize vp9_convolve_avg sse2
+specialize vp9_convolve_avg $sse2_x86inc
 
 prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
 specialize vp9_convolve8 ssse3 neon
@@ -297,14 +299,17 @@ specialize vp9_short_idct4x4_1_add sse2
 prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct4x4_add sse2
 
+prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct8x8_1_add sse2
+
 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct8x8_add sse2 neon
 
 prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct10_8x8_add sse2
 
-prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_8x8
+prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_1_add sse2
 
 prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct16x16_add sse2
@@ -312,18 +317,12 @@ specialize vp9_short_idct16x16_add sse2
 prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct10_16x16_add sse2
 
-prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
-specialize vp9_short_idct1_16x16
-
 prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct32x32_add sse2
 
 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32
 
-prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct10_32x32_add
-
 prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
 specialize vp9_short_iht4x4_add sse2
 
@@ -702,12 +701,10 @@ specialize vp9_get_mb_ss mmx sse2
 # ENCODEMB INVOKE
 
 prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz"
-specialize vp9_block_error sse2
+specialize vp9_block_error $sse2_x86inc
 
 prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
-specialize vp9_subtract_block sse2
-
-[ $arch = "x86_64" ] && ssse3_x86_64=ssse3
+specialize vp9_subtract_block $sse2_x86inc
 
 prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"
 specialize vp9_quantize_b $ssse3_x86_64
@@ -719,13 +716,11 @@ specialize vp9_quantize_b_32x32 $ssse3_x86_64
 # Structured Similarity (SSIM)
 #
 if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then
-    [ $arch = "x86_64" ] && sse2_on_x86_64=sse2
-
     prototype void vp9_ssim_parms_8x8 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
-    specialize vp9_ssim_parms_8x8 $sse2_on_x86_64
+    specialize vp9_ssim_parms_8x8 $sse2_x86_64
 
     prototype void vp9_ssim_parms_16x16 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr"
-    specialize vp9_ssim_parms_16x16 $sse2_on_x86_64
+    specialize vp9_ssim_parms_16x16 $sse2_x86_64
 fi
 
 # fdct functions
diff --git a/libvpx/vp9/common/vp9_treecoder.h b/libvpx/vp9/common/vp9_treecoder.h
index ebcd4116f..31182c35c 100644
--- a/libvpx/vp9/common/vp9_treecoder.h
+++ b/libvpx/vp9/common/vp9_treecoder.h
@@ -79,4 +79,22 @@ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
   return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
 }
 
+static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob,
+                                   const unsigned int ct[2],
+                                   unsigned int count_sat,  // divisor: nonzero
+                                   unsigned int max_update_factor) {
+  const unsigned int count = MIN(ct[0] + ct[1], count_sat);  // clamp total
+  const unsigned int factor = max_update_factor * count / count_sat;
+  return weighted_prob(pre_prob, prob, factor);  // factor/256 weight on prob
+}
+
+static INLINE vp9_prob merge_probs2(vp9_prob pre_prob,
+                                   const unsigned int ct[2],
+                                   unsigned int count_sat,
+                                   unsigned int max_update_factor) {
+  return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat,
+                     max_update_factor);  // new prob comes from the two counts
+}
+
+
 #endif  // VP9_COMMON_VP9_TREECODER_H_
diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index a1e14b482..8f740f412 100644
--- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -523,9 +523,9 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
   {                                                     \
      __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
       d0 = _mm_unpacklo_epi8(d0, zero); \
-      in_x = _mm_add_epi16(in_x, d0); \
-      in_x = _mm_packus_epi16(in_x, in_x); \
-      _mm_storel_epi64((__m128i *)(dest), in_x); \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); \
+      _mm_storel_epi64((__m128i *)(dest), d0); \
       dest += stride; \
   }
 
@@ -597,6 +597,27 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   RECON_AND_STORE(dest, in7);
 }
 
+void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();  // referenced by RECON_AND_STORE
+  int a;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);  // only input[0] is read
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 5);
+
+  dc_value = _mm_set1_epi16(a);  // same value in all eight 16-bit lanes
+
+  RECON_AND_STORE(dest, dc_value);  // each call: add dc_value to 8 pixels of
+  RECON_AND_STORE(dest, dc_value);  // one row, saturate to 8 bits, store,
+  RECON_AND_STORE(dest, dc_value);  // then advance dest by stride
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);
+  RECON_AND_STORE(dest, dc_value);  // 8 calls in total: rows 0-7
+}
+
 // perform 8x8 transpose
 static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
   const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
@@ -1449,6 +1470,38 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   }
 }
 
+void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+  __m128i dc_value;
+  const __m128i zero = _mm_setzero_si128();  // referenced by RECON_AND_STORE
+  int a, i;
+
+  a = dct_const_round_shift(input[0] * cospi_16_64);  // only input[0] is read
+  a = dct_const_round_shift(a * cospi_16_64);
+  a = ROUND_POWER_OF_TWO(a, 6);
+
+  dc_value = _mm_set1_epi16(a);  // same value in all eight 16-bit lanes
+
+  for (i = 0; i < 2; ++i) {  // i = 0: columns 0-7, i = 1: columns 8-15
+    RECON_AND_STORE(dest, dc_value);  // each call handles 8 pixels of one row
+    RECON_AND_STORE(dest, dc_value);  // and advances dest by stride
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);
+    RECON_AND_STORE(dest, dc_value);  // 16 calls per pass: rows 0-15
+    dest += 8 - (stride * 16);  // back to the top row, 8 pixels to the right
+  }
+}
+
 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
   __m128i tbuf[8];
   array_transpose_8x8(res0, res0);
@@ -2760,6 +2813,12 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
   }
 }
 
+#define LOAD_DQCOEFF(reg, input) \
+  {  /* aligned 128-bit load into reg, then step input past 8 int16 values */ \
+    reg = _mm_load_si128((__m128i *)(input)); \
+    input += 8; \
+  }
+
 void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
@@ -2827,48 +2886,126 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
           stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
           stp2_30, stp2_31;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  int i, j;
+  int i, j, i32;
+  __m128i zero_idx[16];
+  int zero_flag[2];
 
   // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
   for (i = 0; i < 8; i++) {
+    i32 = (i << 5);
     if (i < 4) {
       // First 1-D idct
       // Load input data.
-      in0 = _mm_load_si128((__m128i *)input);
-      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
-      in16 = _mm_load_si128((__m128i *)(input + 8 * 2));
-      in24 = _mm_load_si128((__m128i *)(input + 8 * 3));
-      in1 = _mm_load_si128((__m128i *)(input + 8 * 4));
-      in9 = _mm_load_si128((__m128i *)(input + 8 * 5));
-      in17 = _mm_load_si128((__m128i *)(input + 8 * 6));
-      in25 = _mm_load_si128((__m128i *)(input + 8 * 7));
-      in2 = _mm_load_si128((__m128i *)(input + 8 * 8));
-      in10 = _mm_load_si128((__m128i *)(input + 8 * 9));
-      in18 = _mm_load_si128((__m128i *)(input + 8 * 10));
-      in26 = _mm_load_si128((__m128i *)(input + 8 * 11));
-      in3 = _mm_load_si128((__m128i *)(input + 8 * 12));
-      in11 = _mm_load_si128((__m128i *)(input + 8 * 13));
-      in19 = _mm_load_si128((__m128i *)(input + 8 * 14));
-      in27 = _mm_load_si128((__m128i *)(input + 8 * 15));
-
-      in4 = _mm_load_si128((__m128i *)(input + 8 * 16));
-      in12 = _mm_load_si128((__m128i *)(input + 8 * 17));
-      in20 = _mm_load_si128((__m128i *)(input + 8 * 18));
-      in28 = _mm_load_si128((__m128i *)(input + 8 * 19));
-      in5 = _mm_load_si128((__m128i *)(input + 8 * 20));
-      in13 = _mm_load_si128((__m128i *)(input + 8 * 21));
-      in21 = _mm_load_si128((__m128i *)(input + 8 * 22));
-      in29 = _mm_load_si128((__m128i *)(input + 8 * 23));
-      in6 = _mm_load_si128((__m128i *)(input + 8 * 24));
-      in14 = _mm_load_si128((__m128i *)(input + 8 * 25));
-      in22 = _mm_load_si128((__m128i *)(input + 8 * 26));
-      in30 = _mm_load_si128((__m128i *)(input + 8 * 27));
-      in7 = _mm_load_si128((__m128i *)(input + 8 * 28));
-      in15 = _mm_load_si128((__m128i *)(input + 8 * 29));
-      in23 = _mm_load_si128((__m128i *)(input + 8 * 30));
-      in31 = _mm_load_si128((__m128i *)(input + 8 * 31));
-
-      input += 256;
+      LOAD_DQCOEFF(in0, input);
+      LOAD_DQCOEFF(in8, input);
+      LOAD_DQCOEFF(in16, input);
+      LOAD_DQCOEFF(in24, input);
+      LOAD_DQCOEFF(in1, input);
+      LOAD_DQCOEFF(in9, input);
+      LOAD_DQCOEFF(in17, input);
+      LOAD_DQCOEFF(in25, input);
+      LOAD_DQCOEFF(in2, input);
+      LOAD_DQCOEFF(in10, input);
+      LOAD_DQCOEFF(in18, input);
+      LOAD_DQCOEFF(in26, input);
+      LOAD_DQCOEFF(in3, input);
+      LOAD_DQCOEFF(in11, input);
+      LOAD_DQCOEFF(in19, input);
+      LOAD_DQCOEFF(in27, input);
+
+      LOAD_DQCOEFF(in4, input);
+      LOAD_DQCOEFF(in12, input);
+      LOAD_DQCOEFF(in20, input);
+      LOAD_DQCOEFF(in28, input);
+      LOAD_DQCOEFF(in5, input);
+      LOAD_DQCOEFF(in13, input);
+      LOAD_DQCOEFF(in21, input);
+      LOAD_DQCOEFF(in29, input);
+      LOAD_DQCOEFF(in6, input);
+      LOAD_DQCOEFF(in14, input);
+      LOAD_DQCOEFF(in22, input);
+      LOAD_DQCOEFF(in30, input);
+      LOAD_DQCOEFF(in7, input);
+      LOAD_DQCOEFF(in15, input);
+      LOAD_DQCOEFF(in23, input);
+      LOAD_DQCOEFF(in31, input);
+
+      // checking if all entries are zero
+      zero_idx[0] = _mm_or_si128(in0, in1);
+      zero_idx[1] = _mm_or_si128(in2, in3);
+      zero_idx[2] = _mm_or_si128(in4, in5);
+      zero_idx[3] = _mm_or_si128(in6, in7);
+      zero_idx[4] = _mm_or_si128(in8, in9);
+      zero_idx[5] = _mm_or_si128(in10, in11);
+      zero_idx[6] = _mm_or_si128(in12, in13);
+      zero_idx[7] = _mm_or_si128(in14, in15);
+      zero_idx[8] = _mm_or_si128(in16, in17);
+      zero_idx[9] = _mm_or_si128(in18, in19);
+      zero_idx[10] = _mm_or_si128(in20, in21);
+      zero_idx[11] = _mm_or_si128(in22, in23);
+      zero_idx[12] = _mm_or_si128(in24, in25);
+      zero_idx[13] = _mm_or_si128(in26, in27);
+      zero_idx[14] = _mm_or_si128(in28, in29);
+      zero_idx[15] = _mm_or_si128(in30, in31);
+
+      zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+      zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+      zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+      zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+      zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+      zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+      zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+      zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
+
+      zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
+      zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
+      zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
+      zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
+      zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
+      zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
+      zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
+
+      zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
+      zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
+      zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
+      zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
+      zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
+
+      if (!zero_flag[0] && !zero_flag[1]) {
+        col[i32 + 0] = _mm_setzero_si128();
+        col[i32 + 1] = _mm_setzero_si128();
+        col[i32 + 2] = _mm_setzero_si128();
+        col[i32 + 3] = _mm_setzero_si128();
+        col[i32 + 4] = _mm_setzero_si128();
+        col[i32 + 5] = _mm_setzero_si128();
+        col[i32 + 6] = _mm_setzero_si128();
+        col[i32 + 7] = _mm_setzero_si128();
+        col[i32 + 8] = _mm_setzero_si128();
+        col[i32 + 9] = _mm_setzero_si128();
+        col[i32 + 10] = _mm_setzero_si128();
+        col[i32 + 11] = _mm_setzero_si128();
+        col[i32 + 12] = _mm_setzero_si128();
+        col[i32 + 13] = _mm_setzero_si128();
+        col[i32 + 14] = _mm_setzero_si128();
+        col[i32 + 15] = _mm_setzero_si128();
+        col[i32 + 16] = _mm_setzero_si128();
+        col[i32 + 17] = _mm_setzero_si128();
+        col[i32 + 18] = _mm_setzero_si128();
+        col[i32 + 19] = _mm_setzero_si128();
+        col[i32 + 20] = _mm_setzero_si128();
+        col[i32 + 21] = _mm_setzero_si128();
+        col[i32 + 22] = _mm_setzero_si128();
+        col[i32 + 23] = _mm_setzero_si128();
+        col[i32 + 24] = _mm_setzero_si128();
+        col[i32 + 25] = _mm_setzero_si128();
+        col[i32 + 26] = _mm_setzero_si128();
+        col[i32 + 27] = _mm_setzero_si128();
+        col[i32 + 28] = _mm_setzero_si128();
+        col[i32 + 29] = _mm_setzero_si128();
+        col[i32 + 30] = _mm_setzero_si128();
+        col[i32 + 31] = _mm_setzero_si128();
+        continue;
+      }
 
       // Transpose 32x8 block to 8x32 block
       TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
@@ -3239,38 +3376,38 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
     // final stage
     if (i < 4) {
       // 1_D: Store 32 intermediate results for each 8x32 block.
-      col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
-      col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
-      col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
-      col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
-      col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
-      col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
-      col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
-      col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
-      col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
-      col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
-      col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
-      col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
-      col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
-      col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
-      col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
-      col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
-      col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
-      col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
-      col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
-      col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
-      col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
-      col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
-      col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
-      col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
-      col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
-      col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
-      col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
-      col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
-      col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
-      col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
-      col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
-      col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+      col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+      col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+      col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+      col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+      col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+      col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+      col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+      col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+      col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+      col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+      col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+      col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+      col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+      col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+      col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+      col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+      col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+      col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+      col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+      col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+      col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+      col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+      col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+      col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+      col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+      col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+      col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+      col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+      col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+      col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+      col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+      col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
     } else {
       const __m128i zero = _mm_setzero_si128();
 
diff --git a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
index bc8ed5c1f..8ba26f310 100644
--- a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
+++ b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm
@@ -10,6 +10,31 @@
 
 %include "third_party/x86inc/x86inc.asm"
 
+SECTION_RODATA
+
+pb_1: times 16 db 1  ; 0x01 in every byte; pand with it keeps bit 0 of each byte
+pw_2: times 8 dw 2
+pb_7m1: times 8 db 7, -1
+pb_15: times 16 db 15
+
+sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7  ; sh_b*: name digits = values stored
+sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7  ; used as pshufb byte-index masks
+sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0  ; padded to 16
+sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+sh_b2w01234577: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 7, -1, 7, -1
+sh_b2w12345677: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1
+sh_b2w23456777: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1, 7, -1
+sh_b2w01234567: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1
+sh_b2w12345678: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1
+sh_b2w23456789: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1
+sh_b2w89abcdef: db 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1
+sh_b2w9abcdeff: db 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1
+sh_b2wabcdefff: db 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1, 15, -1
+sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15
+sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15
+
 SECTION .text
 
 INIT_MMX ssse3
@@ -85,3 +110,182 @@ cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left
   inc                lineq
   jnz .loop
   REP_RET
+
+INIT_MMX ssse3  ; d45 4x4: row r = 3-tap smoothed above[] shifted r pixels left
+cglobal d45_predictor_4x4, 3, 3, 4, dst, stride, above
+  movq                m0, [aboveq]            ; m0 = above[0..7]
+  pshufb              m2, m0, [sh_b23456777]  ; m2 = above[2..7,7,7]
+  pshufb              m1, m0, [sh_b01234577]  ; m1 = above[0..5,7,7]
+  pshufb              m0, [sh_b12345677]      ; m0 = above[1..7,7] (centre tap)
+  pavgb               m3, m2, m1              ; m3 = (m1 + m2 + 1) >> 1
+  pxor                m2, m1
+  pand                m2, [pb_1]              ; m2 = bit 0 of (m1 ^ m2)
+  psubb               m3, m2                  ; m3 = (m1 + m2) >> 1, rounded down
+  pavgb               m0, m3                  ; m0 = (m1 + 2*m0 + m2 + 2) >> 2
+
+  ; store 4 lines of 4 pixels; each psrlq drops the lowest byte of m0
+  movd    [dstq        ], m0                  ; row 0 = bytes 0..3
+  psrlq               m0, 8
+  movd    [dstq+strideq], m0                  ; row 1 = bytes 1..4
+  lea               dstq, [dstq+strideq*2]
+  psrlq               m0, 8
+  movd    [dstq        ], m0                  ; row 2 = bytes 2..5
+  psrlq               m0, 8
+  movd    [dstq+strideq], m0                  ; row 3 = bytes 3..6
+  RET
+
+INIT_MMX ssse3  ; d45 8x8: row r = 3-tap smoothed above[] shifted r pixels left
+cglobal d45_predictor_8x8, 3, 3, 4, dst, stride, above
+  movq                m0, [aboveq]            ; m0 = above[0..7]
+  mova                m1, [sh_b12345677]      ; m1 = shift-by-1, repeat last byte
+  DEFINE_ARGS dst, stride, stride3, line      ; above's register becomes stride3
+  lea           stride3q, [strideq*3]
+  pshufb              m2, m0, [sh_b23456777]  ; m2 = above[2..7,7,7]
+  pavgb               m3, m2, m0              ; m3 = (above[i]+above[i+2]+1) >> 1
+  pxor                m2, m0
+  pshufb              m0, m1                  ; m0 = above[1..7,7] (centre tap)
+  pand                m2, [pb_1]              ; m2 = bit 0 of the xor
+  psubb               m3, m2                  ; undo pavgb's round-up
+  pavgb               m0, m3                  ; m0 = (a + 2*b + c + 2) >> 2
+
+  ; store 4 lines; pshufb with m1 shifts m0 one pixel left between rows
+  movq  [dstq          ], m0
+  pshufb              m0, m1
+  movq  [dstq+strideq  ], m0
+  pshufb              m0, m1
+  movq  [dstq+strideq*2], m0
+  pshufb              m0, m1
+  movq  [dstq+stride3q ], m0
+  pshufb              m0, m1
+  lea               dstq, [dstq+strideq*4]
+
+  ; store next 4 lines the same way
+  movq  [dstq          ], m0
+  pshufb              m0, m1
+  movq  [dstq+strideq  ], m0
+  pshufb              m0, m1
+  movq  [dstq+strideq*2], m0
+  pshufb              m0, m1
+  movq  [dstq+stride3q ], m0
+  RET
+
+INIT_XMM ssse3  ; d45 16x16: row r = 3-tap smoothed above[] shifted r pixels left
+cglobal d45_predictor_16x16, 3, 5, 4, dst, stride, above, dst8, line
+  mova                   m0, [aboveq]         ; m0 = above[0..15], aligned load
+  DEFINE_ARGS dst, stride, stride3, dst8, line  ; above's register -> stride3
+  lea              stride3q, [strideq*3]
+  lea                 dst8q, [dstq+strideq*8] ; dst8 = start of row 8
+  mova                   m1, [sh_b123456789abcdeff]  ; shift-by-1, repeat last
+  pshufb                 m2, m0, [sh_b23456789abcdefff]  ; above[2..15,15,15]
+  pavgb                  m3, m2, m0           ; (above[i] + above[i+2] + 1) >> 1
+  pxor                   m2, m0
+  pshufb                 m0, m1               ; m0 = above[1..15,15] (centre tap)
+  pand                   m2, [pb_1]           ; bit 0 of the xor
+  psubb                  m3, m2               ; undo pavgb's round-up
+  pavgb                  m0, m3               ; m0 = (a + 2*b + c + 2) >> 2
+
+  ; each pass: 4 full rows at dst plus the left 8 pixels of the rows 8 below
+  mov                 lined, 2
+.loop:
+  mova   [dstq            ], m0               ; full 16-pixel row
+  movhps [dst8q           ], m0               ; high 8 bytes -> row + 8, cols 0-7
+  pshufb                 m0, m1               ; shift one pixel left
+  mova   [dstq +strideq   ], m0
+  movhps [dst8q+strideq   ], m0
+  pshufb                 m0, m1
+  mova   [dstq +strideq*2 ], m0
+  movhps [dst8q+strideq*2 ], m0
+  pshufb                 m0, m1
+  mova   [dstq +stride3q  ], m0
+  movhps [dst8q+stride3q  ], m0
+  pshufb                 m0, m1
+  lea                  dstq, [dstq +strideq*4]
+  lea                 dst8q, [dst8q+strideq*4]
+  dec                 lined
+  jnz .loop
+
+  ; bottom-right 8x8 block: m0 is not shifted again, so all 8 rows are equal
+  movhps [dstq          +8], m0
+  movhps [dstq+strideq  +8], m0
+  movhps [dstq+strideq*2+8], m0
+  movhps [dstq+stride3q +8], m0
+  lea                  dstq, [dstq+strideq*4]
+  movhps [dstq          +8], m0
+  movhps [dstq+strideq  +8], m0
+  movhps [dstq+strideq*2+8], m0
+  movhps [dstq+stride3q +8], m0
+  RET
+
+INIT_XMM ssse3  ; d45 32x32: row r = 3-tap smoothed above[] shifted r pixels left
+cglobal d45_predictor_32x32, 3, 5, 7, dst, stride, above, dst16, line
+  mova                   m0, [aboveq]         ; m0 = above[0..15], aligned load
+  mova                   m4, [aboveq+16]      ; m4 = above[16..31]
+  DEFINE_ARGS dst, stride, stride3, dst16, line  ; above's register -> stride3
+  lea              stride3q, [strideq*3]
+  lea                dst16q, [dstq  +strideq*8]
+  lea                dst16q, [dst16q+strideq*8]  ; dst16 = start of row 16
+  mova                   m1, [sh_b123456789abcdeff]  ; shift-by-1, repeat last
+  pshufb                 m2, m4, [sh_b23456789abcdefff]  ; above[18..31,31,31]
+  pavgb                  m3, m2, m4           ; rounded-up mean of the outer taps
+  pxor                   m2, m4
+  palignr                m5, m4, m0, 1        ; m5 = above[1..16]
+  palignr                m6, m4, m0, 2        ; m6 = above[2..17]
+  pshufb                 m4, m1               ; m4 = above[17..31,31]
+  pand                   m2, [pb_1]
+  psubb                  m3, m2               ; undo pavgb's round-up
+  pavgb                  m4, m3               ; m4 = smoothed pixels 16..31
+  pavgb                  m3, m0, m6           ; same filter for the low half
+  pxor                   m0, m6
+  pand                   m0, [pb_1]
+  psubb                  m3, m0
+  pavgb                  m5, m3               ; m5 = smoothed pixels 0..15
+
+  ; each pass: 4 full rows at dst plus the left 16 pixels of the rows 16 below
+  mov                  lined, 4
+.loop:
+  mova [dstq               ], m5              ; columns 0-15
+  mova [dstq            +16], m4              ; columns 16-31
+  mova [dst16q             ], m4              ; row + 16, columns 0-15
+  palignr                 m3, m4, m5, 1       ; next left half, 1 byte from m4
+  pshufb                  m4, m1              ; shift right half one pixel left
+  mova [dstq  +strideq     ], m3
+  mova [dstq  +strideq  +16], m4
+  mova [dst16q+strideq     ], m4
+  palignr                 m5, m4, m3, 1
+  pshufb                  m4, m1
+  mova [dstq  +strideq*2   ], m5
+  mova [dstq  +strideq*2+16], m4
+  mova [dst16q+strideq*2   ], m4
+  palignr                 m3, m4, m5, 1
+  pshufb                  m4, m1
+  mova [dstq  +stride3q    ], m3
+  mova [dstq  +stride3q +16], m4
+  mova [dst16q+stride3q    ], m4
+  palignr                 m5, m4, m3, 1
+  pshufb                  m4, m1
+  lea                  dstq, [dstq  +strideq*4]
+  lea                dst16q, [dst16q+strideq*4]
+  dec                 lined
+  jnz .loop
+
+  ; right 16 pixels of rows 16-31: m4 is not shifted again, so rows are equal
+  mova [dstq            +16], m4
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+  lea                  dstq, [dstq  +strideq*4]
+  mova [dstq            +16], m4
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+  lea                  dstq, [dstq  +strideq*4]
+  mova [dstq            +16], m4
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+  lea                  dstq, [dstq  +strideq*4]
+  mova [dstq            +16], m4
+  mova [dstq  +strideq  +16], m4
+  mova [dstq  +strideq*2+16], m4
+  mova [dstq  +stride3q +16], m4
+  RET
author	hkuang <hkuang@google.com>	2013-08-06 11:07:19 -0700
committer	Hangyu Kuang <hkuang@google.com>	2013-08-06 18:31:37 +0000
commit	f3bed9137f66ef693bd406e43b17e9a1114f1e14 (patch)
tree	cd1bea0cd923c6d125cb5b3e7b3404d7c2f70208 /libvpx/vp9/common
parent	a8b927ab4f06e2fc0d16d9606b57672df9899ac1 (diff)
download	libvpx-f3bed9137f66ef693bd406e43b17e9a1114f1e14.tar.gz