88 files changed, 3734 insertions, 2249 deletions
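The headline addition in this changeset is a worker-thread abstraction for the VP9 decoder (vp9/decoder/vp9_thread.c and vp9_thread.h), exercised by the new test/vp9_thread_test.cc further down. The following is a minimal usage sketch based only on the calls that test makes; the hook and helper names here are illustrative, not part of the library.

    #include "vp9/decoder/vp9_thread.h"

    /* Runs on the worker thread; a non-zero return value signals success. */
    static int double_hook(void *data1, void *data2) {
      const int *const in = (const int *)data1;
      int *const out = (int *)data2;
      *out = *in * 2;                    /* work done off the calling thread */
      return 1;
    }

    static int run_one_job(void) {
      VP9Worker worker;
      int in = 21, out = 0;
      int ok;

      vp9_worker_init(&worker);          /* initialize the worker object     */
      if (!vp9_worker_reset(&worker))    /* create (or re-arm) the thread    */
        return 0;

      worker.hook = double_hook;         /* function to run asynchronously   */
      worker.data1 = &in;                /* first argument passed to hook    */
      worker.data2 = &out;               /* second argument passed to hook   */

      vp9_worker_launch(&worker);        /* start the hook on the thread     */
      /* ...the calling thread is free to do other work here...              */

      ok = vp9_worker_sync(&worker);     /* block until the hook returns;    */
                                         /* 0 and worker.had_error on failure */
      vp9_worker_end(&worker);           /* join and release the thread      */
      return ok && out == 42;
    }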
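Also new is an ARM NEON implementation of the 16-wide VP9 loop filter (vp9/common/arm/neon/vp9_mb_lpf_neon.asm), hooked up as vp9_mb_lpf_horizontal_edge_w_neon and vp9_mb_lpf_vertical_edge_w_neon in the generated vp9_rtcd.h headers. The scalar sketch below restates the per-column masks that the vp9_wide_mbfilter_neon helper computes, as described by its own comments; it is illustrative only, and the library's C reference implementation remains authoritative.

    #include <stdint.h>
    #include <stdlib.h>

    /* One column of pixels across the edge: p3..p0 | q0..q3. Returns 1 when
     * the edge is quiet enough that the filter should be applied at all. */
    static int filter_mask_sketch(uint8_t limit, uint8_t blimit,
                                  uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                                  uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) {
      int m = abs(p3 - p2);                    /* largest step between neighbours */
      if (abs(p2 - p1) > m) m = abs(p2 - p1);
      if (abs(p1 - p0) > m) m = abs(p1 - p0);
      if (abs(q1 - q0) > m) m = abs(q1 - q0);
      if (abs(q2 - q1) > m) m = abs(q2 - q1);
      if (abs(q3 - q2) > m) m = abs(q3 - q2);
      /* edge activity: |p0 - q0| * 2 + |p1 - q1| / 2 compared against blimit */
      return m <= limit && 2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit;
    }

    /* "flat" selects the stronger filter: p3..q3 must all lie within 1 of
     * p0/q0. flat2 applies the same test to p7..p4 and q4..q7 to choose the
     * widest (16-tap) filter. */
    static int flat_mask_sketch(uint8_t p3, uint8_t p2, uint8_t p1, uint8_t p0,
                                uint8_t q0, uint8_t q1, uint8_t q2, uint8_t q3) {
      int m = abs(p1 - p0);
      if (abs(q1 - q0) > m) m = abs(q1 - q0);
      if (abs(p2 - p0) > m) m = abs(p2 - p0);
      if (abs(q2 - q0) > m) m = abs(q2 - q0);
      if (abs(p3 - p0) > m) m = abs(p3 - p0);
      if (abs(q3 - q0) > m) m = abs(q3 - q0);
      return m <= 1;
    }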
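Finally, in vp9/common/vp9_entropy.c (and, where this section cuts off, vp9_entropymode.c) the hand-rolled probability adaptation is collapsed into a single merge_probs() call. The patch removes the inline computation, so the helper presumably behaves roughly like the sketch below; the actual definition lives elsewhere in the tree and may differ in detail.

    /* Hedged reconstruction of merge_probs(), inferred from the inline code
     * this patch deletes (count saturation followed by a weighted blend); not
     * copied from the library. weighted_prob() and MIN() are existing libvpx
     * helpers. */
    static vp9_prob merge_probs_sketch(vp9_prob pre_prob, vp9_prob prob,
                                       const unsigned int ct[2],
                                       unsigned int count_sat,
                                       unsigned int update_factor) {
      const unsigned int count = MIN(ct[0] + ct[1], count_sat);
      const unsigned int factor = update_factor * count / count_sat;
      /* blend: (pre_prob * (256 - factor) + prob * factor + 128) >> 8 */
      return weighted_prob(pre_prob, prob, factor);
    }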
diff --git a/armv7a-neon/libvpx_srcs.txt b/armv7a-neon/libvpx_srcs.txt index 7f331c0cf..25ca5e0f8 100644 --- a/armv7a-neon/libvpx_srcs.txt +++ b/armv7a-neon/libvpx_srcs.txt @@ -208,6 +208,7 @@ vp9/common/arm/neon/vp9_convolve8_neon.asm.s vp9/common/arm/neon/vp9_convolve_neon.c vp9/common/arm/neon/vp9_dc_only_idct_add_neon.asm.s vp9/common/arm/neon/vp9_loopfilter_neon.asm.s +vp9/common/arm/neon/vp9_mb_lpf_neon.asm.s vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm.s vp9/common/generic/vp9_systemdependent.c vp9/common/vp9_alloccommon.c @@ -282,6 +283,8 @@ vp9/decoder/vp9_onyxd.h vp9/decoder/vp9_onyxd_if.c vp9/decoder/vp9_onyxd_int.h vp9/decoder/vp9_read_bit_buffer.h +vp9/decoder/vp9_thread.c +vp9/decoder/vp9_thread.h vp9/decoder/vp9_treereader.h vp9/vp9_common.mk vp9/vp9_dx_iface.c diff --git a/armv7a-neon/vp9_rtcd.h b/armv7a-neon/vp9_rtcd.h index 6e6ff717b..4ebb49773 100644 --- a/armv7a-neon/vp9_rtcd.h +++ b/armv7a-neon/vp9_rtcd.h @@ -14,9 +14,7 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_enums.h" -struct loop_filter_info; struct macroblockd; -struct loop_filter_info; /* Encoder forward decls */ struct macroblock; @@ -207,7 +205,8 @@ void vp9_add_constant_residual_32x32_neon(const int16_t diff, uint8_t *dest, int #define vp9_add_constant_residual_32x32 vp9_add_constant_residual_32x32_neon void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); -#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_c +void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_neon void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); @@ -218,7 +217,8 @@ void vp9_loop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *bl #define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_neon void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); -#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c +void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); +#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_neon void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count); @@ -273,6 +273,9 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct4x4_add vp9_short_idct4x4_add_c +void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct8x8_1_add vp9_short_idct8x8_1_add_c + void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct8x8_add vp9_short_idct8x8_add_neon @@ -280,8 +283,8 @@ void vp9_short_idct8x8_add_neon(int16_t *input, uint8_t 
*dest, int dest_stride); void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_c -void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output); -#define vp9_short_idct1_8x8 vp9_short_idct1_8x8_c +void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct16x16_1_add vp9_short_idct16x16_1_add_c void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct16x16_add vp9_short_idct16x16_add_c @@ -289,18 +292,12 @@ void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_16x16_add vp9_short_idct10_16x16_add_c -void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output); -#define vp9_short_idct1_16x16 vp9_short_idct1_16x16_c - void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct32x32_add vp9_short_idct32x32_add_c void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output); #define vp9_short_idct1_32x32 vp9_short_idct1_32x32_c -void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct10_32x32_add vp9_short_idct10_32x32_add_c - void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride, int tx_type); #define vp9_short_iht4x4_add vp9_short_iht4x4_add_c diff --git a/armv7a-neon/vpx_config.h b/armv7a-neon/vpx_config.h index 6f45f7ec7..d132e4d60 100644 --- a/armv7a-neon/vpx_config.h +++ b/armv7a-neon/vpx_config.h @@ -39,6 +39,7 @@ #define CONFIG_INSTALL_BINS 1 #define CONFIG_INSTALL_LIBS 1 #define CONFIG_INSTALL_SRCS 0 +#define CONFIG_USE_X86INC 1 #define CONFIG_DEBUG 0 #define CONFIG_GPROF 0 #define CONFIG_GCOV 0 diff --git a/armv7a/libvpx_srcs.txt b/armv7a/libvpx_srcs.txt index a929dc3ca..2ddb1bdd0 100644 --- a/armv7a/libvpx_srcs.txt +++ b/armv7a/libvpx_srcs.txt @@ -237,6 +237,8 @@ vp9/decoder/vp9_onyxd.h vp9/decoder/vp9_onyxd_if.c vp9/decoder/vp9_onyxd_int.h vp9/decoder/vp9_read_bit_buffer.h +vp9/decoder/vp9_thread.c +vp9/decoder/vp9_thread.h vp9/decoder/vp9_treereader.h vp9/vp9_common.mk vp9/vp9_dx_iface.c diff --git a/armv7a/vp9_rtcd.h b/armv7a/vp9_rtcd.h index d6b244db4..1ce24c553 100644 --- a/armv7a/vp9_rtcd.h +++ b/armv7a/vp9_rtcd.h @@ -14,9 +14,7 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_enums.h" -struct loop_filter_info; struct macroblockd; -struct loop_filter_info; /* Encoder forward decls */ struct macroblock; @@ -260,14 +258,17 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct4x4_add vp9_short_idct4x4_add_c +void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct8x8_1_add vp9_short_idct8x8_1_add_c + void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct8x8_add vp9_short_idct8x8_add_c void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_c -void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output); -#define vp9_short_idct1_8x8 vp9_short_idct1_8x8_c +void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct16x16_1_add vp9_short_idct16x16_1_add_c void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, 
int dest_stride); #define vp9_short_idct16x16_add vp9_short_idct16x16_add_c @@ -275,18 +276,12 @@ void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_16x16_add vp9_short_idct10_16x16_add_c -void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output); -#define vp9_short_idct1_16x16 vp9_short_idct1_16x16_c - void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct32x32_add vp9_short_idct32x32_add_c void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output); #define vp9_short_idct1_32x32 vp9_short_idct1_32x32_c -void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct10_32x32_add vp9_short_idct10_32x32_add_c - void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride, int tx_type); #define vp9_short_iht4x4_add vp9_short_iht4x4_add_c diff --git a/armv7a/vpx_config.h b/armv7a/vpx_config.h index be08d2a25..a330023f9 100644 --- a/armv7a/vpx_config.h +++ b/armv7a/vpx_config.h @@ -39,6 +39,7 @@ #define CONFIG_INSTALL_BINS 1 #define CONFIG_INSTALL_LIBS 1 #define CONFIG_INSTALL_SRCS 0 +#define CONFIG_USE_X86INC 1 #define CONFIG_DEBUG 0 #define CONFIG_GPROF 0 #define CONFIG_GCOV 0 diff --git a/generic/libvpx_srcs.txt b/generic/libvpx_srcs.txt index 402ac2420..055f5fb5d 100644 --- a/generic/libvpx_srcs.txt +++ b/generic/libvpx_srcs.txt @@ -197,6 +197,8 @@ vp9/decoder/vp9_onyxd.h vp9/decoder/vp9_onyxd_if.c vp9/decoder/vp9_onyxd_int.h vp9/decoder/vp9_read_bit_buffer.h +vp9/decoder/vp9_thread.c +vp9/decoder/vp9_thread.h vp9/decoder/vp9_treereader.h vp9/vp9_common.mk vp9/vp9_dx_iface.c diff --git a/generic/vp9_rtcd.h b/generic/vp9_rtcd.h index c0824cb16..2562e82c5 100644 --- a/generic/vp9_rtcd.h +++ b/generic/vp9_rtcd.h @@ -14,9 +14,7 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_enums.h" -struct loop_filter_info; struct macroblockd; -struct loop_filter_info; /* Encoder forward decls */ struct macroblock; @@ -260,14 +258,17 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct4x4_add vp9_short_idct4x4_add_c +void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct8x8_1_add vp9_short_idct8x8_1_add_c + void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct8x8_add vp9_short_idct8x8_add_c void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_c -void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output); -#define vp9_short_idct1_8x8 vp9_short_idct1_8x8_c +void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct16x16_1_add vp9_short_idct16x16_1_add_c void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct16x16_add vp9_short_idct16x16_add_c @@ -275,18 +276,12 @@ void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_16x16_add vp9_short_idct10_16x16_add_c -void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output); -#define vp9_short_idct1_16x16 vp9_short_idct1_16x16_c - void vp9_short_idct32x32_add_c(int16_t *input, 
uint8_t *dest, int dest_stride); #define vp9_short_idct32x32_add vp9_short_idct32x32_add_c void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output); #define vp9_short_idct1_32x32 vp9_short_idct1_32x32_c -void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct10_32x32_add vp9_short_idct10_32x32_add_c - void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride, int tx_type); #define vp9_short_iht4x4_add vp9_short_iht4x4_add_c diff --git a/generic/vpx_config.h b/generic/vpx_config.h index 37dcff976..4d6172b8d 100644 --- a/generic/vpx_config.h +++ b/generic/vpx_config.h @@ -39,6 +39,7 @@ #define CONFIG_INSTALL_BINS 1 #define CONFIG_INSTALL_LIBS 1 #define CONFIG_INSTALL_SRCS 0 +#define CONFIG_USE_X86INC 1 #define CONFIG_DEBUG 0 #define CONFIG_GPROF 0 #define CONFIG_GCOV 0 diff --git a/libvpx/README b/libvpx/README index 92cc0742c..d7cb11afb 100644 --- a/libvpx/README +++ b/libvpx/README @@ -1,7 +1,7 @@ vpx Multi-Format Codec SDK -README - 21 June 2012 +README - 1 August 2013 -Welcome to the WebM VP8 Codec SDK! +Welcome to the WebM VP8/VP9 Codec SDK! COMPILING THE APPLICATIONS/LIBRARIES: The build system used is similar to autotools. Building generally consists of @@ -53,33 +53,63 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv5te-android-gcc armv5te-linux-rvct armv5te-linux-gcc + armv5te-none-rvct armv6-darwin-gcc armv6-linux-rvct armv6-linux-gcc + armv6-none-rvct armv7-android-gcc + armv7-darwin-gcc armv7-linux-rvct armv7-linux-gcc + armv7-none-rvct + armv7-win32-vs11 mips32-linux-gcc ppc32-darwin8-gcc ppc32-darwin9-gcc + ppc32-linux-gcc ppc64-darwin8-gcc ppc64-darwin9-gcc ppc64-linux-gcc + sparc-solaris-gcc + x86-android-gcc x86-darwin8-gcc x86-darwin8-icc x86-darwin9-gcc x86-darwin9-icc + x86-darwin10-gcc + x86-darwin11-gcc + x86-darwin12-gcc + x86-darwin13-gcc x86-linux-gcc x86-linux-icc + x86-os2-gcc x86-solaris-gcc + x86-win32-gcc x86-win32-vs7 x86-win32-vs8 + x86-win32-vs9 + x86-win32-vs10 + x86-win32-vs11 x86_64-darwin9-gcc + x86_64-darwin10-gcc + x86_64-darwin11-gcc + x86_64-darwin12-gcc + x86_64-darwin13-gcc x86_64-linux-gcc + x86_64-linux-icc x86_64-solaris-gcc + x86_64-win64-gcc x86_64-win64-vs8 + x86_64-win64-vs9 + x86_64-win64-vs10 + x86_64-win64-vs11 universal-darwin8-gcc universal-darwin9-gcc + universal-darwin10-gcc + universal-darwin11-gcc + universal-darwin12-gcc + universal-darwin13-gcc generic-gnu The generic-gnu target, in conjunction with the CROSS environment variable, diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh index 30a61067f..e2566b0a7 100755 --- a/libvpx/build/make/configure.sh +++ b/libvpx/build/make/configure.sh @@ -1189,6 +1189,12 @@ EOF fi fi + # default use_x86inc to yes if pic is no or 64bit or we are not on darwin + echo " checking here for x86inc \"${tgt_isa}\" \"$pic\" " + if [ ${tgt_isa} = x86_64 -o ! "$pic" == "yes" -o ! 
${tgt_os:0:6} = darwin ]; then + soft_enable use_x86inc + fi + # Position Independent Code (PIC) support, for building relocatable # shared objects enabled gcc && enabled pic && check_add_cflags -fPIC diff --git a/libvpx/build/make/gen_msvs_sln.sh b/libvpx/build/make/gen_msvs_sln.sh index f9fc69428..0c269b16b 100755 --- a/libvpx/build/make/gen_msvs_sln.sh +++ b/libvpx/build/make/gen_msvs_sln.sh @@ -72,15 +72,21 @@ parse_project() { eval "${var}_name=$name" eval "${var}_guid=$guid" - # assume that all projects have the same list of possible configurations, - # so overwriting old config_lists is not a problem if [ "$sfx" = "vcproj" ]; then - config_list=`grep -A1 '<Configuration' $file | + cur_config_list=`grep -A1 '<Configuration' $file | grep Name | cut -d\" -f2` else - config_list=`grep -B1 'Label="Configuration"' $file | + cur_config_list=`grep -B1 'Label="Configuration"' $file | grep Condition | cut -d\' -f4` fi + new_config_list=$(for i in $config_list $cur_config_list; do + echo $i + done | sort | uniq) + if [ "$config_list" != "" ] && [ "$config_list" != "$new_config_list" ]; then + mixed_platforms=1 + fi + config_list="$new_config_list" + eval "${var}_config_list=\"$cur_config_list\"" proj_list="${proj_list} ${var}" } @@ -130,6 +136,11 @@ process_global() { indent_push IFS_bak=${IFS} IFS=$'\r'$'\n' + if [ "$mixed_platforms" != "" ]; then + config_list=" +Release|Mixed Platforms +Debug|Mixed Platforms" + fi for config in ${config_list}; do echo "${indent}$config = $config" done @@ -144,10 +155,17 @@ process_global() { indent_push for proj in ${proj_list}; do eval "local proj_guid=\${${proj}_guid}" + eval "local proj_config_list=\${${proj}_config_list}" IFS=$'\r'$'\n' - for config in ${config_list}; do - echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}" - echo "${indent}${proj_guid}.${config}.Build.0 = ${config}" + for config in ${proj_config_list}; do + if [ "$mixed_platforms" != "" ]; then + local c=${config%%|*} + echo "${indent}${proj_guid}.${c}|Mixed Platforms.ActiveCfg = ${config}" + echo "${indent}${proj_guid}.${c}|Mixed Platforms.Build.0 = ${config}" + else + echo "${indent}${proj_guid}.${config}.ActiveCfg = ${config}" + echo "${indent}${proj_guid}.${config}.Build.0 = ${config}" + fi done IFS=${IFS_bak} diff --git a/libvpx/configure b/libvpx/configure index 3651334e2..24be893f7 100755 --- a/libvpx/configure +++ b/libvpx/configure @@ -257,6 +257,7 @@ CONFIG_LIST=" install_bins install_libs install_srcs + use_x86inc debug gprof gcov diff --git a/libvpx/libs.mk b/libvpx/libs.mk index 4aa7dc48a..233863108 100644 --- a/libvpx/libs.mk +++ b/libvpx/libs.mk @@ -57,6 +57,13 @@ CLEAN-OBJS += $$(BUILD_PFX)$(1).h RTCD += $$(BUILD_PFX)$(1).h endef +# x86inc.asm is not compatible with pic 32bit builds. 
Restrict +# files which use it to 64bit builds or 32bit without pic +USE_X86INC = no +ifeq ($(CONFIG_USE_X86INC),yes) + USE_X86INC = yes +endif + CODEC_SRCS-yes += CHANGELOG CODEC_SRCS-yes += libs.mk diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc index 3b72129cc..b1510c648 100644 --- a/libvpx/test/convolve_test.cc +++ b/libvpx/test/convolve_test.cc @@ -527,9 +527,9 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values( #if HAVE_SSSE3 const ConvolveFunctions convolve8_ssse3( - vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_c, - vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_c, - vp9_convolve8_ssse3, vp9_convolve8_avg_c); + vp9_convolve8_horiz_ssse3, vp9_convolve8_avg_horiz_ssse3, + vp9_convolve8_vert_ssse3, vp9_convolve8_avg_vert_ssse3, + vp9_convolve8_ssse3, vp9_convolve8_avg_ssse3); INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values( make_tuple(4, 4, &convolve8_ssse3), diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk index 619533a38..25e05b9fc 100644 --- a/libvpx/test/test.mk +++ b/libvpx/test/test.mk @@ -89,6 +89,7 @@ LIBVPX_TEST_SRCS-yes += tile_independence_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9) += convolve_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_thread_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc index 3e5fe8d6a..24767957f 100644 --- a/libvpx/test/vp9_subtract_test.cc +++ b/libvpx/test/vp9_subtract_test.cc @@ -39,7 +39,7 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) { ACMRandom rnd(ACMRandom::DeterministicSeed()); // FIXME(rbultje) split in its own file - for (BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_AB4X4; bsize < BLOCK_SIZE_TYPES; + for (BLOCK_SIZE_TYPE bsize = BLOCK_4X4; bsize < BLOCK_SIZE_TYPES; bsize = static_cast<BLOCK_SIZE_TYPE>(static_cast<int>(bsize) + 1)) { const int block_width = 4 << b_width_log2(bsize); const int block_height = 4 << b_height_log2(bsize); @@ -93,9 +93,8 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) { INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest, ::testing::Values(vp9_subtract_block_c)); -#if HAVE_SSE2 +#if HAVE_SSE2 && CONFIG_USE_X86INC INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest, ::testing::Values(vp9_subtract_block_sse2)); #endif - } // namespace vp9 diff --git a/libvpx/test/vp9_thread_test.cc b/libvpx/test/vp9_thread_test.cc new file mode 100644 index 000000000..41d22dd3a --- /dev/null +++ b/libvpx/test/vp9_thread_test.cc @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9/decoder/vp9_thread.h" + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/decode_test_driver.h" +#include "test/md5_helper.h" +#include "test/webm_video_source.h" + +namespace { + +class VP9WorkerThreadTest : public ::testing::Test { + protected: + virtual ~VP9WorkerThreadTest() {} + virtual void SetUp() { + vp9_worker_init(&worker_); + } + + virtual void TearDown() { + vp9_worker_end(&worker_); + } + + VP9Worker worker_; +}; + +int ThreadHook(void* data, void* return_value) { + int* const hook_data = reinterpret_cast<int*>(data); + *hook_data = 5; + return *reinterpret_cast<int*>(return_value); +} + +TEST_F(VP9WorkerThreadTest, HookSuccess) { + EXPECT_TRUE(vp9_worker_sync(&worker_)); // should be a no-op. + + for (int i = 0; i < 2; ++i) { + EXPECT_TRUE(vp9_worker_reset(&worker_)); + + int hook_data = 0; + int return_value = 1; // return successfully from the hook + worker_.hook = ThreadHook; + worker_.data1 = &hook_data; + worker_.data2 = &return_value; + + vp9_worker_launch(&worker_); + EXPECT_TRUE(vp9_worker_sync(&worker_)); + EXPECT_FALSE(worker_.had_error); + EXPECT_EQ(5, hook_data); + + EXPECT_TRUE(vp9_worker_sync(&worker_)); // should be a no-op. + } +} + +TEST_F(VP9WorkerThreadTest, HookFailure) { + EXPECT_TRUE(vp9_worker_reset(&worker_)); + + int hook_data = 0; + int return_value = 0; // return failure from the hook + worker_.hook = ThreadHook; + worker_.data1 = &hook_data; + worker_.data2 = &return_value; + + vp9_worker_launch(&worker_); + EXPECT_FALSE(vp9_worker_sync(&worker_)); + EXPECT_TRUE(worker_.had_error); + + // Ensure _reset() clears the error and _launch() can be called again. + return_value = 1; + EXPECT_TRUE(vp9_worker_reset(&worker_)); + EXPECT_FALSE(worker_.had_error); + vp9_worker_launch(&worker_); + EXPECT_TRUE(vp9_worker_sync(&worker_)); + EXPECT_FALSE(worker_.had_error); +} + +TEST(VP9DecodeMTTest, MTDecode) { + libvpx_test::WebMVideoSource video("vp90-2-03-size-226x226.webm"); + video.Init(); + + vpx_codec_dec_cfg_t cfg = {0}; + cfg.threads = 2; + libvpx_test::VP9Decoder decoder(cfg, 0); + + libvpx_test::MD5 md5; + for (video.Begin(); video.cxdata(); video.Next()) { + const vpx_codec_err_t res = + decoder.DecodeFrame(video.cxdata(), video.frame_size()); + ASSERT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError(); + + libvpx_test::DxDataIterator dec_iter = decoder.GetDxData(); + const vpx_image_t *img = NULL; + + // Get decompressed data + while ((img = dec_iter.Next())) { + md5.Add(img); + } + } + EXPECT_STREQ("b35a1b707b28e82be025d960aba039bc", md5.Get()); +} + +} // namespace diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm index 15039e267..110a56cdd 100644 --- a/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_avg_neon.asm @@ -52,15 +52,15 @@ ; sp[]int h |vp9_convolve8_avg_horiz_neon| PROC + ldr r12, [sp, #4] ; x_step_q4 + cmp r12, #16 + bne vp9_convolve8_avg_horiz_c + push {r4-r10, lr} sub r0, r0, #3 ; adjust for taps - ldr r4, [sp, #36] ; x_step_q4 ldr r5, [sp, #32] ; filter_x - cmp r4, #16 - bne call_horiz_c_convolve ; x_step_q4 != 16 - ldr r6, [sp, #48] ; w ldr r7, [sp, #52] ; h @@ -82,22 +82,22 @@ mov r10, r6 ; w loop counter loop_horiz - vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]! - vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]! + vld1.8 {d24}, [r0]! vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 - vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]! 
- vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]! + vld1.8 {d25}, [r0]! vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 - vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]! - vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]! + vld1.8 {d26}, [r0]! vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 - vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]! - vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]! + vld1.8 {d27}, [r0]! vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 + vtrn.16 q12, q13 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + ; extract to s16 vmovl.u8 q8, d24 vmovl.u8 q9, d25 @@ -128,8 +128,8 @@ loop_horiz vqrshrun.s32 d5, q15, #7 ; saturate - vqshrn.u16 d2, q1, #0 - vqshrn.u16 d3, q2, #0 + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 ; transpose vtrn.16 d2, d3 @@ -137,10 +137,7 @@ loop_horiz vtrn.8 d2, d3 ; average the new value and the dst value - vaddl.u8 q8, d2, d6 - vaddl.u8 q9, d3, d7 - vqrshrn.u16 d2, q8, #1 - vqrshrn.u16 d3, q9, #1 + vrhadd.u8 q1, q1, q3 vst1.u32 {d2[0]}, [r2], r3 vst1.u32 {d3[0]}, [r2], r3 @@ -159,26 +156,20 @@ loop_horiz pop {r4-r10, pc} -call_horiz_c_convolve - pop {r4-r10, lr} - add r0, r0, #3 ; un-adjust for taps - b vp9_convolve8_avg_horiz_c - - ENDP |vp9_convolve8_avg_vert_neon| PROC + ldr r12, [sp, #12] + cmp r12, #16 + bne vp9_convolve8_avg_vert_c + push {r4-r10, lr} ; adjust for taps sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r6, [sp, #44] ; y_step_q4 ldr r7, [sp, #40] ; filter_y - cmp r6, #16 - bne call_vert_c_convolve ; y_step_q4 != 16 - ldr r8, [sp, #48] ; w ldr r9, [sp, #52] ; h @@ -240,14 +231,11 @@ loop_vert vqrshrun.s32 d5, q15, #7 ; saturate - vqshrn.u16 d2, q1, #0 - vqshrn.u16 d3, q2, #0 + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 ; average the new value and the dst value - vaddl.u8 q8, d2, d6 - vaddl.u8 q9, d3, d7 - vqrshrn.u16 d2, q8, #1 - vqrshrn.u16 d3, q9, #1 + vrhadd.u8 q1, q1, q3 vst1.u32 {d2[0]}, [r2], r3 vst1.u32 {d2[1]}, [r2], r3 @@ -266,12 +254,5 @@ loop_vert pop {r4-r10, pc} -call_vert_c_convolve - pop {r4-r10, lr} - ; un-adjust for taps - add r0, r0, r1 - add r0, r0, r1, lsl #1 - b vp9_convolve8_avg_vert_c - ENDP END diff --git a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm index 842c73c90..845e4a866 100644 --- a/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm @@ -52,15 +52,15 @@ ; sp[]int h |vp9_convolve8_horiz_neon| PROC + ldr r12, [sp, #4] ; x_step_q4 + cmp r12, #16 + bne vp9_convolve8_horiz_c + push {r4-r10, lr} sub r0, r0, #3 ; adjust for taps - ldr r4, [sp, #36] ; x_step_q4 ldr r5, [sp, #32] ; filter_x - cmp r4, #16 - bne call_horiz_c_convolve ; x_step_q4 != 16 - ldr r6, [sp, #48] ; w ldr r7, [sp, #52] ; h @@ -82,22 +82,22 @@ mov r10, r6 ; w loop counter loop_horiz - vld4.u8 {d24[0], d25[0], d26[0], d27[0]}, [r0]! - vld4.u8 {d24[4], d25[4], d26[4], d27[4]}, [r0]! + vld1.8 {d24}, [r0]! vld3.u8 {d28[0], d29[0], d30[0]}, [r0], r9 - vld4.u8 {d24[1], d25[1], d26[1], d27[1]}, [r0]! - vld4.u8 {d24[5], d25[5], d26[5], d27[5]}, [r0]! + vld1.8 {d25}, [r0]! vld3.u8 {d28[1], d29[1], d30[1]}, [r0], r9 - vld4.u8 {d24[2], d25[2], d26[2], d27[2]}, [r0]! - vld4.u8 {d24[6], d25[6], d26[6], d27[6]}, [r0]! + vld1.8 {d26}, [r0]! vld3.u8 {d28[2], d29[2], d30[2]}, [r0], r9 - vld4.u8 {d24[3], d25[3], d26[3], d27[3]}, [r0]! - vld4.u8 {d24[7], d25[7], d26[7], d27[7]}, [r0]! + vld1.8 {d27}, [r0]! 
vld3.u8 {d28[3], d29[3], d30[3]}, [r0], r8 + vtrn.16 q12, q13 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + ; extract to s16 vmovl.u8 q8, d24 vmovl.u8 q9, d25 @@ -120,8 +120,8 @@ loop_horiz vqrshrun.s32 d5, q15, #7 ; saturate - vqshrn.u16 d2, q1, #0 - vqshrn.u16 d3, q2, #0 + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 ; transpose vtrn.16 d2, d3 @@ -145,26 +145,20 @@ loop_horiz pop {r4-r10, pc} -call_horiz_c_convolve - pop {r4-r10, lr} - add r0, r0, #3 ; un-adjust for taps - b vp9_convolve8_horiz_c - - ENDP |vp9_convolve8_vert_neon| PROC + ldr r12, [sp, #12] + cmp r12, #16 + bne vp9_convolve8_vert_c + push {r4-r10, lr} ; adjust for taps sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r6, [sp, #44] ; y_step_q4 ldr r7, [sp, #40] ; filter_y - cmp r6, #16 - bne call_vert_c_convolve ; y_step_q4 != 16 - ldr r8, [sp, #48] ; w ldr r9, [sp, #52] ; h @@ -219,8 +213,8 @@ loop_vert vqrshrun.s32 d5, q15, #7 ; saturate - vqshrn.u16 d2, q1, #0 - vqshrn.u16 d3, q2, #0 + vqmovn.u16 d2, q1 + vqmovn.u16 d3, q2 vst1.u32 {d2[0]}, [r2], r3 vst1.u32 {d2[1]}, [r2], r3 @@ -239,12 +233,5 @@ loop_vert pop {r4-r10, pc} -call_vert_c_convolve - pop {r4-r10, lr} - ; un-adjust for taps - add r0, r0, r1 - add r0, r0, r1, lsl #1 - b vp9_convolve8_vert_c - ENDP END diff --git a/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm new file mode 100644 index 000000000..edf5786e3 --- /dev/null +++ b/libvpx/vp9/common/arm/neon/vp9_mb_lpf_neon.asm @@ -0,0 +1,618 @@ +; +; Copyright (c) 2013 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + EXPORT |vp9_mb_lpf_horizontal_edge_w_neon| + EXPORT |vp9_mb_lpf_vertical_edge_w_neon| + ARM + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh +; int count) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|vp9_mb_lpf_horizontal_edge_w_neon| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + ldr r12, [sp, #92] ; load count + +h_count + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, r1, lsl #3 ; move src pointer down by 8 lines + + vld1.u8 {d0}, [r8@64], r1 ; p7 + vld1.u8 {d1}, [r8@64], r1 ; p6 + vld1.u8 {d2}, [r8@64], r1 ; p5 + vld1.u8 {d3}, [r8@64], r1 ; p4 + vld1.u8 {d4}, [r8@64], r1 ; p3 + vld1.u8 {d5}, [r8@64], r1 ; p2 + vld1.u8 {d6}, [r8@64], r1 ; p1 + vld1.u8 {d7}, [r8@64], r1 ; p0 + vld1.u8 {d8}, [r8@64], r1 ; q0 + vld1.u8 {d9}, [r8@64], r1 ; q1 + vld1.u8 {d10}, [r8@64], r1 ; q2 + vld1.u8 {d11}, [r8@64], r1 ; q3 + vld1.u8 {d12}, [r8@64], r1 ; q4 + vld1.u8 {d13}, [r8@64], r1 ; q5 + vld1.u8 {d14}, [r8@64], r1 ; q6 + vld1.u8 {d15}, [r8@64], r1 ; q7 + + bl vp9_wide_mbfilter_neon + + tst r7, #1 + beq h_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. 
+ sub r8, r0, r1, lsl #1 + + vst1.u8 {d25}, [r8@64], r1 ; store op1 + vst1.u8 {d24}, [r8@64], r1 ; store op0 + vst1.u8 {d23}, [r8@64], r1 ; store oq0 + vst1.u8 {d26}, [r8@64], r1 ; store oq1 + + b h_next + +h_mbfilter + tst r7, #2 + beq h_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. + sub r8, r0, r1, lsl #1 + sub r8, r8, r1 + + vst1.u8 {d18}, [r8@64], r1 ; store op2 + vst1.u8 {d19}, [r8@64], r1 ; store op1 + vst1.u8 {d20}, [r8@64], r1 ; store op0 + vst1.u8 {d21}, [r8@64], r1 ; store oq0 + vst1.u8 {d22}, [r8@64], r1 ; store oq1 + vst1.u8 {d23}, [r8@64], r1 ; store oq2 + + b h_next + +h_wide_mbfilter + sub r8, r0, r1, lsl #3 + add r8, r8, r1 + + vst1.u8 {d16}, [r8@64], r1 ; store op6 + vst1.u8 {d24}, [r8@64], r1 ; store op5 + vst1.u8 {d25}, [r8@64], r1 ; store op4 + vst1.u8 {d26}, [r8@64], r1 ; store op3 + vst1.u8 {d27}, [r8@64], r1 ; store op2 + vst1.u8 {d18}, [r8@64], r1 ; store op1 + vst1.u8 {d19}, [r8@64], r1 ; store op0 + vst1.u8 {d20}, [r8@64], r1 ; store oq0 + vst1.u8 {d21}, [r8@64], r1 ; store oq1 + vst1.u8 {d22}, [r8@64], r1 ; store oq2 + vst1.u8 {d23}, [r8@64], r1 ; store oq3 + vst1.u8 {d1}, [r8@64], r1 ; store oq4 + vst1.u8 {d2}, [r8@64], r1 ; store oq5 + vst1.u8 {d3}, [r8@64], r1 ; store oq6 + +h_next + add r0, r0, #8 + subs r12, r12, #1 + bne h_count + + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |vp9_mb_lpf_horizontal_edge_w_neon| + +; void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int p, +; const uint8_t *blimit, +; const uint8_t *limit, +; const uint8_t *thresh) +; r0 uint8_t *s, +; r1 int p, /* pitch */ +; r2 const uint8_t *blimit, +; r3 const uint8_t *limit, +; sp const uint8_t *thresh, +|vp9_mb_lpf_vertical_edge_w_neon| PROC + push {r4-r8, lr} + vpush {d8-d15} + ldr r4, [sp, #88] ; load thresh + + vld1.8 {d16[]}, [r2] ; load *blimit + vld1.8 {d17[]}, [r3] ; load *limit + vld1.8 {d18[]}, [r4] ; load *thresh + + sub r8, r0, #8 + + vld1.8 {d0}, [r8@64], r1 + vld1.8 {d8}, [r0@64], r1 + vld1.8 {d1}, [r8@64], r1 + vld1.8 {d9}, [r0@64], r1 + vld1.8 {d2}, [r8@64], r1 + vld1.8 {d10}, [r0@64], r1 + vld1.8 {d3}, [r8@64], r1 + vld1.8 {d11}, [r0@64], r1 + vld1.8 {d4}, [r8@64], r1 + vld1.8 {d12}, [r0@64], r1 + vld1.8 {d5}, [r8@64], r1 + vld1.8 {d13}, [r0@64], r1 + vld1.8 {d6}, [r8@64], r1 + vld1.8 {d14}, [r0@64], r1 + vld1.8 {d7}, [r8@64], r1 + vld1.8 {d15}, [r0@64], r1 + + sub r0, r0, r1, lsl #3 + + vtrn.32 q0, q2 + vtrn.32 q1, q3 + vtrn.32 q4, q6 + vtrn.32 q5, q7 + + vtrn.16 q0, q1 + vtrn.16 q2, q3 + vtrn.16 q4, q5 + vtrn.16 q6, q7 + + vtrn.8 d0, d1 + vtrn.8 d2, d3 + vtrn.8 d4, d5 + vtrn.8 d6, d7 + + vtrn.8 d8, d9 + vtrn.8 d10, d11 + vtrn.8 d12, d13 + vtrn.8 d14, d15 + + bl vp9_wide_mbfilter_neon + + tst r7, #1 + beq v_mbfilter + + ; flat && mask were not set for any of the channels. Just store the values + ; from filter. + sub r8, r0, #2 + + vswp d23, d25 + + vst4.8 {d23[0], d24[0], d25[0], d26[0]}, [r8], r1 + vst4.8 {d23[1], d24[1], d25[1], d26[1]}, [r8], r1 + vst4.8 {d23[2], d24[2], d25[2], d26[2]}, [r8], r1 + vst4.8 {d23[3], d24[3], d25[3], d26[3]}, [r8], r1 + vst4.8 {d23[4], d24[4], d25[4], d26[4]}, [r8], r1 + vst4.8 {d23[5], d24[5], d25[5], d26[5]}, [r8], r1 + vst4.8 {d23[6], d24[6], d25[6], d26[6]}, [r8], r1 + vst4.8 {d23[7], d24[7], d25[7], d26[7]}, [r8], r1 + + b v_end + +v_mbfilter + tst r7, #2 + beq v_wide_mbfilter + + ; flat2 was not set for any of the channels. Just store the values from + ; mbfilter. 
+ sub r8, r0, #3 + + vst3.8 {d18[0], d19[0], d20[0]}, [r8], r1 + vst3.8 {d21[0], d22[0], d23[0]}, [r0], r1 + vst3.8 {d18[1], d19[1], d20[1]}, [r8], r1 + vst3.8 {d21[1], d22[1], d23[1]}, [r0], r1 + vst3.8 {d18[2], d19[2], d20[2]}, [r8], r1 + vst3.8 {d21[2], d22[2], d23[2]}, [r0], r1 + vst3.8 {d18[3], d19[3], d20[3]}, [r8], r1 + vst3.8 {d21[3], d22[3], d23[3]}, [r0], r1 + vst3.8 {d18[4], d19[4], d20[4]}, [r8], r1 + vst3.8 {d21[4], d22[4], d23[4]}, [r0], r1 + vst3.8 {d18[5], d19[5], d20[5]}, [r8], r1 + vst3.8 {d21[5], d22[5], d23[5]}, [r0], r1 + vst3.8 {d18[6], d19[6], d20[6]}, [r8], r1 + vst3.8 {d21[6], d22[6], d23[6]}, [r0], r1 + vst3.8 {d18[7], d19[7], d20[7]}, [r8], r1 + vst3.8 {d21[7], d22[7], d23[7]}, [r0], r1 + + b v_end + +v_wide_mbfilter + sub r8, r0, #8 + + vtrn.32 d0, d26 + vtrn.32 d16, d27 + vtrn.32 d24, d18 + vtrn.32 d25, d19 + + vtrn.16 d0, d24 + vtrn.16 d16, d25 + vtrn.16 d26, d18 + vtrn.16 d27, d19 + + vtrn.8 d0, d16 + vtrn.8 d24, d25 + vtrn.8 d26, d27 + vtrn.8 d18, d19 + + vtrn.32 d20, d1 + vtrn.32 d21, d2 + vtrn.32 d22, d3 + vtrn.32 d23, d15 + + vtrn.16 d20, d22 + vtrn.16 d21, d23 + vtrn.16 d1, d3 + vtrn.16 d2, d15 + + vtrn.8 d20, d21 + vtrn.8 d22, d23 + vtrn.8 d1, d2 + vtrn.8 d3, d15 + + vst1.8 {d0}, [r8@64], r1 + vst1.8 {d20}, [r0@64], r1 + vst1.8 {d16}, [r8@64], r1 + vst1.8 {d21}, [r0@64], r1 + vst1.8 {d24}, [r8@64], r1 + vst1.8 {d22}, [r0@64], r1 + vst1.8 {d25}, [r8@64], r1 + vst1.8 {d23}, [r0@64], r1 + vst1.8 {d26}, [r8@64], r1 + vst1.8 {d1}, [r0@64], r1 + vst1.8 {d27}, [r8@64], r1 + vst1.8 {d2}, [r0@64], r1 + vst1.8 {d18}, [r8@64], r1 + vst1.8 {d3}, [r0@64], r1 + vst1.8 {d19}, [r8@64], r1 + vst1.8 {d15}, [r0@64], r1 + +v_end + vpop {d8-d15} + pop {r4-r8, pc} + + ENDP ; |vp9_mb_lpf_vertical_edge_w_neon| + +; void vp9_wide_mbfilter_neon(); +; This is a helper function for the loopfilters. The invidual functions do the +; necessary load, transpose (if necessary) and store. 
+; +; r0-r3 PRESERVE +; d16 blimit +; d17 limit +; d18 thresh +; d0 p7 +; d1 p6 +; d2 p5 +; d3 p4 +; d4 p3 +; d5 p2 +; d6 p1 +; d7 p0 +; d8 q0 +; d9 q1 +; d10 q2 +; d11 q3 +; d12 q4 +; d13 q5 +; d14 q6 +; d15 q7 +|vp9_wide_mbfilter_neon| PROC + mov r7, #0 + + ; filter_mask + vabd.u8 d19, d4, d5 ; abs(p3 - p2) + vabd.u8 d20, d5, d6 ; abs(p2 - p1) + vabd.u8 d21, d6, d7 ; abs(p1 - p0) + vabd.u8 d22, d9, d8 ; abs(q1 - q0) + vabd.u8 d23, d10, d9 ; abs(q2 - q1) + vabd.u8 d24, d11, d10 ; abs(q3 - q2) + + ; only compare the largest value to limit + vmax.u8 d19, d19, d20 ; max(abs(p3 - p2), abs(p2 - p1)) + vmax.u8 d20, d21, d22 ; max(abs(p1 - p0), abs(q1 - q0)) + vmax.u8 d23, d23, d24 ; max(abs(q2 - q1), abs(q3 - q2)) + vmax.u8 d19, d19, d20 + + vabd.u8 d24, d7, d8 ; abs(p0 - q0) + + vmax.u8 d19, d19, d23 + + vabd.u8 d23, d6, d9 ; a = abs(p1 - q1) + vqadd.u8 d24, d24, d24 ; b = abs(p0 - q0) * 2 + + ; abs () > limit + vcge.u8 d19, d17, d19 + + ; flatmask4 + vabd.u8 d25, d7, d5 ; abs(p0 - p2) + vabd.u8 d26, d8, d10 ; abs(q0 - q2) + vabd.u8 d27, d4, d7 ; abs(p3 - p0) + vabd.u8 d28, d11, d8 ; abs(q3 - q0) + + ; only compare the largest value to thresh + vmax.u8 d25, d25, d26 ; max(abs(p0 - p2), abs(q0 - q2)) + vmax.u8 d26, d27, d28 ; max(abs(p3 - p0), abs(q3 - q0)) + vmax.u8 d25, d25, d26 + vmax.u8 d20, d20, d25 + + vshr.u8 d23, d23, #1 ; a = a / 2 + vqadd.u8 d24, d24, d23 ; a = b + a + + vmov.u8 d30, #1 + vcge.u8 d24, d16, d24 ; (a > blimit * 2 + limit) * -1 + + vcge.u8 d20, d30, d20 ; flat + + vand d19, d19, d24 ; mask + + ; hevmask + vcgt.u8 d21, d21, d18 ; (abs(p1 - p0) > thresh)*-1 + vcgt.u8 d22, d22, d18 ; (abs(q1 - q0) > thresh)*-1 + vorr d21, d21, d22 ; hev + + vand d16, d20, d19 ; flat && mask + vmov r5, r6, d16 + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #1 ; Only do filter branch + + ; flatmask5(1, p7, p6, p5, p4, p0, q0, q4, q5, q6, q7) + vabd.u8 d22, d3, d7 ; abs(p4 - p0) + vabd.u8 d23, d12, d8 ; abs(q4 - q0) + vabd.u8 d24, d7, d2 ; abs(p0 - p5) + vabd.u8 d25, d8, d13 ; abs(q0 - q5) + vabd.u8 d26, d1, d7 ; abs(p6 - p0) + vabd.u8 d27, d14, d8 ; abs(q6 - q0) + vabd.u8 d28, d0, d7 ; abs(p7 - p0) + vabd.u8 d29, d15, d8 ; abs(q7 - q0) + + ; only compare the largest value to thresh + vmax.u8 d22, d22, d23 ; max(abs(p4 - p0), abs(q4 - q0)) + vmax.u8 d23, d24, d25 ; max(abs(p0 - p5), abs(q0 - q5)) + vmax.u8 d24, d26, d27 ; max(abs(p6 - p0), abs(q6 - q0)) + vmax.u8 d25, d28, d29 ; max(abs(p7 - p0), abs(q7 - q0)) + + vmax.u8 d26, d22, d23 + vmax.u8 d27, d24, d25 + vmax.u8 d23, d26, d27 + + vcge.u8 d18, d30, d23 ; flat2 + + vmov.u8 d22, #0x80 + + vand d17, d18, d16 ; flat2 && flat && mask + vmov r5, r6, d17 + orrs r5, r5, r6 ; Check for 0 + orreq r7, r7, #2 ; Only do mbfilter branch + + ; mbfilter() function + + ; filter() function + ; convert to signed + veor d23, d8, d22 ; qs0 + veor d24, d7, d22 ; ps0 + veor d25, d6, d22 ; ps1 + veor d26, d9, d22 ; qs1 + + vmov.u8 d27, #3 + + vsub.s8 d28, d23, d24 ; ( qs0 - ps0) + + vqsub.s8 d29, d25, d26 ; filter = clamp(ps1-qs1) + + vmull.s8 q15, d28, d27 ; 3 * ( qs0 - ps0) + + vand d29, d29, d21 ; filter &= hev + + vaddw.s8 q15, q15, d29 ; filter + 3 * (qs0 - ps0) + + vmov.u8 d29, #4 + + ; filter = clamp(filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d28, q15 + + vand d28, d28, d19 ; filter &= mask + + vqadd.s8 d30, d28, d27 ; filter2 = clamp(filter+3) + vqadd.s8 d29, d28, d29 ; filter1 = clamp(filter+4) + vshr.s8 d30, d30, #3 ; filter2 >>= 3 + vshr.s8 d29, d29, #3 ; filter1 >>= 3 + + + vqadd.s8 d24, d24, d30 ; op0 = clamp(ps0 + filter2) + vqsub.s8 d23, d23, d29 ; 
oq0 = clamp(qs0 - filter1) + + ; outer tap adjustments: ++filter1 >> 1 + vrshr.s8 d29, d29, #1 + vbic d29, d29, d21 ; filter &= ~hev + + vqadd.s8 d25, d25, d29 ; op1 = clamp(ps1 + filter) + vqsub.s8 d26, d26, d29 ; oq1 = clamp(qs1 - filter) + + veor d24, d24, d22 ; *f_op0 = u^0x80 + veor d23, d23, d22 ; *f_oq0 = u^0x80 + veor d25, d25, d22 ; *f_op1 = u^0x80 + veor d26, d26, d22 ; *f_oq1 = u^0x80 + + tst r7, #1 + bxne lr + + ; mbfilter flat && mask branch + ; TODO(fgalligan): Can I decrease the cycles shifting to consective d's + ; and using vibt on the q's? + vmov.u8 d29, #2 + vaddl.u8 q15, d7, d8 ; op2 = p0 + q0 + vmlal.u8 q15, d4, d27 ; op2 = p0 + q0 + p3 * 3 + vmlal.u8 q15, d5, d29 ; op2 = p0 + q0 + p3 * 3 + p2 * 2 + vaddw.u8 q15, d6 ; op2=p1 + p0 + q0 + p3 * 3 + p2 *2 + vqrshrn.u16 d18, q15, #3 ; r_op2 + + vsubw.u8 q15, d4 ; op1 = op2 - p3 + vsubw.u8 q15, d5 ; op1 -= p2 + vaddw.u8 q15, d6 ; op1 += p1 + vaddw.u8 q15, d9 ; op1 += q1 + vqrshrn.u16 d19, q15, #3 ; r_op1 + + vsubw.u8 q15, d4 ; op0 = op1 - p3 + vsubw.u8 q15, d6 ; op0 -= p1 + vaddw.u8 q15, d7 ; op0 += p0 + vaddw.u8 q15, d10 ; op0 += q2 + vqrshrn.u16 d20, q15, #3 ; r_op0 + + vsubw.u8 q15, d4 ; oq0 = op0 - p3 + vsubw.u8 q15, d7 ; oq0 -= p0 + vaddw.u8 q15, d8 ; oq0 += q0 + vaddw.u8 q15, d11 ; oq0 += q3 + vqrshrn.u16 d21, q15, #3 ; r_oq0 + + vsubw.u8 q15, d5 ; oq1 = oq0 - p2 + vsubw.u8 q15, d8 ; oq1 -= q0 + vaddw.u8 q15, d9 ; oq1 += q1 + vaddw.u8 q15, d11 ; oq1 += q3 + vqrshrn.u16 d22, q15, #3 ; r_oq1 + + vsubw.u8 q15, d6 ; oq2 = oq0 - p1 + vsubw.u8 q15, d9 ; oq2 -= q1 + vaddw.u8 q15, d10 ; oq2 += q2 + vaddw.u8 q15, d11 ; oq2 += q3 + vqrshrn.u16 d27, q15, #3 ; r_oq2 + + ; Filter does not set op2 or oq2, so use p2 and q2. + vbif d18, d5, d16 ; t_op2 |= p2 & ~(flat & mask) + vbif d19, d25, d16 ; t_op1 |= f_op1 & ~(flat & mask) + vbif d20, d24, d16 ; t_op0 |= f_op0 & ~(flat & mask) + vbif d21, d23, d16 ; t_oq0 |= f_oq0 & ~(flat & mask) + vbif d22, d26, d16 ; t_oq1 |= f_oq1 & ~(flat & mask) + + vbit d23, d27, d16 ; t_oq2 |= r_oq2 & (flat & mask) + vbif d23, d10, d16 ; t_oq2 |= q2 & ~(flat & mask) + + tst r7, #2 + bxne lr + + ; wide_mbfilter flat2 && flat && mask branch + vmov.u8 d16, #7 + vaddl.u8 q15, d7, d8 ; op6 = p0 + q0 + vmlal.u8 q15, d0, d16 ; op6 += p7 * 3 + vmlal.u8 q15, d1, d29 ; op6 += p6 * 2 + vaddw.u8 q15, d2 ; op6 += p5 + vaddw.u8 q15, d3 ; op6 += p4 + vaddw.u8 q15, d4 ; op6 += p3 + vaddw.u8 q15, d5 ; op6 += p2 + vaddw.u8 q15, d6 ; op6 += p1 + vqrshrn.u16 d16, q15, #4 ; w_op6 + + vsubw.u8 q15, d0 ; op5 = op6 - p7 + vsubw.u8 q15, d1 ; op5 -= p6 + vaddw.u8 q15, d2 ; op5 += p5 + vaddw.u8 q15, d9 ; op5 += q1 + vqrshrn.u16 d24, q15, #4 ; w_op5 + + vsubw.u8 q15, d0 ; op4 = op5 - p7 + vsubw.u8 q15, d2 ; op4 -= p5 + vaddw.u8 q15, d3 ; op4 += p4 + vaddw.u8 q15, d10 ; op4 += q2 + vqrshrn.u16 d25, q15, #4 ; w_op4 + + vsubw.u8 q15, d0 ; op3 = op4 - p7 + vsubw.u8 q15, d3 ; op3 -= p4 + vaddw.u8 q15, d4 ; op3 += p3 + vaddw.u8 q15, d11 ; op3 += q3 + vqrshrn.u16 d26, q15, #4 ; w_op3 + + vsubw.u8 q15, d0 ; op2 = op3 - p7 + vsubw.u8 q15, d4 ; op2 -= p3 + vaddw.u8 q15, d5 ; op2 += p2 + vaddw.u8 q15, d12 ; op2 += q4 + vqrshrn.u16 d27, q15, #4 ; w_op2 + + vbif d27, d18, d17 ; op2 |= t_op2 & ~(f2 & f & m) + + vsubw.u8 q15, d0 ; op1 = op2 - p7 + vsubw.u8 q15, d5 ; op1 -= p2 + vaddw.u8 q15, d6 ; op1 += p1 + vaddw.u8 q15, d13 ; op1 += q5 + vqrshrn.u16 d18, q15, #4 ; w_op1 + + vbif d18, d19, d17 ; op1 |= t_op1 & ~(f2 & f & m) + + vsubw.u8 q15, d0 ; op0 = op1 - p7 + vsubw.u8 q15, d6 ; op0 -= p1 + vaddw.u8 q15, d7 ; op0 += p0 + vaddw.u8 q15, d14 ; 
op0 += q6 + vqrshrn.u16 d19, q15, #4 ; w_op0 + + vbif d19, d20, d17 ; op0 |= t_op0 & ~(f2 & f & m) + + vsubw.u8 q15, d0 ; oq0 = op0 - p7 + vsubw.u8 q15, d7 ; oq0 -= p0 + vaddw.u8 q15, d8 ; oq0 += q0 + vaddw.u8 q15, d15 ; oq0 += q7 + vqrshrn.u16 d20, q15, #4 ; w_oq0 + + vbif d20, d21, d17 ; oq0 |= t_oq0 & ~(f2 & f & m) + + vsubw.u8 q15, d1 ; oq1 = oq0 - p6 + vsubw.u8 q15, d8 ; oq1 -= q0 + vaddw.u8 q15, d9 ; oq1 += q1 + vaddw.u8 q15, d15 ; oq1 += q7 + vqrshrn.u16 d21, q15, #4 ; w_oq1 + + vbif d21, d22, d17 ; oq1 |= t_oq1 & ~(f2 & f & m) + + vsubw.u8 q15, d2 ; oq2 = oq1 - p5 + vsubw.u8 q15, d9 ; oq2 -= q1 + vaddw.u8 q15, d10 ; oq2 += q2 + vaddw.u8 q15, d15 ; oq2 += q7 + vqrshrn.u16 d22, q15, #4 ; w_oq2 + + vbif d22, d23, d17 ; oq2 |= t_oq2 & ~(f2 & f & m) + + vsubw.u8 q15, d3 ; oq3 = oq2 - p4 + vsubw.u8 q15, d10 ; oq3 -= q2 + vaddw.u8 q15, d11 ; oq3 += q3 + vaddw.u8 q15, d15 ; oq3 += q7 + vqrshrn.u16 d23, q15, #4 ; w_oq3 + + vbif d16, d1, d17 ; op6 |= p6 & ~(f2 & f & m) + + vsubw.u8 q15, d4 ; oq4 = oq3 - p3 + vsubw.u8 q15, d11 ; oq4 -= q3 + vaddw.u8 q15, d12 ; oq4 += q4 + vaddw.u8 q15, d15 ; oq4 += q7 + vqrshrn.u16 d1, q15, #4 ; w_oq4 + + vbif d24, d2, d17 ; op5 |= p5 & ~(f2 & f & m) + + vsubw.u8 q15, d5 ; oq5 = oq4 - p2 + vsubw.u8 q15, d12 ; oq5 -= q4 + vaddw.u8 q15, d13 ; oq5 += q5 + vaddw.u8 q15, d15 ; oq5 += q7 + vqrshrn.u16 d2, q15, #4 ; w_oq5 + + vbif d25, d3, d17 ; op4 |= p4 & ~(f2 & f & m) + + vsubw.u8 q15, d6 ; oq6 = oq5 - p1 + vsubw.u8 q15, d13 ; oq6 -= q5 + vaddw.u8 q15, d14 ; oq6 += q6 + vaddw.u8 q15, d15 ; oq6 += q7 + vqrshrn.u16 d3, q15, #4 ; w_oq6 + + vbif d26, d4, d17 ; op3 |= p3 & ~(f2 & f & m) + vbif d23, d11, d17 ; oq3 |= q3 & ~(f2 & f & m) + vbif d1, d12, d17 ; oq4 |= q4 & ~(f2 & f & m) + vbif d2, d13, d17 ; oq5 |= q5 & ~(f2 & f & m) + vbif d3, d14, d17 ; oq6 |= q6 & ~(f2 & f & m) + + bx lr + ENDP ; |vp9_wide_mbfilter_neon| + + END diff --git a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm index 8e4aadac2..f82966577 100644 --- a/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm +++ b/libvpx/vp9/common/arm/neon/vp9_short_idct8x8_add_neon.asm @@ -22,8 +22,8 @@ MACRO IDCT8x8_1D ; stage 1 - vdup.16 d0, r3; ; duplicate cospi_28_64 - vdup.16 d1, r4; ; duplicate cospi_4_64 + vdup.16 d0, r3 ; duplicate cospi_28_64 + vdup.16 d1, r4 ; duplicate cospi_4_64 ; input[1] * cospi_28_64 vmull.s16 q2, d18, d0 @@ -57,8 +57,8 @@ vqrshrn.s32 d14, q2, #14 ; >> 14 vqrshrn.s32 d15, q3, #14 ; >> 14 - vdup.16 d0, r5; ; duplicate cospi_12_64 - vdup.16 d1, r6; ; duplicate cospi_20_64 + vdup.16 d0, r5 ; duplicate cospi_12_64 + vdup.16 d1, r6 ; duplicate cospi_20_64 ; input[5] * cospi_12_64 vmull.s16 q2, d26, d0 @@ -93,7 +93,7 @@ vqrshrn.s32 d13, q1, #14 ; >> 14 ; stage 2 & stage 3 - even half - vdup.16 d0, r7; ; duplicate cospi_16_64 + vdup.16 d0, r7 ; duplicate cospi_16_64 ; input[0] * cospi_16_64 vmull.s16 q2, d16, d0 @@ -128,8 +128,8 @@ vqrshrn.s32 d23, q3, #14 ; >> 14 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vdup.16 d0, r8; ; duplicate cospi_24_64 - vdup.16 d1, r9; ; duplicate cospi_8_64 + vdup.16 d0, r8 ; duplicate cospi_24_64 + vdup.16 d1, r9 ; duplicate cospi_8_64 ; input[1] * cospi_24_64 vmull.s16 q2, d20, d0 @@ -176,7 +176,7 @@ vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] ; stage 3 -odd half - vdup.16 d16, r7; ; duplicate cospi_16_64 + vdup.16 d16, r7 ; duplicate cospi_16_64 ; step2[6] * cospi_16_64 vmull.s16 q9, d28, d16 @@ -211,14 +211,14 @@ vqrshrn.s32 d13, q10, #14 ; >> 14 ; stage 4 - 
vadd.s16 q8, q0, q7; ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6; ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5; ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4; ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4; ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5; ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6; ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7; ; output[7] = step1[0] - step1[7]; + vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; + vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; + vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; + vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; + vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; + vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; + vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; + vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; MEND ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. @@ -310,14 +310,14 @@ mov r0, r1 ; load destination data - vld1.u8 {d0}, [r1], r2 - vld1.u8 {d1}, [r1], r2 - vld1.s16 {d2}, [r1], r2 - vld1.s16 {d3}, [r1], r2 - vld1.s16 {d4}, [r1], r2 - vld1.s16 {d5}, [r1], r2 - vld1.s16 {d6}, [r1], r2 - vld1.s16 {d7}, [r1] + vld1.64 {d0}, [r1], r2 + vld1.64 {d1}, [r1], r2 + vld1.64 {d2}, [r1], r2 + vld1.64 {d3}, [r1], r2 + vld1.64 {d4}, [r1], r2 + vld1.64 {d5}, [r1], r2 + vld1.64 {d6}, [r1], r2 + vld1.64 {d7}, [r1] ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] vaddw.u8 q8, q8, d0 diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h index 129711412..f68c5c6ea 100644 --- a/libvpx/vp9/common/vp9_blockd.h +++ b/libvpx/vp9/common/vp9_blockd.h @@ -26,9 +26,6 @@ #include "vp9/common/vp9_treecoder.h" #define BLOCK_SIZE_GROUPS 4 - -#define PREDICTION_PROBS 3 - #define MBSKIP_CONTEXTS 3 /* Segment Feature Masks */ @@ -164,6 +161,11 @@ typedef struct { union b_mode_info bmi[4]; } MODE_INFO; +static int is_inter_block(const MB_MODE_INFO *mbmi) { + return mbmi->ref_frame[0] > INTRA_FRAME; +} + + enum mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 @@ -286,22 +288,22 @@ typedef struct macroblockd { static INLINE unsigned char *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { switch (subsize) { - case BLOCK_SIZE_SB64X64: - case BLOCK_SIZE_SB64X32: - case BLOCK_SIZE_SB32X64: - case BLOCK_SIZE_SB32X32: + case BLOCK_64X64: + case BLOCK_64X32: + case BLOCK_32X64: + case BLOCK_32X32: return &xd->sb_index; - case BLOCK_SIZE_SB32X16: - case BLOCK_SIZE_SB16X32: - case BLOCK_SIZE_MB16X16: + case BLOCK_32X16: + case BLOCK_16X32: + case BLOCK_16X16: return &xd->mb_index; - case BLOCK_SIZE_SB16X8: - case BLOCK_SIZE_SB8X16: - case BLOCK_SIZE_SB8X8: + case BLOCK_16X8: + case BLOCK_8X16: + case BLOCK_8X8: return &xd->b_index; - case BLOCK_SIZE_SB8X4: - case BLOCK_SIZE_SB4X8: - case BLOCK_SIZE_AB4X4: + case BLOCK_8X4: + case BLOCK_4X8: + case BLOCK_4X4: return &xd->ab_index; default: assert(0); @@ -315,7 +317,7 @@ static INLINE void update_partition_context(MACROBLOCKD *xd, const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; const int bwl = b_width_log2(sb_type); const int bhl = b_height_log2(sb_type); - const int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl; + const int boffset = b_width_log2(BLOCK_64X64) - bsl; const char pcval0 = ~(0xe << boffset); const char pcval1 = ~(0xf << boffset); const char pcvalue[2] = {pcval0, pcval1}; @@ -333,7 +335,7 @@ static INLINE int partition_plane_context(MACROBLOCKD *xd, BLOCK_SIZE_TYPE sb_type) { int bsl 
= mi_width_log2(sb_type), bs = 1 << bsl; int above = 0, left = 0, i; - int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl; + int boffset = mi_width_log2(BLOCK_64X64) - bsl; assert(mi_width_log2(sb_type) == mi_height_log2(sb_type)); assert(bsl >= 0); @@ -366,10 +368,10 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type, if (plane_type != PLANE_TYPE_Y_WITH_DC || xd->lossless || - mbmi->ref_frame[0] != INTRA_FRAME) + is_inter_block(mbmi)) return DCT_DCT; - return mode2txfm_map[mbmi->sb_type < BLOCK_SIZE_SB8X8 ? + return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ? mi->bmi[ib].as_mode : mbmi->mode]; } @@ -496,16 +498,16 @@ static INLINE void foreach_transformed_block_in_plane( // it to 4x4 block sizes. if (xd->mb_to_right_edge < 0) max_blocks_wide += - + (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x)); + (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x)); if (xd->mb_to_bottom_edge < 0) max_blocks_high += - + (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y)); + (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y)); i = 0; // Unlike the normal case - in here we have to keep track of the // row and column of the blocks we use so that we know if we are in - // the unrestricted motion border.. + // the unrestricted motion border. for (r = 0; r < (1 << sh); r += (1 << tx_size)) { for (c = 0; c < (1 << sw); c += (1 << tx_size)) { if (r < max_blocks_high && c < max_blocks_wide) @@ -563,8 +565,8 @@ static INLINE void foreach_predicted_block_in_plane( // size of the predictor to use. int pred_w, pred_h; - if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - assert(bsize == BLOCK_SIZE_SB8X8); + if (xd->mode_info_context->mbmi.sb_type < BLOCK_8X8) { + assert(bsize == BLOCK_8X8); pred_w = 0; pred_h = 0; } else { @@ -689,46 +691,39 @@ static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block, } } static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, - int plane, int ss_tx_size, int eob, int aoff, - int loff, ENTROPY_CONTEXT *A, - ENTROPY_CONTEXT *L) { - const int bw = b_width_log2(bsize), bh = b_height_log2(bsize); - const int sw = bw - xd->plane[plane].subsampling_x; - const int sh = bh - xd->plane[plane].subsampling_y; - int mi_blocks_wide = 1 << sw; - int mi_blocks_high = 1 << sh; - int tx_size_in_blocks = (1 << ss_tx_size); + int plane, int tx_size_in_blocks, + int eob, int aoff, int loff, + ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { + struct macroblockd_plane *pd = &xd->plane[plane]; int above_contexts = tx_size_in_blocks; int left_contexts = tx_size_in_blocks; + int mi_blocks_wide = 1 << plane_block_width_log2by4(bsize, pd); + int mi_blocks_high = 1 << plane_block_height_log2by4(bsize, pd); int pt; // xd->mb_to_right_edge is in units of pixels * 8. This converts // it to 4x4 block sizes. - if (xd->mb_to_right_edge < 0) { - mi_blocks_wide += (xd->mb_to_right_edge - >> (5 + xd->plane[plane].subsampling_x)); - } + if (xd->mb_to_right_edge < 0) + mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x)); // this code attempts to avoid copying into contexts that are outside // our border. Any blocks that do are set to 0... 
if (above_contexts + aoff > mi_blocks_wide) above_contexts = mi_blocks_wide - aoff; - if (xd->mb_to_bottom_edge < 0) { - mi_blocks_high += (xd->mb_to_bottom_edge - >> (5 + xd->plane[plane].subsampling_y)); - } - if (left_contexts + loff > mi_blocks_high) { + if (xd->mb_to_bottom_edge < 0) + mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y)); + + if (left_contexts + loff > mi_blocks_high) left_contexts = mi_blocks_high - loff; - } for (pt = 0; pt < above_contexts; pt++) A[pt] = eob > 0; - for (pt = above_contexts; pt < (1 << ss_tx_size); pt++) + for (pt = above_contexts; pt < tx_size_in_blocks; pt++) A[pt] = 0; for (pt = 0; pt < left_contexts; pt++) L[pt] = eob > 0; - for (pt = left_contexts; pt < (1 << ss_tx_size); pt++) + for (pt = left_contexts; pt < tx_size_in_blocks; pt++) L[pt] = 0; } diff --git a/libvpx/vp9/common/vp9_common_data.c b/libvpx/vp9/common/vp9_common_data.c index dee44ec63..fdf37e46a 100644 --- a/libvpx/vp9/common/vp9_common_data.c +++ b/libvpx/vp9/common/vp9_common_data.c @@ -31,6 +31,14 @@ const int mi_height_log2_lookup[BLOCK_SIZE_TYPES] = const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES] = {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8}; +// MIN(3, MIN(b_width_log2(bsize), b_height_log2(bsize))) +const int size_group_lookup[BLOCK_SIZE_TYPES] = + {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3}; + +const int num_pels_log2_lookup[BLOCK_SIZE_TYPES] = + {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12}; + + const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { { // 4X4 // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 @@ -40,25 +48,25 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID }, { // 8X8 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID }, { // 16X16 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID }, { // 32X32 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID }, { // 64X64 - // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 + // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64 PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, @@ -68,29 +76,29 @@ const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES] = { const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES] = { { // PARTITION_NONE - BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB8X4, - BLOCK_SIZE_SB8X8, 
BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB16X8, - BLOCK_SIZE_MB16X16, BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB32X16, - BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64, BLOCK_SIZE_SB64X32, - BLOCK_SIZE_SB64X64, + BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, + BLOCK_8X8, BLOCK_8X16, BLOCK_16X8, + BLOCK_16X16, BLOCK_16X32, BLOCK_32X16, + BLOCK_32X32, BLOCK_32X64, BLOCK_64X32, + BLOCK_64X64, }, { // PARTITION_HORZ BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB64X32, + BLOCK_8X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_16X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_32X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_64X32, }, { // PARTITION_VERT BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB32X64, + BLOCK_4X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_8X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_16X32, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_32X64, }, { // PARTITION_SPLIT BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_AB4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_MB16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, - BLOCK_SIZE_SB32X32, + BLOCK_4X4, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_8X8, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_16X16, BLOCK_SIZE_TYPES, BLOCK_SIZE_TYPES, + BLOCK_32X32, } }; @@ -108,14 +116,9 @@ const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZE_TYPES] = { }; const BLOCK_SIZE_TYPE bsize_from_dim_lookup[5][5] = { - {BLOCK_SIZE_AB4X4, BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8, - BLOCK_SIZE_SB4X8, BLOCK_SIZE_SB4X8}, - {BLOCK_SIZE_SB8X4, BLOCK_SIZE_SB8X8, BLOCK_SIZE_SB8X16, - BLOCK_SIZE_SB8X16, BLOCK_SIZE_SB8X16}, - {BLOCK_SIZE_SB16X8, BLOCK_SIZE_SB16X8, BLOCK_SIZE_MB16X16, - BLOCK_SIZE_SB16X32, BLOCK_SIZE_SB16X32}, - {BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16, BLOCK_SIZE_SB32X16, - BLOCK_SIZE_SB32X32, BLOCK_SIZE_SB32X64}, - {BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X32, - BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X64} + { BLOCK_4X4, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8, BLOCK_4X8 }, + { BLOCK_8X4, BLOCK_8X8, BLOCK_8X16, BLOCK_8X16, BLOCK_8X16 }, + { BLOCK_16X8, BLOCK_16X8, BLOCK_16X16, BLOCK_16X32, BLOCK_16X32 }, + { BLOCK_32X16, BLOCK_32X16, BLOCK_32X16, BLOCK_32X32, BLOCK_32X64 }, + { BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X32, BLOCK_64X64 } }; diff --git a/libvpx/vp9/common/vp9_common_data.h b/libvpx/vp9/common/vp9_common_data.h index 8b0f8a500..bc8c01a77 100644 --- a/libvpx/vp9/common/vp9_common_data.h +++ b/libvpx/vp9/common/vp9_common_data.h @@ -21,10 +21,9 @@ extern const int num_8x8_blocks_wide_lookup[BLOCK_SIZE_TYPES]; extern const int num_8x8_blocks_high_lookup[BLOCK_SIZE_TYPES]; extern const int num_4x4_blocks_high_lookup[BLOCK_SIZE_TYPES]; extern const int num_4x4_blocks_wide_lookup[BLOCK_SIZE_TYPES]; -extern const PARTITION_TYPE - partition_lookup[][BLOCK_SIZE_TYPES]; - - +extern const int size_group_lookup[BLOCK_SIZE_TYPES]; +extern const int num_pels_log2_lookup[BLOCK_SIZE_TYPES]; +extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZE_TYPES]; extern const BLOCK_SIZE_TYPE subsize_lookup[PARTITION_TYPES][BLOCK_SIZE_TYPES]; extern const TX_SIZE max_txsize_lookup[BLOCK_SIZE_TYPES]; extern const TX_SIZE 
max_uv_txsize_lookup[BLOCK_SIZE_TYPES]; diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c index 0ad0dbccd..df3a9fed5 100644 --- a/libvpx/vp9/common/vp9_entropy.c +++ b/libvpx/vp9/common/vp9_entropy.c @@ -73,7 +73,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = { 13, 11, 14, 15, }; -DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]) = { +DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = { 0, 8, 1, 16, 9, 2, 17, 24, 10, 3, 18, 25, 32, 11, 4, 26, 33, 19, 40, 12, 34, 27, 5, 41, @@ -419,7 +419,7 @@ static void init_bit_trees() { init_bit_tree(cat6, 14); } -vp9_extra_bit vp9_extra_bits[12] = { +const vp9_extra_bit vp9_extra_bits[12] = { { 0, 0, 0, 0}, { 0, 0, 0, 1}, { 0, 0, 0, 2}, @@ -437,14 +437,10 @@ vp9_extra_bit vp9_extra_bits[12] = { #include "vp9/common/vp9_default_coef_probs.h" void vp9_default_coef_probs(VP9_COMMON *pc) { - vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4, - sizeof(pc->fc.coef_probs[TX_4X4])); - vpx_memcpy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8, - sizeof(pc->fc.coef_probs[TX_8X8])); - vpx_memcpy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16, - sizeof(pc->fc.coef_probs[TX_16X16])); - vpx_memcpy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32, - sizeof(pc->fc.coef_probs[TX_32X32])); + vp9_copy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4); + vp9_copy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8); + vp9_copy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16); + vp9_copy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32); } // Neighborhood 5-tuples for various scans and blocksizes, @@ -613,17 +609,17 @@ void vp9_coef_tree_initialize() { #define COEF_COUNT_SAT_AFTER_KEY 24 #define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128 -static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size, - int count_sat, int update_factor) { +static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size, + unsigned int count_sat, + unsigned int update_factor) { FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; - vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size]; - vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[txfm_size]; - vp9_coeff_count_model *coef_counts = cm->counts.coef[txfm_size]; + vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size]; + vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size]; + vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size]; unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = - cm->counts.eob_branch[txfm_size]; - int t, i, j, k, l, count; - int factor; + cm->counts.eob_branch[tx_size]; + int t, i, j, k, l; unsigned int branch_ct[UNCONSTRAINED_NODES][2]; vp9_prob coef_probs[UNCONSTRAINED_NODES]; int entropy_nodes_adapt = UNCONSTRAINED_NODES; @@ -634,29 +630,23 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size, for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { if (l >= 3 && k == 0) continue; - vp9_tree_probs_from_distribution( - vp9_coefmodel_tree, - coef_probs, branch_ct, - coef_counts[i][j][k][l], 0); + vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs, + branch_ct, coef_counts[i][j][k][l], + 0); branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); - for (t = 0; t < entropy_nodes_adapt; ++t) { - count = branch_ct[t][0] + branch_ct[t][1]; - count = count > count_sat ? 
count_sat : count; - factor = (update_factor * count / count_sat); - dst_coef_probs[i][j][k][l][t] = - weighted_prob(pre_coef_probs[i][j][k][l][t], - coef_probs[t], factor); - } + for (t = 0; t < entropy_nodes_adapt; ++t) + dst_coef_probs[i][j][k][l][t] = merge_probs( + pre_coef_probs[i][j][k][l][t], coef_probs[t], + branch_ct[t], count_sat, update_factor); } } void vp9_adapt_coef_probs(VP9_COMMON *cm) { TX_SIZE t; - int count_sat; - int update_factor; /* denominator 256 */ + unsigned int count_sat, update_factor; - if ((cm->frame_type == KEY_FRAME) || cm->intra_only) { + if (cm->frame_type == KEY_FRAME || cm->intra_only) { update_factor = COEF_MAX_UPDATE_FACTOR_KEY; count_sat = COEF_COUNT_SAT_KEY; } else if (cm->last_frame_type == KEY_FRAME) { diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h index 4ea727ff4..861c0786c 100644 --- a/libvpx/vp9/common/vp9_entropy.h +++ b/libvpx/vp9/common/vp9_entropy.h @@ -50,7 +50,7 @@ typedef struct { int base_val; } vp9_extra_bit; -extern vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */ +extern const vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */ #define MAX_PROB 255 #define DCT_MAX_VALUE 16384 @@ -80,7 +80,6 @@ extern vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */ coefficient band (and since zigzag positions 0, 1, and 2 are in distinct bands). */ -/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */ #define PREV_COEF_CONTEXTS 6 // #define ENTROPY_STATS @@ -102,7 +101,7 @@ extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]); extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]); extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]); -extern DECLARE_ALIGNED(64, const int16_t, vp9_default_scan_8x8[64]); +extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]); extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]); extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]); @@ -119,7 +118,7 @@ extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]); -extern DECLARE_ALIGNED(64, int16_t, vp9_default_iscan_8x8[64]); +extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]); extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]); extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]); diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index ca188e438..768e5f523 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -356,53 +356,15 @@ void vp9_entropy_mode_init() { vp9_inter_mode_tree, NEARESTMV); } -void vp9_accum_mv_refs(VP9_COMMON *pc, - MB_PREDICTION_MODE m, - const int context) { - unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] = - pc->counts.inter_mode; - - if (m == ZEROMV) { - ++inter_mode_counts[context][0][0]; - } else { - ++inter_mode_counts[context][0][1]; - if (m == NEARESTMV) { - ++inter_mode_counts[context][1][0]; - } else { - ++inter_mode_counts[context][1][1]; - if (m == NEARMV) { - ++inter_mode_counts[context][2][0]; - } else { - ++inter_mode_counts[context][2][1]; - } - } - } -} - #define COUNT_SAT 20 #define MAX_UPDATE_FACTOR 128 -static int update_ct(vp9_prob pre_prob, vp9_prob prob, - unsigned int ct[2]) { - const int count = MIN(ct[0] + ct[1], COUNT_SAT); - const int factor = MAX_UPDATE_FACTOR * count / COUNT_SAT; - return weighted_prob(pre_prob, prob, factor); +static int 
update_ct(vp9_prob pre_prob, vp9_prob prob, unsigned int ct[2]) { + return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); } static int update_ct2(vp9_prob pre_prob, unsigned int ct[2]) { - return update_ct(pre_prob, get_binary_prob(ct[0], ct[1]), ct); -} - -void vp9_adapt_mode_context(VP9_COMMON *pc) { - int i, j; - FRAME_CONTEXT *const fc = &pc->fc; - FRAME_CONTEXT *const pre_fc = &pc->frame_contexts[pc->frame_context_idx]; - FRAME_COUNTS *const counts = &pc->counts; - - for (j = 0; j < INTER_MODE_CONTEXTS; j++) - for (i = 0; i < VP9_INTER_MODES - 1; i++) - fc->inter_mode_probs[j][i] = update_ct2(pre_fc->inter_mode_probs[j][i], - counts->inter_mode[j][i]); + return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR); } static void update_mode_probs(int n_modes, @@ -440,6 +402,11 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]); + for (i = 0; i < INTER_MODE_CONTEXTS; i++) + update_mode_probs(VP9_INTER_MODES, vp9_inter_mode_tree, + counts->inter_mode[i], pre_fc->inter_mode_probs[i], + fc->inter_mode_probs[i], NEARESTMV); + for (i = 0; i < BLOCK_SIZE_GROUPS; i++) update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, counts->y_mode[i], pre_fc->y_mode_prob[i], @@ -466,25 +433,25 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { if (cm->tx_mode == TX_MODE_SELECT) { int j; - unsigned int branch_ct_8x8p[TX_SIZE_MAX_SB - 3][2]; - unsigned int branch_ct_16x16p[TX_SIZE_MAX_SB - 2][2]; - unsigned int branch_ct_32x32p[TX_SIZE_MAX_SB - 1][2]; + unsigned int branch_ct_8x8p[TX_SIZES - 3][2]; + unsigned int branch_ct_16x16p[TX_SIZES - 2][2]; + unsigned int branch_ct_32x32p[TX_SIZES - 1][2]; for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p); - for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) + for (j = 0; j < TX_SIZES - 3; ++j) fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]); tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p); - for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) + for (j = 0; j < TX_SIZES - 2; ++j) fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]); tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p); - for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) + for (j = 0; j < TX_SIZES - 1; ++j) fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]); } @@ -495,22 +462,24 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { counts->mbskip[i]); } -static void set_default_lf_deltas(MACROBLOCKD *xd) { - xd->lf.mode_ref_delta_enabled = 1; - xd->lf.mode_ref_delta_update = 1; +static void set_default_lf_deltas(struct loopfilter *lf) { + lf->mode_ref_delta_enabled = 1; + lf->mode_ref_delta_update = 1; - xd->lf.ref_deltas[INTRA_FRAME] = 1; - xd->lf.ref_deltas[LAST_FRAME] = 0; - xd->lf.ref_deltas[GOLDEN_FRAME] = -1; - xd->lf.ref_deltas[ALTREF_FRAME] = -1; + lf->ref_deltas[INTRA_FRAME] = 1; + lf->ref_deltas[LAST_FRAME] = 0; + lf->ref_deltas[GOLDEN_FRAME] = -1; + lf->ref_deltas[ALTREF_FRAME] = -1; - xd->lf.mode_deltas[0] = 0; - xd->lf.mode_deltas[1] = 0; + lf->mode_deltas[0] = 0; + lf->mode_deltas[1] = 0; } void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { // Reset the segment feature data to the default stats: // Features disabled, 0, with delta coding (Default state). 
+ struct loopfilter *const lf = &xd->lf; + int i; vp9_clearall_segfeatures(&xd->seg); xd->seg.abs_delta = SEGMENT_DELTADATA; @@ -518,12 +487,12 @@ void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); // Reset the mode ref deltas for loop filter - vp9_zero(xd->lf.last_ref_deltas); - vp9_zero(xd->lf.last_mode_deltas); - set_default_lf_deltas(xd); + vp9_zero(lf->last_ref_deltas); + vp9_zero(lf->last_mode_deltas); + set_default_lf_deltas(lf); // To force update of the sharpness - xd->lf.last_sharpness_level = -1; + lf->last_sharpness_level = -1; vp9_default_coef_probs(cm); vp9_init_mbmode_probs(cm); diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h index 8c14e7e17..17a7c2634 100644 --- a/libvpx/vp9/common/vp9_entropymode.h +++ b/libvpx/vp9/common/vp9_entropymode.h @@ -24,15 +24,15 @@ struct VP9Common; struct tx_probs { - vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; - vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; - vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; + vp9_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1]; + vp9_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2]; + vp9_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3]; }; struct tx_counts { - unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB]; - unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; - unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; + unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES]; + unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1]; + unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2]; }; extern const vp9_prob vp9_kf_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; @@ -61,18 +61,12 @@ extern struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; void vp9_entropy_mode_init(); -int vp9_mv_cont(const int_mv *l, const int_mv *a); - void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd); void vp9_init_mbmode_probs(struct VP9Common *x); -void vp9_adapt_mode_context(struct VP9Common *pc); - void vp9_adapt_mode_probs(struct VP9Common *); -void vp9_accum_mv_refs(struct VP9Common *pc, MB_PREDICTION_MODE m, int context); - void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, unsigned int (*ct_32x32p)[2]); void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c index 343b6241d..6cfc34697 100644 --- a/libvpx/vp9/common/vp9_entropymv.c +++ b/libvpx/vp9/common/vp9_entropymv.c @@ -16,7 +16,7 @@ #define MV_MAX_UPDATE_FACTOR 128 /* Integer pel reference mv threshold for use of high-precision 1/8 mv */ -#define COMPANDED_MVREF_THRESH 8 +#define COMPANDED_MVREF_THRESH 8 const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = { -MV_JOINT_ZERO, 2, @@ -107,12 +107,6 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { return mv_class_base(c) + offset; } -static void inc_mv_component_count(int v, nmv_component_counts *comp_counts, - int incr) { - assert (v != 0); - comp_counts->mvcount[MV_MAX + v] += incr; -} - static void inc_mv_component(int v, nmv_component_counts *comp_counts, int incr, int usehp) { int s, z, c, o, d, e, f; @@ -164,25 +158,19 @@ static void counts_to_context(nmv_component_counts *mvcomp, int usehp) { } } -void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx) { +void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) { const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); - mvctx->joints[j]++; + ++counts->joints[j]; + if 
(mv_joint_vertical(j)) - inc_mv_component_count(mv->row, &mvctx->comps[0], 1); + ++counts->comps[0].mvcount[MV_MAX + mv->row]; if (mv_joint_horizontal(j)) - inc_mv_component_count(mv->col, &mvctx->comps[1], 1); + ++counts->comps[1].mvcount[MV_MAX + mv->col]; } -static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) { - const int count = MIN(ct[0] + ct[1], MV_COUNT_SAT); - if (count) { - const vp9_prob newp = get_binary_prob(ct[0], ct[1]); - const int factor = MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT; - *dest = weighted_prob(prep, newp, factor); - } else { - *dest = prep; - } +static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) { + return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR); } void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) { @@ -195,31 +183,22 @@ static unsigned int adapt_probs(unsigned int i, vp9_prob this_probs[], const vp9_prob last_probs[], const unsigned int num_events[]) { - vp9_prob this_prob; - const uint32_t left = tree[i] <= 0 + + const unsigned int left = tree[i] <= 0 ? num_events[-tree[i]] : adapt_probs(tree[i], tree, this_probs, last_probs, num_events); - const uint32_t right = tree[i + 1] <= 0 + const unsigned int right = tree[i + 1] <= 0 ? num_events[-tree[i + 1]] : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events); - - uint32_t weight = left + right; - if (weight) { - this_prob = get_binary_prob(left, right); - weight = weight > MV_COUNT_SAT ? MV_COUNT_SAT : weight; - this_prob = weighted_prob(last_probs[i >> 1], this_prob, - MV_MAX_UPDATE_FACTOR * weight / MV_COUNT_SAT); - } else { - this_prob = last_probs[i >> 1]; - } - this_probs[i >> 1] = this_prob; + const unsigned int ct[2] = { left, right }; + this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct); return left + right; } -void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) { +void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) { int i, j; FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx]; @@ -228,36 +207,32 @@ void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) { nmv_context *pre_ctx = &pre_fc->nmvc; nmv_context_counts *cts = &cm->counts.mv; - vp9_counts_process(cts, usehp); + vp9_counts_process(cts, allow_hp); adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints); for (i = 0; i < 2; ++i) { - adapt_prob(&ctx->comps[i].sign, pre_ctx->comps[i].sign, cts->comps[i].sign); + ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign); adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes, pre_ctx->comps[i].classes, cts->comps[i].classes); adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0, pre_ctx->comps[i].class0, cts->comps[i].class0); for (j = 0; j < MV_OFFSET_BITS; ++j) - adapt_prob(&ctx->comps[i].bits[j], pre_ctx->comps[i].bits[j], - cts->comps[i].bits[j]); - } + ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j], + cts->comps[i].bits[j]); - for (i = 0; i < 2; ++i) { for (j = 0; j < CLASS0_SIZE; ++j) adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j], pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]); adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp, cts->comps[i].fp); - } - if (usehp) { - for (i = 0; i < 2; ++i) { - adapt_prob(&ctx->comps[i].class0_hp, pre_ctx->comps[i].class0_hp, - cts->comps[i].class0_hp); - adapt_prob(&ctx->comps[i].hp, pre_ctx->comps[i].hp, cts->comps[i].hp); + if (allow_hp) { + ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp, + cts->comps[i].class0_hp); + ctx->comps[i].hp = 
adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp); } } } diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h index 86f0d0bfd..3208b7270 100644 --- a/libvpx/vp9/common/vp9_enums.h +++ b/libvpx/vp9/common/vp9_enums.h @@ -54,7 +54,7 @@ typedef enum { TX_8X8 = 1, // 8x8 dct transform TX_16X16 = 2, // 16x16 dct transform TX_32X32 = 3, // 32x32 dct transform - TX_SIZE_MAX_SB, // Number of transforms available to SBs + TX_SIZES } TX_SIZE; typedef enum { @@ -63,7 +63,7 @@ typedef enum { ALLOW_16X16 = 2, ALLOW_32X32 = 3, TX_MODE_SELECT = 4, - NB_TXFM_MODES = 5, + TX_MODES = 5, } TX_MODE; typedef enum { diff --git a/libvpx/vp9/common/vp9_extend.c b/libvpx/vp9/common/vp9_extend.c index 95ec59061..d8496c4f2 100644 --- a/libvpx/vp9/common/vp9_extend.c +++ b/libvpx/vp9/common/vp9_extend.c @@ -8,9 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "vp9/common/vp9_extend.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_extend.h" + static void copy_and_extend_plane(const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch, int w, int h, @@ -107,14 +109,14 @@ void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, const int src_y_offset = srcy * src->y_stride + srcx; const int dst_y_offset = srcy * dst->y_stride + srcx; - const int et_uv = (et_y + 1) >> 1; - const int el_uv = (el_y + 1) >> 1; - const int eb_uv = (eb_y + 1) >> 1; - const int er_uv = (er_y + 1) >> 1; + const int et_uv = ROUND_POWER_OF_TWO(et_y, 1); + const int el_uv = ROUND_POWER_OF_TWO(el_y, 1); + const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1); + const int er_uv = ROUND_POWER_OF_TWO(er_y, 1); const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); - const int srch_uv = (srch + 1) >> 1; - const int srcw_uv = (srcw + 1) >> 1; + const int srch_uv = ROUND_POWER_OF_TWO(srch, 1); + const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1); copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, dst->y_buffer + dst_y_offset, dst->y_stride, diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c index 643b229a6..3af8b8d21 100644 --- a/libvpx/vp9/common/vp9_findnearmv.c +++ b/libvpx/vp9/common/vp9_findnearmv.c @@ -14,8 +14,9 @@ #include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_sadmxn.h" -static void lower_mv_precision(int_mv *mv, int usehp) { - if (!usehp || !vp9_use_mv_hp(&mv->as_mv)) { +static void lower_mv_precision(int_mv *mv, int allow_hp) { + const int use_hp = allow_hp && vp9_use_mv_hp(&mv->as_mv); + if (!use_hp) { if (mv->as_mv.row & 1) mv->as_mv.row += (mv->as_mv.row > 0 ? 
-1 : 1); if (mv->as_mv.col & 1) @@ -32,7 +33,7 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, // Make sure all the candidates are properly clamped etc for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv); - clamp_mv2(&mvlist[i], xd); + clamp_mv2(&mvlist[i].as_mv, xd); } *nearest = mvlist[0]; *near = mvlist[1]; @@ -41,7 +42,8 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, int_mv *dst_nearest, int_mv *dst_near, - int block_idx, int ref_idx) { + int block_idx, int ref_idx, + int mi_row, int mi_col) { int_mv dst_list[MAX_MV_REF_CANDIDATES]; int_mv mv_list[MAX_MV_REF_CANDIDATES]; MODE_INFO *mi = xd->mode_info_context; @@ -53,7 +55,8 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context, xd->prev_mode_info_context, mbmi->ref_frame[ref_idx], - mv_list, cm->ref_frame_sign_bias, block_idx); + mv_list, cm->ref_frame_sign_bias, block_idx, + mi_row, mi_col); dst_list[1].as_int = 0; if (block_idx == 0) { diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h index b0fa505b5..e5221ed67 100644 --- a/libvpx/vp9/common/vp9_findnearmv.h +++ b/libvpx/vp9/common/vp9_findnearmv.h @@ -29,31 +29,19 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int_mv *near); // TODO(jingning): this mv clamping function should be block size dependent. -static void clamp_mv(int_mv *mv, - int mb_to_left_edge, - int mb_to_right_edge, - int mb_to_top_edge, - int mb_to_bottom_edge) { - mv->as_mv.col = clamp(mv->as_mv.col, mb_to_left_edge, mb_to_right_edge); - mv->as_mv.row = clamp(mv->as_mv.row, mb_to_top_edge, mb_to_bottom_edge); -} - -static int clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) { - int_mv tmp_mv; - tmp_mv.as_int = mv->as_int; - clamp_mv(mv, - xd->mb_to_left_edge - LEFT_TOP_MARGIN, - xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, - xd->mb_to_top_edge - LEFT_TOP_MARGIN, - xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); - return tmp_mv.as_int != mv->as_int; +static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) { + clamp_mv(mv, xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); } void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc, MACROBLOCKD *xd, int_mv *dst_nearest, int_mv *dst_near, - int block_idx, int ref_idx); + int block_idx, int ref_idx, + int mi_row, int mi_col); static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { // FIXME(rbultje, jingning): temporary hack because jenkins doesn't @@ -62,7 +50,7 @@ static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { /* On L edge, get from MB to left of us */ --cur_mb; - if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) { + if (is_inter_block(&cur_mb->mbmi)) { return DC_PRED; } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { return ((cur_mb->bmi + 1 + b)->as_mode); @@ -80,7 +68,7 @@ static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, /* On top edge, get from MB above us */ cur_mb -= mi_stride; - if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) { + if (is_inter_block(&cur_mb->mbmi)) { return DC_PRED; } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { return ((cur_mb->bmi + 2 + b)->as_mode); diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c index a95560a55..a2245259e 100644 --- a/libvpx/vp9/common/vp9_idct.c +++ b/libvpx/vp9/common/vp9_idct.c @@ -225,6 +225,19 @@ void 
vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) { } } +void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { + int i, j; + int a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 5); + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) + dest[i] = clip_pixel(dest[i] + a1); + dest += dest_stride; + } +} + static void iadst4_1d(int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; @@ -433,12 +446,6 @@ void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, } } -void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) { - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - output[0] = ROUND_POWER_OF_TWO(out, 5); -} - static void idct16_1d(int16_t *input, int16_t *output) { int16_t step1[16], step2[16]; int temp1, temp2; @@ -857,10 +864,18 @@ void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, } } -void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) { +void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, + int dest_stride) { + int i, j; + int a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); - output[0] = ROUND_POWER_OF_TWO(out, 6); + a1 = ROUND_POWER_OF_TWO(out, 6); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) + dest[i] = clip_pixel(dest[i] + a1); + dest += dest_stride; + } } static void idct32_1d(int16_t *input, int16_t *output) { @@ -1259,29 +1274,3 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) { out = dct_const_round_shift(out * cospi_16_64); output[0] = ROUND_POWER_OF_TWO(out, 6); } - -void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t out[32 * 32] = { 0 }; - int16_t *outptr = out; - int i, j; - int16_t temp_in[32], temp_out[32]; - - // First transform rows. Since all non-zero dct coefficients are in - // upper-left 4x4 area, we only need to calculate first 4 rows here. 
- for (i = 0; i < 4; ++i) { - idct32_1d(input, outptr); - input += 32; - outptr += 32; - } - - // Columns - for (i = 0; i < 32; ++i) { - for (j = 0; j < 32; ++j) - temp_in[j] = out[j * 32 + i]; - idct32_1d(temp_in, temp_out); - for (j = 0; j < 32; ++j) - dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) - + dest[j * dest_stride + i]); - } -} diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index 5498b1717..66df62753 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -16,6 +16,12 @@ #include "vp9/common/vp9_seg_common.h" +struct loop_filter_info { + const uint8_t *mblim; + const uint8_t *lim; + const uint8_t *hev_thr; +}; + static void lf_init_lut(loop_filter_info_n *lfi) { lfi->mode_lf_lut[DC_PRED] = 0; lfi->mode_lf_lut[D45_PRED] = 0; @@ -73,13 +79,14 @@ void vp9_loop_filter_init(VP9_COMMON *cm, struct loopfilter *lf) { void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, int default_filt_lvl) { - int seg; + int seg_id; // n_shift is the a multiplier for lf_deltas // the multiplier is 1 for when filter_lvl is between 0 and 31; // 2 when filter_lvl is between 32 and 63 const int n_shift = default_filt_lvl >> 5; loop_filter_info_n *const lfi = &cm->lf_info; - struct loopfilter *lf = &xd->lf; + struct loopfilter *const lf = &xd->lf; + struct segmentation *const seg = &xd->seg; // update limits if sharpness has changed if (lf->last_sharpness_level != lf->sharpness_level) { @@ -87,13 +94,13 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, lf->last_sharpness_level = lf->sharpness_level; } - for (seg = 0; seg < MAX_SEGMENTS; seg++) { + for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) { int lvl_seg = default_filt_lvl, ref, mode, intra_lvl; // Set the baseline filter values for each segment - if (vp9_segfeature_active(&xd->seg, seg, SEG_LVL_ALT_LF)) { - const int data = vp9_get_segdata(&xd->seg, seg, SEG_LVL_ALT_LF); - lvl_seg = xd->seg.abs_delta == SEGMENT_ABSDATA + if (vp9_segfeature_active(&xd->seg, seg_id, SEG_LVL_ALT_LF)) { + const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF); + lvl_seg = seg->abs_delta == SEGMENT_ABSDATA ? 
data : clamp(default_filt_lvl + data, 0, MAX_LOOP_FILTER); } @@ -101,18 +108,18 @@ void vp9_loop_filter_frame_init(VP9_COMMON *const cm, MACROBLOCKD *const xd, if (!lf->mode_ref_delta_enabled) { // we could get rid of this if we assume that deltas are set to // zero when not in use; encoder always uses deltas - vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4); + vpx_memset(lfi->lvl[seg_id][0], lvl_seg, 4 * 4); continue; } intra_lvl = lvl_seg + (lf->ref_deltas[INTRA_FRAME] << n_shift); - lfi->lvl[seg][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); + lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER); for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) { const int inter_lvl = lvl_seg + (lf->ref_deltas[ref] << n_shift) + (lf->mode_deltas[mode] << n_shift); - lfi->lvl[seg][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); + lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER); } } } @@ -256,7 +263,7 @@ static void filter_block_plane(VP9_COMMON *const cm, // Determine the vertical edges that need filtering for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) { const int skip_this = mi[c].mbmi.mb_skip_coeff - && mi[c].mbmi.ref_frame[0] != INTRA_FRAME; + && is_inter_block(&mi[c].mbmi); // left edge of current unit is block/partition edge -> no skip const int block_edge_left = b_width_log2(mi[c].mbmi.sb_type) ? !(c & ((1 << (b_width_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1; @@ -376,3 +383,11 @@ void vp9_loop_filter_frame(VP9_COMMON *cm, MACROBLOCKD *xd, vp9_loop_filter_rows(cm->frame_to_show, cm, xd, 0, cm->mi_rows, y_only); } + +int vp9_loop_filter_worker(void *arg1, void *arg2) { + LFWorkerData *const lf_data = (LFWorkerData*)arg1; + (void)arg2; + vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, &lf_data->xd, + lf_data->start, lf_data->stop, lf_data->y_only); + return 1; +} diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index e59cc6485..5fc909495 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -35,13 +35,6 @@ typedef struct { uint8_t mode_lf_lut[MB_MODE_COUNT]; } loop_filter_info_n; -struct loop_filter_info { - const uint8_t *mblim; - const uint8_t *lim; - const uint8_t *hev_thr; -}; - - /* assorted loopfilter functions which get used elsewhere */ struct VP9Common; struct macroblockd; @@ -64,4 +57,18 @@ void vp9_loop_filter_frame(struct VP9Common *cm, void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer, struct VP9Common *cm, struct macroblockd *xd, int start, int stop, int y_only); + +typedef struct LoopFilterWorkerData { + const YV12_BUFFER_CONFIG *frame_buffer; + struct VP9Common *cm; + struct macroblockd xd; // TODO(jzern): most of this is unnecessary to the + // loopfilter. the planes are necessary as their state + // is changed during decode. + int start; + int stop; + int y_only; +} LFWorkerData; + +// Operates on the rows described by LFWorkerData passed as 'arg1'. 
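The vp9_loop_filter_worker() hook added above simply unpacks an LFWorkerData and forwards it to vp9_loop_filter_rows(). A minimal single-threaded usage sketch follows (illustrative only, not part of this change; it assumes a VP9_COMMON *cm and MACROBLOCKD *xd already configured for the frame being decoded):

  LFWorkerData lf_data;
  lf_data.frame_buffer = cm->frame_to_show;
  lf_data.cm = cm;
  lf_data.xd = *xd;            /* worker keeps its own copy of the plane state */
  lf_data.start = 0;
  lf_data.stop = cm->mi_rows;  /* filter every row in this example */
  lf_data.y_only = 0;
  vp9_loop_filter_worker(&lf_data, NULL);  /* second argument is unused */

A threaded decoder could instead split [0, mi_rows) into start/stop ranges and hand one LFWorkerData to each worker.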
+int vp9_loop_filter_worker(void *arg1, void *arg2); #endif // VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/libvpx/vp9/common/vp9_mv.h b/libvpx/vp9/common/vp9_mv.h index a095258be..31a79b984 100644 --- a/libvpx/vp9/common/vp9_mv.h +++ b/libvpx/vp9/common/vp9_mv.h @@ -13,6 +13,8 @@ #include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" + typedef struct { int16_t row; int16_t col; @@ -28,4 +30,10 @@ typedef struct { int32_t col; } MV32; +static void clamp_mv(MV *mv, int min_col, int max_col, + int min_row, int max_row) { + mv->col = clamp(mv->col, min_col, max_col); + mv->row = clamp(mv->row, min_row, max_row); +} + #endif // VP9_COMMON_VP9_MV_H_ diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c index ae009b0ff..3b72f41c2 100644 --- a/libvpx/vp9/common/vp9_mvref_common.c +++ b/libvpx/vp9/common/vp9_mvref_common.c @@ -11,6 +11,65 @@ #include "vp9/common/vp9_mvref_common.h" #define MVREF_NEIGHBOURS 8 + +typedef enum { + BOTH_ZERO = 0, + ZERO_PLUS_PREDICTED = 1, + BOTH_PREDICTED = 2, + NEW_PLUS_NON_INTRA = 3, + BOTH_NEW = 4, + INTRA_PLUS_NON_INTRA = 5, + BOTH_INTRA = 6, + INVALID_CASE = 9 +} motion_vector_context; + +// This is used to figure out a context for the ref blocks. The code flattens +// an array that would have 3 possible counts (0, 1 & 2) for 3 choices by +// adding 9 for each intra block, 3 for each zero mv and 1 for each new +// motion vector. This single number is then converted into a context +// with a single lookup ( counter_to_context ). +static const int mode_2_counter[MB_MODE_COUNT] = { + 9, // DC_PRED + 9, // V_PRED + 9, // H_PRED + 9, // D45_PRED + 9, // D135_PRED + 9, // D117_PRED + 9, // D153_PRED + 9, // D27_PRED + 9, // D63_PRED + 9, // TM_PRED + 0, // NEARESTMV + 0, // NEARMV + 3, // ZEROMV + 1, // NEWMV +}; + +// There are 3^3 different combinations of 3 counts that can be either 0,1 or +// 2. However the actual count can never be greater than 2 so the highest +// counter we need is 18. 9 is an invalid counter that's never used. +static const int counter_to_context[19] = { + BOTH_PREDICTED, // 0 + NEW_PLUS_NON_INTRA, // 1 + BOTH_NEW, // 2 + ZERO_PLUS_PREDICTED, // 3 + NEW_PLUS_NON_INTRA, // 4 + INVALID_CASE, // 5 + BOTH_ZERO, // 6 + INVALID_CASE, // 7 + INVALID_CASE, // 8 + INTRA_PLUS_NON_INTRA, // 9 + INTRA_PLUS_NON_INTRA, // 10 + INVALID_CASE, // 11 + INTRA_PLUS_NON_INTRA, // 12 + INVALID_CASE, // 13 + INVALID_CASE, // 14 + INVALID_CASE, // 15 + INVALID_CASE, // 16 + INVALID_CASE, // 17 + BOTH_INTRA // 18 +}; + static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = { // SB4X4 {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, @@ -39,263 +98,212 @@ static const int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = { // SB64X64 {{3, -1}, {-1, 3}, {4, -1}, {-1, 4}, {-1, -1}, {0, -1}, {-1, 0}, {6, -1}} }; + +static const int idx_n_column_to_subblock[4][2] = { + {1, 2}, + {1, 3}, + {3, 2}, + {3, 3} +}; + // clamp_mv_ref #define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) { - mv->as_mv.col = clamp(mv->as_mv.col, xd->mb_to_left_edge - MV_BORDER, - xd->mb_to_right_edge + MV_BORDER); - mv->as_mv.row = clamp(mv->as_mv.row, xd->mb_to_top_edge - MV_BORDER, - xd->mb_to_bottom_edge + MV_BORDER); -} - -// Gets a candidate reference motion vector from the given mode info -// structure if one exists that matches the given reference frame. 
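To make the new mode_2_counter / counter_to_context scheme above concrete, here is a small worked example (illustrative, not part of the change): with two visible neighbours, one intra DC_PRED block and one NEWMV block,

  int context_counter = 0;
  context_counter += mode_2_counter[DC_PRED];  /* +9 for the intra neighbour  */
  context_counter += mode_2_counter[NEWMV];    /* +1 for the new-mv neighbour */
  /* context_counter == 10, and counter_to_context[10] == INTRA_PLUS_NON_INTRA,
   * which is the value stored in mbmi->mb_mode_context[ref_frame]. */

so the old intra_count/zero_count/newmv_count bookkeeping collapses into a single accumulated counter and one table lookup.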
-static int get_matching_candidate(const MODE_INFO *candidate_mi, - MV_REFERENCE_FRAME ref_frame, - int_mv *c_mv, int block_idx) { - if (ref_frame == candidate_mi->mbmi.ref_frame[0]) { - if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) - c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[0].as_int; - else - c_mv->as_int = candidate_mi->mbmi.mv[0].as_int; - } else if (ref_frame == candidate_mi->mbmi.ref_frame[1]) { - if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) - c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[1].as_int; - else - c_mv->as_int = candidate_mi->mbmi.mv[1].as_int; - } else { - return 0; - } - - return 1; + clamp_mv(&mv->as_mv, xd->mb_to_left_edge - MV_BORDER, + xd->mb_to_right_edge + MV_BORDER, + xd->mb_to_top_edge - MV_BORDER, + xd->mb_to_bottom_edge + MV_BORDER); } -// Gets candidate reference motion vector(s) from the given mode info -// structure if they exists and do NOT match the given reference frame. -static void get_non_matching_candidates(const MODE_INFO *candidate_mi, - MV_REFERENCE_FRAME ref_frame, - MV_REFERENCE_FRAME *c_ref_frame, - int_mv *c_mv, - MV_REFERENCE_FRAME *c2_ref_frame, - int_mv *c2_mv) { - - c_mv->as_int = 0; - c2_mv->as_int = 0; - *c_ref_frame = INTRA_FRAME; - *c2_ref_frame = INTRA_FRAME; - - // If first candidate not valid neither will be. - if (candidate_mi->mbmi.ref_frame[0] > INTRA_FRAME) { - // First candidate - if (candidate_mi->mbmi.ref_frame[0] != ref_frame) { - *c_ref_frame = candidate_mi->mbmi.ref_frame[0]; - c_mv->as_int = candidate_mi->mbmi.mv[0].as_int; - } - - // Second candidate - if ((candidate_mi->mbmi.ref_frame[1] > INTRA_FRAME) && - (candidate_mi->mbmi.ref_frame[1] != ref_frame) && - (candidate_mi->mbmi.mv[1].as_int != candidate_mi->mbmi.mv[0].as_int)) { - *c2_ref_frame = candidate_mi->mbmi.ref_frame[1]; - c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int; - } - } +// This function returns either the appropriate sub block or block's mv +// on whether the block_size < 8x8 and we have check_sub_blocks set. +static INLINE int_mv get_sub_block_mv(const MODE_INFO *candidate, + int check_sub_blocks, int which_mv, + int search_col, int block_idx) { + return (check_sub_blocks && candidate->mbmi.sb_type < BLOCK_SIZE_SB8X8 + ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]] + .as_mv[which_mv] + : candidate->mbmi.mv[which_mv]); } // Performs mv sign inversion if indicated by the reference frame combination. -static void scale_mv(MACROBLOCKD *xd, MV_REFERENCE_FRAME this_ref_frame, - MV_REFERENCE_FRAME candidate_ref_frame, - int_mv *candidate_mv, int *ref_sign_bias) { +static INLINE int_mv scale_mv(const MODE_INFO *candidate, const int which_mv, + const MV_REFERENCE_FRAME this_ref_frame, + const int *ref_sign_bias) { + int_mv return_mv = candidate->mbmi.mv[which_mv]; // Sign inversion where appropriate. - if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) { - candidate_mv->as_mv.row = -candidate_mv->as_mv.row; - candidate_mv->as_mv.col = -candidate_mv->as_mv.col; + if (ref_sign_bias[candidate->mbmi.ref_frame[which_mv]] != + ref_sign_bias[this_ref_frame]) { + return_mv.as_mv.row *= -1; + return_mv.as_mv.col *= -1; } + return return_mv; } -// Add a candidate mv. -// Discard if it has already been seen. 
-static void add_candidate_mv(int_mv *mv_list, int *mv_scores, - int *candidate_count, int_mv candidate_mv, - int weight) { - if (*candidate_count == 0) { - mv_list[0].as_int = candidate_mv.as_int; - mv_scores[0] = weight; - *candidate_count += 1; - } else if ((*candidate_count == 1) && - (candidate_mv.as_int != mv_list[0].as_int)) { - mv_list[1].as_int = candidate_mv.as_int; - mv_scores[1] = weight; - *candidate_count += 1; +// This macro is used to add a motion vector mv_ref list if it isn't +// already in the list. If it's the second motion vector it will also +// skip all additional processing and jump to done! +#define ADD_MV_REF_LIST(MV) \ + if (refmv_count) { \ + if ((MV).as_int != mv_ref_list[0].as_int) { \ + mv_ref_list[refmv_count] = (MV); \ + goto Done; \ + } \ + } else { \ + mv_ref_list[refmv_count++] = (MV); \ + } + +// If either reference frame is different, not INTRA, and they +// are different from each other scale and add the mv to our list. +#define IF_DIFF_REF_FRAME_ADD_MV(CANDIDATE) \ + if ((CANDIDATE)->mbmi.ref_frame[0] != ref_frame) { \ + ADD_MV_REF_LIST(scale_mv((CANDIDATE), 0, ref_frame, ref_sign_bias)); \ + } \ + if ((CANDIDATE)->mbmi.ref_frame[1] != ref_frame && \ + (CANDIDATE)->mbmi.ref_frame[1] > INTRA_FRAME && \ + (CANDIDATE)->mbmi.mv[1].as_int != (CANDIDATE)->mbmi.mv[0].as_int) { \ + ADD_MV_REF_LIST(scale_mv((CANDIDATE), 1, ref_frame, ref_sign_bias)); \ } + +// Checks that the given mi_row, mi_col and search point +// are inside the borders of the tile. +static INLINE int is_inside(const int mi_col, const int mi_row, + const int cur_tile_mi_col_start, + const int cur_tile_mi_col_end, const int mi_rows, + const int (*mv_ref_search)[2], int idx) { + int mi_search_col; + const int mi_search_row = mi_row + mv_ref_search[idx][1];; + + // Check that the candidate is within the border. We only need to check + // the left side because all the positive right side ones are for blocks that + // are large enough to support the + value they have within their border. + if (mi_search_row < 0) + return 0; + + mi_search_col = mi_col + mv_ref_search[idx][0]; + if (mi_search_col < cur_tile_mi_col_start) + return 0; + + return 1; } // This function searches the neighbourhood of a given MB/SB // to try and find candidate reference vectors. -// void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, - MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame, - int_mv *mv_ref_list, int *ref_sign_bias, - int block_idx) { - int i; - MODE_INFO *candidate_mi; - MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi; - int_mv c_refmv; - int_mv c2_refmv; - MV_REFERENCE_FRAME c_ref_frame; - MV_REFERENCE_FRAME c2_ref_frame; - int candidate_scores[MAX_MV_REF_CANDIDATES] = { 0 }; + const MODE_INFO *lf_here, + const MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, const int *ref_sign_bias, + const int block_idx, + const int mi_row, const int mi_col) { + int idx; + MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; int refmv_count = 0; const int (*mv_ref_search)[2] = mv_ref_blocks[mbmi->sb_type]; - const int mi_col = get_mi_col(xd); - const int mi_row = get_mi_row(xd); - int intra_count = 0; - int zero_count = 0; - int newmv_count = 0; - int x_idx = 0, y_idx = 0; - - // Blank the reference vector lists and other local structures. 
- vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES); - - if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { - x_idx = block_idx & 1; - y_idx = block_idx >> 1; - } - - // We first scan for candidate vectors that match the current reference frame - // Look at nearest neigbours - for (i = 0; i < 2; ++i) { - const int mi_search_col = mi_col + mv_ref_search[i][0]; - const int mi_search_row = mi_row + mv_ref_search[i][1]; - if ((mi_search_col >= cm->cur_tile_mi_col_start) && - (mi_search_col < cm->cur_tile_mi_col_end) && - (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) { - int b; - - candidate_mi = here + mv_ref_search[i][0] + - (mv_ref_search[i][1] * xd->mode_info_stride); - - if (block_idx >= 0) { - if (mv_ref_search[i][0]) - b = 1 + y_idx * 2; - else - b = 2 + x_idx; - } else { - b = -1; - } - if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, b)) { - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c_refmv, 16); + const MODE_INFO *candidate; + const int check_sub_blocks = block_idx >= 0; + int different_ref_found = 0; + int context_counter = 0; + + // Blank the reference vector list + vpx_memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES); + + // The nearest 2 blocks are treated differently + // if the size < 8x8 we get the mv from the bmi substructure, + // and we also need to keep a mode count. + for (idx = 0; idx < 2; ++idx) { + if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, + cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) + continue; + + candidate = here + mv_ref_search[idx][0] + + mv_ref_search[idx][1] * xd->mode_info_stride; + + // Keep counts for entropy encoding. + context_counter += mode_2_counter[candidate->mbmi.mode]; + + // Check if the candidate comes from the same reference frame. + if (candidate->mbmi.ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 0, + mv_ref_search[idx][0], block_idx)); + different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame; + } else { + different_ref_found = 1; + if (candidate->mbmi.ref_frame[1] == ref_frame) { + // Add second motion vector if it has the same ref_frame. + ADD_MV_REF_LIST(get_sub_block_mv(candidate, check_sub_blocks, 1, + mv_ref_search[idx][0], block_idx)); } - - // Count number of neihgbours coded intra and zeromv - intra_count += (candidate_mi->mbmi.mode < NEARESTMV); - zero_count += (candidate_mi->mbmi.mode == ZEROMV); - newmv_count += (candidate_mi->mbmi.mode >= NEWMV); } } - // More distant neigbours - for (i = 2; (i < MVREF_NEIGHBOURS) && - (refmv_count < MAX_MV_REF_CANDIDATES); ++i) { - const int mi_search_col = mi_col + mv_ref_search[i][0]; - const int mi_search_row = mi_row + mv_ref_search[i][1]; - if ((mi_search_col >= cm->cur_tile_mi_col_start) && - (mi_search_col < cm->cur_tile_mi_col_end) && - (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) { - candidate_mi = here + mv_ref_search[i][0] + - (mv_ref_search[i][1] * xd->mode_info_stride); - - if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) { - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c_refmv, 16); + // Check the rest of the neighbors in much the same way + // as before except we don't need to keep track of sub blocks or + // mode counts. 
+ for (; idx < MVREF_NEIGHBOURS; ++idx) { + if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, + cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) + continue; + + candidate = here + mv_ref_search[idx][0] + + mv_ref_search[idx][1] * xd->mode_info_stride; + + if (candidate->mbmi.ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(candidate->mbmi.mv[0]); + different_ref_found = candidate->mbmi.ref_frame[1] != ref_frame; + } else { + different_ref_found = 1; + if (candidate->mbmi.ref_frame[1] == ref_frame) { + ADD_MV_REF_LIST(candidate->mbmi.mv[1]); } } } - // Look in the last frame if it exists - if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) { - candidate_mi = lf_here; - if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) { - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c_refmv, 16); + // Check the last frame's mode and mv info. + if (lf_here != NULL) { + if (lf_here->mbmi.ref_frame[0] == ref_frame) { + ADD_MV_REF_LIST(lf_here->mbmi.mv[0]); + } else if (lf_here->mbmi.ref_frame[1] == ref_frame) { + ADD_MV_REF_LIST(lf_here->mbmi.mv[1]); } } - // If we have not found enough candidates consider ones where the - // reference frame does not match. Break out when we have - // MAX_MV_REF_CANDIDATES candidates. - // Look first at spatial neighbours - for (i = 0; (i < MVREF_NEIGHBOURS) && - (refmv_count < MAX_MV_REF_CANDIDATES); ++i) { - const int mi_search_col = mi_col + mv_ref_search[i][0]; - const int mi_search_row = mi_row + mv_ref_search[i][1]; - if ((mi_search_col >= cm->cur_tile_mi_col_start) && - (mi_search_col < cm->cur_tile_mi_col_end) && - (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) { - candidate_mi = here + mv_ref_search[i][0] + - (mv_ref_search[i][1] * xd->mode_info_stride); - - get_non_matching_candidates(candidate_mi, ref_frame, - &c_ref_frame, &c_refmv, - &c2_ref_frame, &c2_refmv); - - if (c_ref_frame != INTRA_FRAME) { - scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias); - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c_refmv, 1); - } + // Since we couldn't find 2 mvs from the same reference frame + // go back through the neighbors and find motion vectors from + // different reference frames. + if (different_ref_found) { + for (idx = 0; idx < MVREF_NEIGHBOURS; ++idx) { + if (!is_inside(mi_col, mi_row, cm->cur_tile_mi_col_start, + cm->cur_tile_mi_col_end, cm->mi_rows, mv_ref_search, idx)) + continue; - if (c2_ref_frame != INTRA_FRAME) { - scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias); - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c2_refmv, 1); - } - } - } + candidate = here + mv_ref_search[idx][0] + + mv_ref_search[idx][1] * xd->mode_info_stride; - // Look at the last frame if it exists - if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) { - candidate_mi = lf_here; - get_non_matching_candidates(candidate_mi, ref_frame, - &c_ref_frame, &c_refmv, - &c2_ref_frame, &c2_refmv); - - if (c_ref_frame != INTRA_FRAME) { - scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias); - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c_refmv, 1); - } + // If the candidate is INTRA we don't want to consider its mv. 
+ if (candidate->mbmi.ref_frame[0] == INTRA_FRAME) + continue; - if (c2_ref_frame != INTRA_FRAME) { - scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias); - add_candidate_mv(mv_ref_list, candidate_scores, - &refmv_count, c2_refmv, 1); + IF_DIFF_REF_FRAME_ADD_MV(candidate); } } - if (!intra_count) { - if (!newmv_count) { - // 0 = both zero mv - // 1 = one zero mv + one a predicted mv - // 2 = two predicted mvs - mbmi->mb_mode_context[ref_frame] = 2 - zero_count; - } else { - // 3 = one predicted/zero and one new mv - // 4 = two new mvs - mbmi->mb_mode_context[ref_frame] = 2 + newmv_count; - } - } else { - // 5 = one intra neighbour + x - // 6 = two intra neighbours - mbmi->mb_mode_context[ref_frame] = 4 + intra_count; + // Since we still don't have a candidate we'll try the last frame. + if (lf_here != NULL && lf_here->mbmi.ref_frame[0] != INTRA_FRAME) { + IF_DIFF_REF_FRAME_ADD_MV(lf_here); } + Done: + + mbmi->mb_mode_context[ref_frame] = counter_to_context[context_counter]; + // Clamp vectors - for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { - clamp_mv_ref(xd, &mv_ref_list[i]); + for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) { + clamp_mv_ref(xd, &mv_ref_list[idx]); } } + +#undef ADD_MV_REF_LIST +#undef IF_DIFF_REF_FRAME_ADD_MV diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h index 7290f10ab..c5f89eb57 100644 --- a/libvpx/vp9/common/vp9_mvref_common.h +++ b/libvpx/vp9/common/vp9_mvref_common.h @@ -17,11 +17,13 @@ void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here, - MODE_INFO *lf_here, - MV_REFERENCE_FRAME ref_frame, + const MODE_INFO *lf_here, + const MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, - int *ref_sign_bias, - int block_idx); + const int *ref_sign_bias, + const int block_idx, + const int mi_row, + const int mi_col); static INLINE void vp9_find_mv_refs(VP9_COMMON *cm, MACROBLOCKD *xd, @@ -29,9 +31,10 @@ static INLINE void vp9_find_mv_refs(VP9_COMMON *cm, MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame, int_mv *mv_ref_list, - int *ref_sign_bias) { + int *ref_sign_bias, + int mi_row, int mi_col) { vp9_find_mv_refs_idx(cm, xd, here, lf_here, ref_frame, - mv_ref_list, ref_sign_bias, -1); + mv_ref_list, ref_sign_bias, -1, mi_row, mi_col); } #endif // VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index f31f24b26..152a93293 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -42,7 +42,7 @@ typedef struct frame_contexts { vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] [PARTITION_TYPES - 1]; - vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES]; + vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] [VP9_SWITCHABLE_FILTERS - 1]; vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; @@ -59,12 +59,12 @@ typedef struct { unsigned int y_mode[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES]; unsigned int uv_mode[VP9_INTRA_MODES][VP9_INTRA_MODES]; unsigned int partition[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; - vp9_coeff_count_model coef[TX_SIZE_MAX_SB][BLOCK_TYPES]; - unsigned int eob_branch[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES] + vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES]; + unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES] [COEF_BANDS][PREV_COEF_CONTEXTS]; unsigned int switchable_interp[VP9_SWITCHABLE_FILTERS + 1] 
[VP9_SWITCHABLE_FILTERS]; - unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; + unsigned int inter_mode[INTER_MODE_CONTEXTS][VP9_INTER_MODES]; unsigned int intra_inter[INTRA_INTER_CONTEXTS][2]; unsigned int comp_inter[COMP_INTER_CONTEXTS][2]; unsigned int single_ref[REF_CONTEXTS][2][2]; @@ -240,8 +240,7 @@ static INLINE void set_partition_seg_context(VP9_COMMON *cm, MACROBLOCKD *xd, xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); } -static int check_bsize_coverage(VP9_COMMON *cm, MACROBLOCKD *xd, - int mi_row, int mi_col, +static int check_bsize_coverage(VP9_COMMON *cm, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { int bsl = mi_width_log2(bsize), bs = 1 << bsl; int ms = bs / 2; @@ -278,14 +277,6 @@ static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd, xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end); } -static int get_mi_row(const MACROBLOCKD *xd) { - return ((-xd->mb_to_top_edge) >> (3 + LOG2_MI_SIZE)); -} - -static int get_mi_col(const MACROBLOCKD *xd) { - return ((-xd->mb_to_left_edge) >> (3 + LOG2_MI_SIZE)); -} - static int get_token_alloc(int mb_rows, int mb_cols) { return mb_rows * mb_cols * (48 * 16 + 4); } diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c index e8bcdea82..795962a71 100644 --- a/libvpx/vp9/common/vp9_pred_common.c +++ b/libvpx/vp9/common/vp9_pred_common.c @@ -55,34 +55,28 @@ unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) { } // Returns a context number for the given MB prediction signal unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) { - int pred_context; const MODE_INFO *const mi = xd->mode_info_context; const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; const int left_in_image = xd->left_available && left_mbmi->mb_in_image; const int above_in_image = xd->up_available && above_mbmi->mb_in_image; - // Note: - // The mode info data structure has a one element border above and to the - // left of the entries correpsonding to real macroblocks. - // The prediction flags in these dummy entries are initialised to 0. - if (above_in_image && left_in_image) { // both edges available - if (left_mbmi->ref_frame[0] == INTRA_FRAME && - above_mbmi->ref_frame[0] == INTRA_FRAME) { // intra/intra (3) - pred_context = 3; - } else { // intra/inter (1) or inter/inter (0) - pred_context = left_mbmi->ref_frame[0] == INTRA_FRAME || - above_mbmi->ref_frame[0] == INTRA_FRAME; - } - } else if (above_in_image || left_in_image) { // one edge available - const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi; + const int left_intra = !is_inter_block(left_mbmi); + const int above_intra = !is_inter_block(above_mbmi); - // inter: 0, intra: 2 - pred_context = 2 * (edge_mbmi->ref_frame[0] == INTRA_FRAME); - } else { - pred_context = 0; - } - assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS); - return pred_context; + // The mode info data structure has a one element border above and to the + // left of the entries corresponding to real macroblocks. + // The prediction flags in these dummy entries are initialized to 0. + // 0 - inter/inter, inter/--, --/inter, --/-- + // 1 - intra/inter, inter/intra + // 2 - intra/--, --/intra + // 3 - intra/intra + if (above_in_image && left_in_image) // both edges available + return left_intra && above_intra ? 
3 + : left_intra || above_intra; + else if (above_in_image || left_in_image) // one edge available + return 2 * (above_in_image ? above_intra : left_intra); + else + return 0; } // Returns a context number for the given MB prediction signal unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm, diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h index e4b6575e3..238290b41 100644 --- a/libvpx/vp9/common/vp9_pred_common.h +++ b/libvpx/vp9/common/vp9_pred_common.h @@ -110,9 +110,9 @@ unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd); static const vp9_prob *get_tx_probs(BLOCK_SIZE_TYPE bsize, uint8_t context, const struct tx_probs *tx_probs) { - if (bsize < BLOCK_SIZE_MB16X16) + if (bsize < BLOCK_16X16) return tx_probs->p8x8[context]; - else if (bsize < BLOCK_SIZE_SB32X32) + else if (bsize < BLOCK_32X32) return tx_probs->p16x16[context]; else return tx_probs->p32x32[context]; @@ -127,9 +127,9 @@ static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd, static void update_tx_counts(BLOCK_SIZE_TYPE bsize, uint8_t context, TX_SIZE tx_size, struct tx_counts *tx_counts) { - if (bsize >= BLOCK_SIZE_SB32X32) + if (bsize >= BLOCK_32X32) tx_counts->p32x32[context][tx_size]++; - else if (bsize >= BLOCK_SIZE_MB16X16) + else if (bsize >= BLOCK_16X16) tx_counts->p16x16[context][tx_size]++; else tx_counts->p8x8[context][tx_size]++; diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c index 63e5646ad..0b65e0610 100644 --- a/libvpx/vp9/common/vp9_reconinter.c +++ b/libvpx/vp9/common/vp9_reconinter.c @@ -197,14 +197,14 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd, void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const int_mv *src_mv, + const MV *src_mv, const struct scale_factors *scale, int w, int h, int weight, const struct subpix_fn_table *subpix, enum mv_precision precision) { const MV32 mv = precision == MV_PRECISION_Q4 - ? scale->scale_mv_q4(&src_mv->as_mv, scale) - : scale->scale_mv_q3_to_q4(&src_mv->as_mv, scale); + ? scale->scale_mv_q4(src_mv, scale) + : scale->scale_mv_q3_to_q4(src_mv, scale); const int subpel_x = mv.col & 15; const int subpel_y = mv.row & 15; @@ -220,45 +220,44 @@ static INLINE int round_mv_comp_q4(int value) { return (value < 0 ? 
value - 2 : value + 2) / 4; } -static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int idx) { - const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.row + - mb->mode_info_context->bmi[1].as_mv[idx].as_mv.row + - mb->mode_info_context->bmi[2].as_mv[idx].as_mv.row + - mb->mode_info_context->bmi[3].as_mv[idx].as_mv.row; - return round_mv_comp_q4(temp); +static MV mi_mv_pred_q4(const MODE_INFO *mi, int idx) { + MV res = { round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.row + + mi->bmi[1].as_mv[idx].as_mv.row + + mi->bmi[2].as_mv[idx].as_mv.row + + mi->bmi[3].as_mv[idx].as_mv.row), + round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.col + + mi->bmi[1].as_mv[idx].as_mv.col + + mi->bmi[2].as_mv[idx].as_mv.col + + mi->bmi[3].as_mv[idx].as_mv.col) }; + return res; } -static int mi_mv_pred_col_q4(MACROBLOCKD *mb, int idx) { - const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.col + - mb->mode_info_context->bmi[1].as_mv[idx].as_mv.col + - mb->mode_info_context->bmi[2].as_mv[idx].as_mv.col + - mb->mode_info_context->bmi[3].as_mv[idx].as_mv.col; - return round_mv_comp_q4(temp); -} + // TODO(jkoleszar): yet another mv clamping function :-( MV clamp_mv_to_umv_border_sb(const MV *src_mv, int bwl, int bhl, int ss_x, int ss_y, int mb_to_left_edge, int mb_to_top_edge, int mb_to_right_edge, int mb_to_bottom_edge) { - /* If the MV points so far into the UMV border that no visible pixels - * are used for reconstruction, the subpel part of the MV can be - * discarded and the MV limited to 16 pixels with equivalent results. - */ + // If the MV points so far into the UMV border that no visible pixels + // are used for reconstruction, the subpel part of the MV can be + // discarded and the MV limited to 16 pixels with equivalent results. const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4; const int spel_right = spel_left - (1 << 4); const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4; const int spel_bottom = spel_top - (1 << 4); - MV clamped_mv; - + MV clamped_mv = { + src_mv->row << (1 - ss_y), + src_mv->col << (1 - ss_x) + }; assert(ss_x <= 1); assert(ss_y <= 1); - clamped_mv.col = clamp(src_mv->col << (1 - ss_x), - (mb_to_left_edge << (1 - ss_x)) - spel_left, - (mb_to_right_edge << (1 - ss_x)) + spel_right); - clamped_mv.row = clamp(src_mv->row << (1 - ss_y), - (mb_to_top_edge << (1 - ss_y)) - spel_top, - (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom); + + clamp_mv(&clamped_mv, (mb_to_left_edge << (1 - ss_x)) - spel_left, + (mb_to_right_edge << (1 - ss_x)) + spel_right, + (mb_to_top_edge << (1 - ss_y)) - spel_top, + (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom); + return clamped_mv; } @@ -280,15 +279,14 @@ static void build_inter_predictors(int plane, int block, const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl); - const int use_second_ref = xd->mode_info_context->mbmi.ref_frame[1] > 0; + const MODE_INFO *const mi = xd->mode_info_context; + const int use_second_ref = mi->mbmi.ref_frame[1] > 0; int which_mv; assert(x < (4 << bwl)); assert(y < (4 << bhl)); - assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 || - 4 << pred_w == (4 << bwl)); - assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 || - 4 << pred_h == (4 << bhl)); + assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_w == (4 << bwl)); + assert(mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 || 4 << pred_h == (4 << bhl)); for (which_mv = 0; which_mv < 1 
+ use_second_ref; ++which_mv) { // source @@ -301,44 +299,30 @@ static void build_inter_predictors(int plane, int block, // dest uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x; - // motion vector - const MV *mv; - MV split_chroma_mv; - int_mv clamped_mv; - - if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - if (plane == 0) { - mv = &xd->mode_info_context->bmi[block].as_mv[which_mv].as_mv; - } else { - // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the - // same MV (the average of the 4 luma MVs) but we could do something - // smarter for non-4:2:0. Just punt for now, pending the changes to get - // rid of SPLITMV mode entirely. - split_chroma_mv.row = mi_mv_pred_row_q4(xd, which_mv); - split_chroma_mv.col = mi_mv_pred_col_q4(xd, which_mv); - mv = &split_chroma_mv; - } - } else { - mv = &xd->mode_info_context->mbmi.mv[which_mv].as_mv; - } - - /* TODO(jkoleszar): This clamping is done in the incorrect place for the - * scaling case. It needs to be done on the scaled MV, not the pre-scaling - * MV. Note however that it performs the subsampling aware scaling so - * that the result is always q4. - */ - clamped_mv.as_mv = clamp_mv_to_umv_border_sb(mv, bwl, bhl, - xd->plane[plane].subsampling_x, - xd->plane[plane].subsampling_y, - xd->mb_to_left_edge, - xd->mb_to_top_edge, - xd->mb_to_right_edge, - xd->mb_to_bottom_edge); + // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the + // same MV (the average of the 4 luma MVs) but we could do something + // smarter for non-4:2:0. Just punt for now, pending the changes to get + // rid of SPLITMV mode entirely. + const MV mv = mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 + ? (plane == 0 ? mi->bmi[block].as_mv[which_mv].as_mv + : mi_mv_pred_q4(mi, which_mv)) + : mi->mbmi.mv[which_mv].as_mv; + + // TODO(jkoleszar): This clamping is done in the incorrect place for the + // scaling case. It needs to be done on the scaled MV, not the pre-scaling + // MV. Note however that it performs the subsampling aware scaling so + // that the result is always q4. 
+ const MV res_mv = clamp_mv_to_umv_border_sb(&mv, bwl, bhl, + xd->plane[plane].subsampling_x, + xd->plane[plane].subsampling_y, + xd->mb_to_left_edge, + xd->mb_to_top_edge, + xd->mb_to_right_edge, + xd->mb_to_bottom_edge); scale->set_scaled_offsets(scale, arg->y + y, arg->x + x); - vp9_build_inter_predictor(pre, pre_stride, dst, arg->dst_stride[plane], - &clamped_mv, &xd->scale_factor[which_mv], + &res_mv, &xd->scale_factor[which_mv], 4 << pred_w, 4 << pred_h, which_mv, &xd->subpix, MV_PRECISION_Q4); } @@ -400,7 +384,7 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) { const int ref = cm->active_ref_idx[i]; struct scale_factors *const sf = &cm->active_ref_scale[i]; if (ref >= NUM_YV12_BUFFERS) { - memset(sf, 0, sizeof(*sf)); + vp9_zero(*sf); } else { YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref]; vp9_setup_scale_factors_for_frame(sf, diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h index e37750dea..6ec7323e1 100644 --- a/libvpx/vp9/common/vp9_reconinter.h +++ b/libvpx/vp9/common/vp9_reconinter.h @@ -39,7 +39,7 @@ void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, - const int_mv *mv_q3, + const MV *mv_q3, const struct scale_factors *scale, int w, int h, int do_avg, const struct subpix_fn_table *subpix, diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh index c357ef62a..6bb3cb888 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.sh +++ b/libvpx/vp9/common/vp9_rtcd_defs.sh @@ -7,9 +7,7 @@ cat <<EOF #include "vpx/vpx_integer.h" #include "vp9/common/vp9_enums.h" -struct loop_filter_info; struct macroblockd; -struct loop_filter_info; /* Encoder forward decls */ struct macroblock; @@ -22,7 +20,11 @@ EOF } forward_decls vp9_common_forward_decls -[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 +# x86inc.asm doesn't work if pic is enabled on 32 bit platforms so no assembly. +[ "$CONFIG_USE_X86INC" = "yes" ] && mmx_x86inc=mmx && sse2_x86inc=sse2 && ssse3_x86inc=ssse3 + +# this variable is for functions that are 64 bit only. 
+[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 && ssse3_x86_64=ssse3 # # Dequant @@ -47,7 +49,7 @@ prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, ui specialize vp9_d27_predictor_4x4 prototype void vp9_d45_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_4x4 +specialize vp9_d45_predictor_4x4 ssse3 prototype void vp9_d63_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_d63_predictor_4x4 @@ -86,7 +88,7 @@ prototype void vp9_d27_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, ui specialize vp9_d27_predictor_8x8 prototype void vp9_d45_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_8x8 +specialize vp9_d45_predictor_8x8 ssse3 prototype void vp9_d63_predictor_8x8 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_d63_predictor_8x8 @@ -125,7 +127,7 @@ prototype void vp9_d27_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, specialize vp9_d27_predictor_16x16 prototype void vp9_d45_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_16x16 +specialize vp9_d45_predictor_16x16 ssse3 prototype void vp9_d63_predictor_16x16 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_d63_predictor_16x16 @@ -164,7 +166,7 @@ prototype void vp9_d27_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, specialize vp9_d27_predictor_32x32 prototype void vp9_d45_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" -specialize vp9_d45_predictor_32x32 +specialize vp9_d45_predictor_32x32 ssse3 prototype void vp9_d63_predictor_32x32 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col" specialize vp9_d63_predictor_32x32 @@ -214,7 +216,7 @@ fi # Loopfilter # prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh" -specialize vp9_mb_lpf_vertical_edge_w sse2 +specialize vp9_mb_lpf_vertical_edge_w sse2 neon prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mbloop_filter_vertical_edge sse2 neon @@ -223,7 +225,7 @@ prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8 specialize vp9_loop_filter_vertical_edge mmx neon prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" -specialize vp9_mb_lpf_horizontal_edge_w sse2 +specialize vp9_mb_lpf_horizontal_edge_w sse2 neon prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mbloop_filter_horizontal_edge sse2 neon @@ -265,10 +267,10 @@ specialize vp9_blend_b # Sub Pixel Filters # prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_copy sse2 +specialize vp9_convolve_copy $sse2_x86inc prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t 
dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve_avg sse2 +specialize vp9_convolve_avg $sse2_x86inc prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" specialize vp9_convolve8 ssse3 neon @@ -297,14 +299,17 @@ specialize vp9_short_idct4x4_1_add sse2 prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct4x4_add sse2 +prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct8x8_1_add sse2 + prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct8x8_add sse2 neon prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_8x8_add sse2 -prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output" -specialize vp9_short_idct1_8x8 +prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct16x16_1_add sse2 prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct16x16_add sse2 @@ -312,18 +317,12 @@ specialize vp9_short_idct16x16_add sse2 prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_16x16_add sse2 -prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output" -specialize vp9_short_idct1_16x16 - prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct32x32_add sse2 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" specialize vp9_short_idct1_32x32 -prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride" -specialize vp9_short_idct10_32x32_add - prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" specialize vp9_short_iht4x4_add sse2 @@ -702,12 +701,10 @@ specialize vp9_get_mb_ss mmx sse2 # ENCODEMB INVOKE prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size, int64_t *ssz" -specialize vp9_block_error sse2 +specialize vp9_block_error $sse2_x86inc prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" -specialize vp9_subtract_block sse2 - -[ $arch = "x86_64" ] && ssse3_x86_64=ssse3 +specialize vp9_subtract_block $sse2_x86inc prototype void vp9_quantize_b "int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" specialize vp9_quantize_b $ssse3_x86_64 @@ -719,13 +716,11 @@ specialize vp9_quantize_b_32x32 $ssse3_x86_64 # Structured Similarity (SSIM) # if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then - [ $arch = "x86_64" ] && sse2_on_x86_64=sse2 - prototype void vp9_ssim_parms_8x8 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" - specialize vp9_ssim_parms_8x8 $sse2_on_x86_64 + specialize 
vp9_ssim_parms_8x8 $sse2_x86_64 prototype void vp9_ssim_parms_16x16 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" - specialize vp9_ssim_parms_16x16 $sse2_on_x86_64 + specialize vp9_ssim_parms_16x16 $sse2_x86_64 fi # fdct functions diff --git a/libvpx/vp9/common/vp9_treecoder.h b/libvpx/vp9/common/vp9_treecoder.h index ebcd4116f..31182c35c 100644 --- a/libvpx/vp9/common/vp9_treecoder.h +++ b/libvpx/vp9/common/vp9_treecoder.h @@ -79,4 +79,22 @@ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); } +static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob, + const unsigned int ct[2], + unsigned int count_sat, + unsigned int max_update_factor) { + const unsigned int count = MIN(ct[0] + ct[1], count_sat); + const unsigned int factor = max_update_factor * count / count_sat; + return weighted_prob(pre_prob, prob, factor); +} + +static INLINE vp9_prob merge_probs2(vp9_prob pre_prob, + const unsigned int ct[2], + unsigned int count_sat, + unsigned int max_update_factor) { + return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat, + max_update_factor); +} + + #endif // VP9_COMMON_VP9_TREECODER_H_ diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index a1e14b482..8f740f412 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -523,9 +523,9 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride, { \ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ d0 = _mm_unpacklo_epi8(d0, zero); \ - in_x = _mm_add_epi16(in_x, d0); \ - in_x = _mm_packus_epi16(in_x, in_x); \ - _mm_storel_epi64((__m128i *)(dest), in_x); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + _mm_storel_epi64((__m128i *)(dest), d0); \ dest += stride; \ } @@ -597,6 +597,27 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) { RECON_AND_STORE(dest, in7); } +void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 5); + + dc_value = _mm_set1_epi16(a); + + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); +} + // perform 8x8 transpose static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); @@ -1449,6 +1470,38 @@ void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) { } } +void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 2; ++i) { + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + 
RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + dest += 8 - (stride * 16); + } +} + static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { __m128i tbuf[8]; array_transpose_8x8(res0, res0); @@ -2760,6 +2813,12 @@ void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, } } +#define LOAD_DQCOEFF(reg, input) \ + { \ + reg = _mm_load_si128((__m128i *) input); \ + input += 8; \ + } \ + void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1<<5); @@ -2827,48 +2886,126 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j; + int i, j, i32; + __m128i zero_idx[16]; + int zero_flag[2]; // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. for (i = 0; i < 8; i++) { + i32 = (i << 5); if (i < 4) { // First 1-D idct // Load input data. - in0 = _mm_load_si128((__m128i *)input); - in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); - in16 = _mm_load_si128((__m128i *)(input + 8 * 2)); - in24 = _mm_load_si128((__m128i *)(input + 8 * 3)); - in1 = _mm_load_si128((__m128i *)(input + 8 * 4)); - in9 = _mm_load_si128((__m128i *)(input + 8 * 5)); - in17 = _mm_load_si128((__m128i *)(input + 8 * 6)); - in25 = _mm_load_si128((__m128i *)(input + 8 * 7)); - in2 = _mm_load_si128((__m128i *)(input + 8 * 8)); - in10 = _mm_load_si128((__m128i *)(input + 8 * 9)); - in18 = _mm_load_si128((__m128i *)(input + 8 * 10)); - in26 = _mm_load_si128((__m128i *)(input + 8 * 11)); - in3 = _mm_load_si128((__m128i *)(input + 8 * 12)); - in11 = _mm_load_si128((__m128i *)(input + 8 * 13)); - in19 = _mm_load_si128((__m128i *)(input + 8 * 14)); - in27 = _mm_load_si128((__m128i *)(input + 8 * 15)); - - in4 = _mm_load_si128((__m128i *)(input + 8 * 16)); - in12 = _mm_load_si128((__m128i *)(input + 8 * 17)); - in20 = _mm_load_si128((__m128i *)(input + 8 * 18)); - in28 = _mm_load_si128((__m128i *)(input + 8 * 19)); - in5 = _mm_load_si128((__m128i *)(input + 8 * 20)); - in13 = _mm_load_si128((__m128i *)(input + 8 * 21)); - in21 = _mm_load_si128((__m128i *)(input + 8 * 22)); - in29 = _mm_load_si128((__m128i *)(input + 8 * 23)); - in6 = _mm_load_si128((__m128i *)(input + 8 * 24)); - in14 = _mm_load_si128((__m128i *)(input + 8 * 25)); - in22 = _mm_load_si128((__m128i *)(input + 8 * 26)); - in30 = _mm_load_si128((__m128i *)(input + 8 * 27)); - in7 = _mm_load_si128((__m128i *)(input + 8 * 28)); - in15 = _mm_load_si128((__m128i *)(input + 8 * 29)); - in23 = _mm_load_si128((__m128i *)(input + 8 * 30)); - in31 = _mm_load_si128((__m128i *)(input + 8 * 31)); - - input += 256; + LOAD_DQCOEFF(in0, input); + LOAD_DQCOEFF(in8, input); + LOAD_DQCOEFF(in16, input); + LOAD_DQCOEFF(in24, input); + LOAD_DQCOEFF(in1, input); + LOAD_DQCOEFF(in9, input); + LOAD_DQCOEFF(in17, input); + LOAD_DQCOEFF(in25, input); + LOAD_DQCOEFF(in2, input); + LOAD_DQCOEFF(in10, input); + LOAD_DQCOEFF(in18, input); + LOAD_DQCOEFF(in26, input); + LOAD_DQCOEFF(in3, input); + LOAD_DQCOEFF(in11, input); + 
LOAD_DQCOEFF(in19, input); + LOAD_DQCOEFF(in27, input); + + LOAD_DQCOEFF(in4, input); + LOAD_DQCOEFF(in12, input); + LOAD_DQCOEFF(in20, input); + LOAD_DQCOEFF(in28, input); + LOAD_DQCOEFF(in5, input); + LOAD_DQCOEFF(in13, input); + LOAD_DQCOEFF(in21, input); + LOAD_DQCOEFF(in29, input); + LOAD_DQCOEFF(in6, input); + LOAD_DQCOEFF(in14, input); + LOAD_DQCOEFF(in22, input); + LOAD_DQCOEFF(in30, input); + LOAD_DQCOEFF(in7, input); + LOAD_DQCOEFF(in15, input); + LOAD_DQCOEFF(in23, input); + LOAD_DQCOEFF(in31, input); + + // checking if all entries are zero + zero_idx[0] = _mm_or_si128(in0, in1); + zero_idx[1] = _mm_or_si128(in2, in3); + zero_idx[2] = _mm_or_si128(in4, in5); + zero_idx[3] = _mm_or_si128(in6, in7); + zero_idx[4] = _mm_or_si128(in8, in9); + zero_idx[5] = _mm_or_si128(in10, in11); + zero_idx[6] = _mm_or_si128(in12, in13); + zero_idx[7] = _mm_or_si128(in14, in15); + zero_idx[8] = _mm_or_si128(in16, in17); + zero_idx[9] = _mm_or_si128(in18, in19); + zero_idx[10] = _mm_or_si128(in20, in21); + zero_idx[11] = _mm_or_si128(in22, in23); + zero_idx[12] = _mm_or_si128(in24, in25); + zero_idx[13] = _mm_or_si128(in26, in27); + zero_idx[14] = _mm_or_si128(in28, in29); + zero_idx[15] = _mm_or_si128(in30, in31); + + zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); + zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); + + zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); + zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); + zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); + zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); + zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); + zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); + zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); + + zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]); + zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]); + zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32); + zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]); + zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]); + + if (!zero_flag[0] && !zero_flag[1]) { + col[i32 + 0] = _mm_setzero_si128(); + col[i32 + 1] = _mm_setzero_si128(); + col[i32 + 2] = _mm_setzero_si128(); + col[i32 + 3] = _mm_setzero_si128(); + col[i32 + 4] = _mm_setzero_si128(); + col[i32 + 5] = _mm_setzero_si128(); + col[i32 + 6] = _mm_setzero_si128(); + col[i32 + 7] = _mm_setzero_si128(); + col[i32 + 8] = _mm_setzero_si128(); + col[i32 + 9] = _mm_setzero_si128(); + col[i32 + 10] = _mm_setzero_si128(); + col[i32 + 11] = _mm_setzero_si128(); + col[i32 + 12] = _mm_setzero_si128(); + col[i32 + 13] = _mm_setzero_si128(); + col[i32 + 14] = _mm_setzero_si128(); + col[i32 + 15] = _mm_setzero_si128(); + col[i32 + 16] = _mm_setzero_si128(); + col[i32 + 17] = _mm_setzero_si128(); + col[i32 + 18] = _mm_setzero_si128(); + col[i32 + 19] = _mm_setzero_si128(); + col[i32 + 20] = _mm_setzero_si128(); + col[i32 + 21] = _mm_setzero_si128(); + col[i32 + 22] = _mm_setzero_si128(); + col[i32 + 23] = _mm_setzero_si128(); + col[i32 + 24] = _mm_setzero_si128(); + col[i32 + 25] = _mm_setzero_si128(); + col[i32 + 26] = _mm_setzero_si128(); + col[i32 + 27] = _mm_setzero_si128(); + col[i32 + 28] = _mm_setzero_si128(); + col[i32 + 29] = 
_mm_setzero_si128(); + col[i32 + 30] = _mm_setzero_si128(); + col[i32 + 31] = _mm_setzero_si128(); + continue; + } // Transpose 32x8 block to 8x32 block TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, @@ -3239,38 +3376,38 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { // final stage if (i < 4) { // 1_D: Store 32 intermediate results for each 8x32 block. - col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i32 + 28] = 
_mm_sub_epi16(stp1_3, stp1_28); + col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); } else { const __m128i zero = _mm_setzero_si128(); diff --git a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm index bc8ed5c1f..8ba26f310 100644 --- a/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm +++ b/libvpx/vp9/common/x86/vp9_intrapred_ssse3.asm @@ -10,6 +10,31 @@ %include "third_party/x86inc/x86inc.asm" +SECTION_RODATA + +pb_1: times 16 db 1 +pw_2: times 8 dw 2 +pb_7m1: times 8 db 7, -1 +pb_15: times 16 db 15 + +sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7 +sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7 +sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 +sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 +sh_b2w01234577: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 7, -1, 7, -1 +sh_b2w12345677: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1 +sh_b2w23456777: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 7, -1, 7, -1 +sh_b2w01234567: db 0, -1, 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1 +sh_b2w12345678: db 1, -1, 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1 +sh_b2w23456789: db 2, -1, 3, -1, 4, -1, 5, -1, 6, -1, 7, -1, 8, -1, 9, -1 +sh_b2w89abcdef: db 8, -1, 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1 +sh_b2w9abcdeff: db 9, -1, 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1 +sh_b2wabcdefff: db 10, -1, 11, -1, 12, -1, 13, -1, 14, -1, 15, -1, 15, -1, 15, -1 +sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 +sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 + SECTION .text INIT_MMX ssse3 @@ -85,3 +110,182 @@ cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left inc lineq jnz .loop REP_RET + +INIT_MMX ssse3 +cglobal d45_predictor_4x4, 3, 3, 4, dst, stride, above + movq m0, [aboveq] + pshufb m2, m0, [sh_b23456777] + pshufb m1, m0, [sh_b01234577] + pshufb m0, [sh_b12345677] + pavgb m3, m2, m1 + pxor m2, m1 + pand m2, [pb_1] + psubb m3, m2 + pavgb m0, m3 + + ; store 4 lines + movd [dstq ], m0 + psrlq m0, 8 + movd [dstq+strideq], m0 + lea dstq, [dstq+strideq*2] + psrlq m0, 8 + movd [dstq ], m0 + psrlq m0, 8 + movd [dstq+strideq], m0 + RET + +INIT_MMX ssse3 +cglobal d45_predictor_8x8, 3, 3, 4, dst, stride, above + movq m0, [aboveq] + mova m1, [sh_b12345677] + DEFINE_ARGS dst, stride, stride3, line + lea stride3q, [strideq*3] + pshufb m2, m0, [sh_b23456777] + pavgb m3, m2, m0 + pxor m2, m0 + pshufb m0, m1 + pand m2, [pb_1] + psubb m3, m2 + pavgb m0, m3 + + ; store 4 lines + movq [dstq ], m0 + pshufb m0, m1 + movq [dstq+strideq ], m0 + pshufb m0, m1 + movq [dstq+strideq*2], m0 + pshufb m0, m1 + movq [dstq+stride3q ], m0 + pshufb m0, m1 + lea dstq, [dstq+strideq*4] + + ; store next 4 lines + movq [dstq ], m0 + pshufb m0, m1 + movq [dstq+strideq ], m0 + pshufb m0, m1 + movq [dstq+strideq*2], m0 + pshufb m0, m1 + movq [dstq+stride3q ], m0 + RET + +INIT_XMM ssse3 +cglobal d45_predictor_16x16, 3, 5, 4, dst, stride, above, dst8, line + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3, dst8, line + lea stride3q, [strideq*3] + lea dst8q, [dstq+strideq*8] + mova m1, [sh_b123456789abcdeff] + pshufb m2, m0, [sh_b23456789abcdefff] + pavgb m3, m2, m0 + pxor m2, m0 + pshufb m0, m1 + pand m2, [pb_1] + psubb m3, m2 + 
pavgb m0, m3 + + ; first 4 lines and first half of 3rd 4 lines + mov lined, 2 +.loop: + mova [dstq ], m0 + movhps [dst8q ], m0 + pshufb m0, m1 + mova [dstq +strideq ], m0 + movhps [dst8q+strideq ], m0 + pshufb m0, m1 + mova [dstq +strideq*2 ], m0 + movhps [dst8q+strideq*2 ], m0 + pshufb m0, m1 + mova [dstq +stride3q ], m0 + movhps [dst8q+stride3q ], m0 + pshufb m0, m1 + lea dstq, [dstq +strideq*4] + lea dst8q, [dst8q+strideq*4] + dec lined + jnz .loop + + ; bottom-right 8x8 block + movhps [dstq +8], m0 + movhps [dstq+strideq +8], m0 + movhps [dstq+strideq*2+8], m0 + movhps [dstq+stride3q +8], m0 + lea dstq, [dstq+strideq*4] + movhps [dstq +8], m0 + movhps [dstq+strideq +8], m0 + movhps [dstq+strideq*2+8], m0 + movhps [dstq+stride3q +8], m0 + RET + +INIT_XMM ssse3 +cglobal d45_predictor_32x32, 3, 5, 7, dst, stride, above, dst16, line + mova m0, [aboveq] + mova m4, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, dst16, line + lea stride3q, [strideq*3] + lea dst16q, [dstq +strideq*8] + lea dst16q, [dst16q+strideq*8] + mova m1, [sh_b123456789abcdeff] + pshufb m2, m4, [sh_b23456789abcdefff] + pavgb m3, m2, m4 + pxor m2, m4 + palignr m5, m4, m0, 1 + palignr m6, m4, m0, 2 + pshufb m4, m1 + pand m2, [pb_1] + psubb m3, m2 + pavgb m4, m3 + pavgb m3, m0, m6 + pxor m0, m6 + pand m0, [pb_1] + psubb m3, m0 + pavgb m5, m3 + + ; write 4x4 lines (and the first half of the second 4x4 lines) + mov lined, 4 +.loop: + mova [dstq ], m5 + mova [dstq +16], m4 + mova [dst16q ], m4 + palignr m3, m4, m5, 1 + pshufb m4, m1 + mova [dstq +strideq ], m3 + mova [dstq +strideq +16], m4 + mova [dst16q+strideq ], m4 + palignr m5, m4, m3, 1 + pshufb m4, m1 + mova [dstq +strideq*2 ], m5 + mova [dstq +strideq*2+16], m4 + mova [dst16q+strideq*2 ], m4 + palignr m3, m4, m5, 1 + pshufb m4, m1 + mova [dstq +stride3q ], m3 + mova [dstq +stride3q +16], m4 + mova [dst16q+stride3q ], m4 + palignr m5, m4, m3, 1 + pshufb m4, m1 + lea dstq, [dstq +strideq*4] + lea dst16q, [dst16q+strideq*4] + dec lined + jnz .loop + + ; write second half of second 4x4 lines + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + lea dstq, [dstq +strideq*4] + mova [dstq +16], m4 + mova [dstq +strideq +16], m4 + mova [dstq +strideq*2+16], m4 + mova [dstq +stride3q +16], m4 + RET diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c index 6f0044a4a..a3e2ad39d 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libvpx/vp9/decoder/vp9_decodemv.c @@ -30,8 +30,12 @@ static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p); } -static MB_PREDICTION_MODE read_inter_mode(vp9_reader *r, const vp9_prob *p) { - return (MB_PREDICTION_MODE)treed_read(r, vp9_inter_mode_tree, p); +static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r, + uint8_t context) { + MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree, + cm->fc.inter_mode_probs[context]); + ++cm->counts.inter_mode[context][inter_mode_offset(mode)]; + return mode; } static int read_segment_id(vp9_reader *r, const struct segmentation *seg) { @@ -43,9 +47,9 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, 
const uint8_t context = vp9_get_pred_context_tx_size(xd); const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs); TX_SIZE tx_size = vp9_read(r, tx_probs[0]); - if (tx_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) { + if (tx_size != TX_4X4 && bsize >= BLOCK_16X16) { tx_size += vp9_read(r, tx_probs[1]); - if (tx_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32) + if (tx_size != TX_8X8 && bsize >= BLOCK_32X32) tx_size += vp9_read(r, tx_probs[2]); } @@ -54,18 +58,18 @@ static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, } static TX_SIZE read_tx_size(VP9D_COMP *pbi, TX_MODE tx_mode, - BLOCK_SIZE_TYPE bsize, int select_cond, + BLOCK_SIZE_TYPE bsize, int allow_select, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - if (tx_mode == TX_MODE_SELECT && bsize >= BLOCK_SIZE_SB8X8 && select_cond) + if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) return read_selected_tx_size(cm, xd, bsize, r); - else if (tx_mode >= ALLOW_32X32 && bsize >= BLOCK_SIZE_SB32X32) + else if (tx_mode >= ALLOW_32X32 && bsize >= BLOCK_32X32) return TX_32X32; - else if (tx_mode >= ALLOW_16X16 && bsize >= BLOCK_SIZE_MB16X16) + else if (tx_mode >= ALLOW_16X16 && bsize >= BLOCK_16X16) return TX_16X16; - else if (tx_mode >= ALLOW_8X8 && bsize >= BLOCK_SIZE_SB8X8) + else if (tx_mode >= ALLOW_8X8 && bsize >= BLOCK_8X8) return TX_8X8; else return TX_4X4; @@ -146,8 +150,8 @@ static uint8_t read_skip_coeff(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { return skip_coeff; } -static void read_intra_mode_info(VP9D_COMP *pbi, MODE_INFO *m, - int mi_row, int mi_col, vp9_reader *r) { +static void read_intra_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *m, + int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; MB_MODE_INFO *const mbmi = &m->mbmi; @@ -158,6 +162,7 @@ static void read_intra_mode_info(VP9D_COMP *pbi, MODE_INFO *m, mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize, 1, r); mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE; if (bsize >= BLOCK_SIZE_SB8X8) { const MB_PREDICTION_MODE A = above_block_mode(m, 0, mis); @@ -166,12 +171,12 @@ static void read_intra_mode_info(VP9D_COMP *pbi, MODE_INFO *m, mbmi->mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]); } else { // Only 4x4, 4x8, 8x4 blocks - const int bw = 1 << b_width_log2(bsize); - const int bh = 1 << b_height_log2(bsize); + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 int idx, idy; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { const int ib = idy * 2 + idx; const MB_PREDICTION_MODE A = above_block_mode(m, ib, mis); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? 
@@ -179,9 +184,9 @@ static void read_intra_mode_info(VP9D_COMP *pbi, MODE_INFO *m, const MB_PREDICTION_MODE b_mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]); m->bmi[ib].as_mode = b_mode; - if (bh == 2) + if (num_4x4_h == 2) m->bmi[ib + 2].as_mode = b_mode; - if (bw == 2) + if (num_4x4_w == 2) m->bmi[ib + 1].as_mode = b_mode; } } @@ -228,16 +233,16 @@ static int read_mv_component(vp9_reader *r, static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, const nmv_context *ctx, - nmv_context_counts *counts, int usehp) { + nmv_context_counts *counts, int allow_hp) { const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints); + const int use_hp = allow_hp && vp9_use_mv_hp(ref); MV diff = {0, 0}; - usehp = usehp && vp9_use_mv_hp(ref); if (mv_joint_vertical(j)) - diff.row = read_mv_component(r, &ctx->comps[0], usehp); + diff.row = read_mv_component(r, &ctx->comps[0], use_hp); if (mv_joint_horizontal(j)) - diff.col = read_mv_component(r, &ctx->comps[1], usehp); + diff.col = read_mv_component(r, &ctx->comps[1], use_hp); vp9_inc_mv(&diff, counts); @@ -245,29 +250,30 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref, mv->col = ref->col + diff.col; } -static void update_mv(vp9_reader *r, vp9_prob *p, vp9_prob upd_p) { - if (vp9_read(r, upd_p)) +static void update_mv(vp9_reader *r, vp9_prob *p) { + if (vp9_read(r, VP9_NMV_UPDATE_PROB)) *p = (vp9_read_literal(r, 7) << 1) | 1; } -static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int usehp) { +static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int allow_hp) { int i, j, k; for (j = 0; j < MV_JOINTS - 1; ++j) - update_mv(r, &mvc->joints[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &mvc->joints[j]); for (i = 0; i < 2; ++i) { nmv_component *const comp = &mvc->comps[i]; - update_mv(r, &comp->sign, VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->sign); + for (j = 0; j < MV_CLASSES - 1; ++j) - update_mv(r, &comp->classes[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->classes[j]); for (j = 0; j < CLASS0_SIZE - 1; ++j) - update_mv(r, &comp->class0[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->class0[j]); for (j = 0; j < MV_OFFSET_BITS; ++j) - update_mv(r, &comp->bits[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->bits[j]); } for (i = 0; i < 2; ++i) { @@ -275,23 +281,23 @@ static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int usehp) { for (j = 0; j < CLASS0_SIZE; ++j) for (k = 0; k < 3; ++k) - update_mv(r, &comp->class0_fp[j][k], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->class0_fp[j][k]); for (j = 0; j < 3; ++j) - update_mv(r, &comp->fp[j], VP9_NMV_UPDATE_PROB); + update_mv(r, &comp->fp[j]); } - if (usehp) { + if (allow_hp) { for (i = 0; i < 2; ++i) { - update_mv(r, &mvc->comps[i].class0_hp, VP9_NMV_UPDATE_PROB); - update_mv(r, &mvc->comps[i].hp, VP9_NMV_UPDATE_PROB); + update_mv(r, &mvc->comps[i].class0_hp); + update_mv(r, &mvc->comps[i].hp); } } } // Read the referncence frame -static void read_ref_frame(VP9D_COMP *pbi, vp9_reader *r, - int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { +static void read_ref_frames(VP9D_COMP *pbi, vp9_reader *r, + int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; FRAME_CONTEXT *const fc = &cm->fc; @@ -320,18 +326,19 @@ static void read_ref_frame(VP9D_COMP *pbi, vp9_reader *r, ref_frame[fix_ref_idx] = cm->comp_fixed_ref; ref_frame[!fix_ref_idx] = cm->comp_var_ref[b]; } else { - const int ref1_ctx = vp9_get_pred_context_single_ref_p1(xd); - ref_frame[1] = NONE; - if (vp9_read(r, 
fc->single_ref_prob[ref1_ctx][0])) { - const int ref2_ctx = vp9_get_pred_context_single_ref_p2(xd); - const int b = vp9_read(r, fc->single_ref_prob[ref2_ctx][1]); - ref_frame[0] = b ? ALTREF_FRAME : GOLDEN_FRAME; - counts->single_ref[ref1_ctx][0][1]++; - counts->single_ref[ref2_ctx][1][b]++; + const int ctx0 = vp9_get_pred_context_single_ref_p1(xd); + const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]); + ++counts->single_ref[ctx0][0][bit0]; + if (bit0) { + const int ctx1 = vp9_get_pred_context_single_ref_p2(xd); + const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]); + ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME; + ++counts->single_ref[ctx1][1][bit1]; } else { ref_frame[0] = LAST_FRAME; - counts->single_ref[ref1_ctx][0][0]++; } + + ref_frame[1] = NONE; } } } @@ -359,16 +366,6 @@ static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) { return mode; } -static INLINE void assign_and_clamp_mv(int_mv *dst, const int_mv *src, - int mb_to_left_edge, - int mb_to_right_edge, - int mb_to_top_edge, - int mb_to_bottom_edge) { - dst->as_int = src->as_int; - clamp_mv(dst, mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, - mb_to_bottom_edge); -} - static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( VP9D_COMP *pbi, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; @@ -380,32 +377,35 @@ static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( return vp9_switchable_interp[index]; } -static void read_intra_block_modes(VP9D_COMP *pbi, MODE_INFO *mi, - vp9_reader *r) { +static void read_intra_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, + vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MB_MODE_INFO *const mbmi = &mi->mbmi; const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; - const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); + + mbmi->ref_frame[0] = INTRA_FRAME; + mbmi->ref_frame[1] = NONE; if (bsize >= BLOCK_SIZE_SB8X8) { - const int size_group = MIN(3, MIN(bwl, bhl)); + const int size_group = size_group_lookup[bsize]; mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[size_group]); cm->counts.y_mode[size_group][mbmi->mode]++; } else { // Only 4x4, 4x8, 8x4 blocks - const int bw = 1 << bwl, bh = 1 << bhl; + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 int idx, idy; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { const int ib = idy * 2 + idx; const int b_mode = read_intra_mode(r, cm->fc.y_mode_prob[0]); mi->bmi[ib].as_mode = b_mode; cm->counts.y_mode[0][b_mode]++; - if (bh == 2) + if (num_4x4_h == 2) mi->bmi[ib + 2].as_mode = b_mode; - if (bw == 2) + if (num_4x4_w == 2) mi->bmi[ib + 1].as_mode = b_mode; } } @@ -416,203 +416,197 @@ static void read_intra_block_modes(VP9D_COMP *pbi, MODE_INFO *mi, cm->counts.uv_mode[mbmi->mode][mbmi->uv_mode]++; } -static MV_REFERENCE_FRAME read_reference_frame(VP9D_COMP *pbi, int segment_id, - vp9_reader *r) { +static int read_is_inter_block(VP9D_COMP *pbi, int segment_id, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - MV_REFERENCE_FRAME ref; - if (!vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) { - const int ctx = vp9_get_pred_context_intra_inter(xd); - ref = (MV_REFERENCE_FRAME) - vp9_read(r, vp9_get_pred_prob_intra_inter(cm, xd)); - cm->counts.intra_inter[ctx][ref != INTRA_FRAME]++; + if (vp9_segfeature_active(&xd->seg, 
segment_id, SEG_LVL_REF_FRAME)) { + return vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) != + INTRA_FRAME; } else { - ref = (MV_REFERENCE_FRAME) vp9_get_segdata(&xd->seg, segment_id, - SEG_LVL_REF_FRAME) != INTRA_FRAME; + const int ctx = vp9_get_pred_context_intra_inter(xd); + const int is_inter = vp9_read(r, vp9_get_pred_prob_intra_inter(cm, xd)); + ++cm->counts.intra_inter[ctx][is_inter]; + return is_inter; } - return ref; } -static void read_inter_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, - int mi_row, int mi_col, vp9_reader *r) { +static void read_inter_block_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, + int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; nmv_context *const nmvc = &cm->fc.nmvc; MB_MODE_INFO *const mbmi = &mi->mbmi; - int_mv *const mv0 = &mbmi->mv[0]; int_mv *const mv1 = &mbmi->mv[1]; - const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; - const int bw = 1 << b_width_log2(bsize); - const int bh = 1 << b_height_log2(bsize); - - int idx, idy; + const BLOCK_SIZE_TYPE bsize = mbmi->sb_type; + const int allow_hp = xd->allow_high_precision_mv; - mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r); - mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); - mbmi->ref_frame[0] = read_reference_frame(pbi, mbmi->segment_id, r); - mbmi->ref_frame[1] = NONE; - mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, bsize, - (!mbmi->mb_skip_coeff || mbmi->ref_frame[0] == INTRA_FRAME), r); + int_mv nearest, nearby, best_mv; + int_mv nearest_second, nearby_second, best_mv_second; + uint8_t inter_mode_ctx; + MV_REFERENCE_FRAME ref0, ref1; - if (mbmi->ref_frame[0] != INTRA_FRAME) { - int_mv nearest, nearby, best_mv; - int_mv nearest_second, nearby_second, best_mv_second; - vp9_prob *mv_ref_p; - MV_REFERENCE_FRAME ref0, ref1; + read_ref_frames(pbi, r, mbmi->segment_id, mbmi->ref_frame); + ref0 = mbmi->ref_frame[0]; + ref1 = mbmi->ref_frame[1]; - read_ref_frame(pbi, r, mbmi->segment_id, mbmi->ref_frame); - ref0 = mbmi->ref_frame[0]; - ref1 = mbmi->ref_frame[1]; + vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, + ref0, mbmi->ref_mvs[ref0], cm->ref_frame_sign_bias, + mi_row, mi_col); - vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, - ref0, mbmi->ref_mvs[ref0], cm->ref_frame_sign_bias); + inter_mode_ctx = mbmi->mb_mode_context[ref0]; - mv_ref_p = cm->fc.inter_mode_probs[mbmi->mb_mode_context[ref0]]; + if (vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) + mbmi->mode = ZEROMV; + else if (bsize >= BLOCK_SIZE_SB8X8) + mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx); - if (vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - mbmi->mode = ZEROMV; - } else if (bsize >= BLOCK_SIZE_SB8X8) { - mbmi->mode = read_inter_mode(r, mv_ref_p); - vp9_accum_mv_refs(cm, mbmi->mode, mbmi->mb_mode_context[ref0]); - } - mbmi->uv_mode = DC_PRED; + mbmi->uv_mode = DC_PRED; - // nearest, nearby - if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { - vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby); - best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int; - } + // nearest, nearby + if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { + vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref0], &nearest, &nearby); + best_mv.as_int = mbmi->ref_mvs[ref0][0].as_int; + } - mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE - ? read_switchable_filter_type(pbi, r) - : cm->mcomp_filter_type; + mbmi->interp_filter = cm->mcomp_filter_type == SWITCHABLE + ? 
read_switchable_filter_type(pbi, r) + : cm->mcomp_filter_type; - if (ref1 > INTRA_FRAME) { - vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, - ref1, mbmi->ref_mvs[ref1], cm->ref_frame_sign_bias); + if (ref1 > INTRA_FRAME) { + vp9_find_mv_refs(cm, xd, mi, xd->prev_mode_info_context, + ref1, mbmi->ref_mvs[ref1], cm->ref_frame_sign_bias, + mi_row, mi_col); - if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { - vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1], - &nearest_second, &nearby_second); - best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int; - } + if (bsize < BLOCK_SIZE_SB8X8 || mbmi->mode != ZEROMV) { + vp9_find_best_ref_mvs(xd, mbmi->ref_mvs[ref1], + &nearest_second, &nearby_second); + best_mv_second.as_int = mbmi->ref_mvs[ref1][0].as_int; } + } + if (bsize < BLOCK_SIZE_SB8X8) { + const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize]; // 1 or 2 + const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 + int idx, idy; + for (idy = 0; idy < 2; idy += num_4x4_h) { + for (idx = 0; idx < 2; idx += num_4x4_w) { + int_mv blockmv, secondmv; + const int j = idy * 2 + idx; + const int b_mode = read_inter_mode(cm, r, inter_mode_ctx); - if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { - int_mv blockmv, secondmv; - const int j = idy * 2 + idx; - const int blockmode = read_inter_mode(r, mv_ref_p); + if (b_mode == NEARESTMV || b_mode == NEARMV) { + vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0, + mi_row, mi_col); - vp9_accum_mv_refs(cm, blockmode, mbmi->mb_mode_context[ref0]); - if (blockmode == NEARESTMV || blockmode == NEARMV) { - vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest, &nearby, j, 0); - if (ref1 > 0) - vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest_second, - &nearby_second, j, 1); - } - - switch (blockmode) { - case NEWMV: - read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc, - &cm->counts.mv, xd->allow_high_precision_mv); - - if (ref1 > 0) - read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc, - &cm->counts.mv, xd->allow_high_precision_mv); - break; - case NEARESTMV: - blockmv.as_int = nearest.as_int; - if (ref1 > 0) - secondmv.as_int = nearest_second.as_int; - break; - case NEARMV: - blockmv.as_int = nearby.as_int; - if (ref1 > 0) - secondmv.as_int = nearby_second.as_int; - break; - case ZEROMV: - blockmv.as_int = 0; - if (ref1 > 0) - secondmv.as_int = 0; - break; - default: - assert(!"Invalid inter mode value"); - } - mi->bmi[j].as_mv[0].as_int = blockmv.as_int; if (ref1 > 0) - mi->bmi[j].as_mv[1].as_int = secondmv.as_int; - - if (bh == 2) - mi->bmi[j + 2] = mi->bmi[j]; - if (bw == 2) - mi->bmi[j + 1] = mi->bmi[j]; - mi->mbmi.mode = blockmode; + vp9_append_sub8x8_mvs_for_idx(cm, xd, &nearest_second, + &nearby_second, j, 1, + mi_row, mi_col); } - } - mv0->as_int = mi->bmi[3].as_mv[0].as_int; - mv1->as_int = mi->bmi[3].as_mv[1].as_int; - } else { - const int mb_to_top_edge = xd->mb_to_top_edge - LEFT_TOP_MARGIN; - const int mb_to_bottom_edge = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN; - const int mb_to_left_edge = xd->mb_to_left_edge - LEFT_TOP_MARGIN; - const int mb_to_right_edge = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN; - - switch (mbmi->mode) { - case NEARMV: - // Clip "next_nearest" so that it does not extend to far out of image - assign_and_clamp_mv(mv0, &nearby, mb_to_left_edge, - mb_to_right_edge, - mb_to_top_edge, - mb_to_bottom_edge); - if (ref1 > 0) - assign_and_clamp_mv(mv1, &nearby_second, mb_to_left_edge, - mb_to_right_edge, - mb_to_top_edge, - 
mb_to_bottom_edge); - break; - - case NEARESTMV: - // Clip "next_nearest" so that it does not extend to far out of image - assign_and_clamp_mv(mv0, &nearest, mb_to_left_edge, - mb_to_right_edge, - mb_to_top_edge, - mb_to_bottom_edge); - if (ref1 > 0) - assign_and_clamp_mv(mv1, &nearest_second, mb_to_left_edge, - mb_to_right_edge, - mb_to_top_edge, - mb_to_bottom_edge); - break; - - case ZEROMV: - mv0->as_int = 0; - if (ref1 > 0) - mv1->as_int = 0; - break; + switch (b_mode) { + case NEWMV: + read_mv(r, &blockmv.as_mv, &best_mv.as_mv, nmvc, + &cm->counts.mv, allow_hp); - case NEWMV: - read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, - xd->allow_high_precision_mv); - if (ref1 > 0) - read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, - &cm->counts.mv, xd->allow_high_precision_mv); - break; - default: - assert(!"Invalid inter mode value"); + if (ref1 > 0) + read_mv(r, &secondmv.as_mv, &best_mv_second.as_mv, nmvc, + &cm->counts.mv, allow_hp); + break; + case NEARESTMV: + blockmv.as_int = nearest.as_int; + if (ref1 > 0) + secondmv.as_int = nearest_second.as_int; + break; + case NEARMV: + blockmv.as_int = nearby.as_int; + if (ref1 > 0) + secondmv.as_int = nearby_second.as_int; + break; + case ZEROMV: + blockmv.as_int = 0; + if (ref1 > 0) + secondmv.as_int = 0; + break; + default: + assert(!"Invalid inter mode value"); + } + mi->bmi[j].as_mv[0].as_int = blockmv.as_int; + if (ref1 > 0) + mi->bmi[j].as_mv[1].as_int = secondmv.as_int; + + if (num_4x4_h == 2) + mi->bmi[j + 2] = mi->bmi[j]; + if (num_4x4_w == 2) + mi->bmi[j + 1] = mi->bmi[j]; + mi->mbmi.mode = b_mode; } } + + mv0->as_int = mi->bmi[3].as_mv[0].as_int; + mv1->as_int = mi->bmi[3].as_mv[1].as_int; } else { - mv0->as_int = 0; // required for left and above block mv - read_intra_block_modes(pbi, mi, r); + switch (mbmi->mode) { + case NEARMV: + mv0->as_int = nearby.as_int; + clamp_mv2(&mv0->as_mv, xd); + + if (ref1 > 0) { + mv1->as_int = nearby_second.as_int; + clamp_mv2(&mv1->as_mv, xd); + } + break; + + case NEARESTMV: + mv0->as_int = nearest.as_int; + clamp_mv2(&mv0->as_mv, xd); + + if (ref1 > 0) { + mv1->as_int = nearest_second.as_int; + clamp_mv2(&mv1->as_mv, xd); + } + break; + + case ZEROMV: + mv0->as_int = 0; + if (ref1 > 0) + mv1->as_int = 0; + break; + + case NEWMV: + read_mv(r, &mv0->as_mv, &best_mv.as_mv, nmvc, &cm->counts.mv, allow_hp); + if (ref1 > 0) + read_mv(r, &mv1->as_mv, &best_mv_second.as_mv, nmvc, &cm->counts.mv, + allow_hp); + break; + default: + assert(!"Invalid inter mode value"); + } } } +static void read_inter_frame_mode_info(VP9D_COMP *pbi, MODE_INFO *mi, + int mi_row, int mi_col, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MB_MODE_INFO *const mbmi = &mi->mbmi; + int inter_block; + + mbmi->mv[0].as_int = 0; + mbmi->mv[1].as_int = 0; + mbmi->segment_id = read_inter_segment_id(pbi, mi_row, mi_col, r); + mbmi->mb_skip_coeff = read_skip_coeff(pbi, mbmi->segment_id, r); + inter_block = read_is_inter_block(pbi, mbmi->segment_id, r); + mbmi->txfm_size = read_tx_size(pbi, cm->tx_mode, mbmi->sb_type, + !mbmi->mb_skip_coeff || !inter_block, r); + + if (inter_block) + read_inter_block_mode_info(pbi, mi, mi_row, mi_col, r); + else + read_intra_block_mode_info(pbi, mi, r); +} + static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) { int i; @@ -690,9 +684,9 @@ void vp9_read_mode_info(VP9D_COMP* pbi, int mi_row, int mi_col, vp9_reader *r) { int x, y; if (cm->frame_type == KEY_FRAME || cm->intra_only) - read_intra_mode_info(pbi, mi, mi_row, mi_col, r); + read_intra_frame_mode_info(pbi, mi, mi_row, 
mi_col, r); else - read_inter_mode_info(pbi, mi, mi_row, mi_col, r); + read_inter_frame_mode_info(pbi, mi, mi_row, mi_col, r); for (y = 0; y < y_mis; y++) for (x = !y; x < x_mis; x++) diff --git a/libvpx/vp9/decoder/vp9_decodemv.h b/libvpx/vp9/decoder/vp9_decodemv.h index 4073d9e04..462d2e398 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.h +++ b/libvpx/vp9/decoder/vp9_decodemv.h @@ -12,6 +12,7 @@ #define VP9_DECODER_VP9_DECODEMV_H_ #include "vp9/decoder/vp9_onyxd_int.h" +#include "vp9/decoder/vp9_dboolhuff.h" void vp9_prepare_read_mode_info(VP9D_COMP* pbi, vp9_reader *r); diff --git a/libvpx/vp9/decoder/vp9_decodframe.c b/libvpx/vp9/decoder/vp9_decodframe.c index ffec8ea44..feb602402 100644 --- a/libvpx/vp9/decoder/vp9_decodframe.c +++ b/libvpx/vp9/decoder/vp9_decodframe.c @@ -31,8 +31,11 @@ #include "vp9/decoder/vp9_detokenize.h" #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_dsubexp.h" +#include "vp9/decoder/vp9_idct_blk.h" #include "vp9/decoder/vp9_onyxd_int.h" #include "vp9/decoder/vp9_read_bit_buffer.h" +#include "vp9/decoder/vp9_thread.h" +#include "vp9/decoder/vp9_treereader.h" static int read_be32(const uint8_t *p) { return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]; @@ -59,17 +62,17 @@ static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) { int i, j; for (i = 0; i < TX_SIZE_CONTEXTS; ++i) - for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) + for (j = 0; j < TX_SIZES - 3; ++j) if (vp9_read(r, VP9_MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &tx_probs->p8x8[i][j]); for (i = 0; i < TX_SIZE_CONTEXTS; ++i) - for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) + for (j = 0; j < TX_SIZES - 2; ++j) if (vp9_read(r, VP9_MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &tx_probs->p16x16[i][j]); for (i = 0; i < TX_SIZE_CONTEXTS; ++i) - for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) + for (j = 0; j < TX_SIZES - 1; ++j) if (vp9_read(r, VP9_MODE_UPDATE_PROB)) vp9_diff_update_prob(r, &tx_probs->p32x32[i][j]); } @@ -138,8 +141,8 @@ static void decode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, const int mode = plane == 0 ? 
mi->mbmi.mode : mi->mbmi.uv_mode; - if (plane == 0 && mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) { - assert(bsize == BLOCK_SIZE_SB8X8); + if (plane == 0 && mi->mbmi.sb_type < BLOCK_8X8) { + assert(bsize == BLOCK_8X8); b_mode = mi->bmi[raster_block].as_mode; } else { b_mode = mode; @@ -223,7 +226,7 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, vp9_reader *r, BLOCK_SIZE_TYPE bsize) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - const int less8x8 = bsize < BLOCK_SIZE_SB8X8; + const int less8x8 = bsize < BLOCK_8X8; MB_MODE_INFO *mbmi; if (less8x8) @@ -234,12 +237,12 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, vp9_read_mode_info(pbi, mi_row, mi_col, r); if (less8x8) - bsize = BLOCK_SIZE_SB8X8; + bsize = BLOCK_8X8; // Has to be called after set_offsets mbmi = &xd->mode_info_context->mbmi; - if (mbmi->ref_frame[0] == INTRA_FRAME) { + if (!is_inter_block(mbmi)) { // Intra reconstruction decode_tokens(pbi, bsize, r); foreach_transformed_block(xd, bsize, decode_block_intra, xd); @@ -280,12 +283,12 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, if (mi_row >= pc->mi_rows || mi_col >= pc->mi_cols) return; - if (bsize < BLOCK_SIZE_SB8X8) { + if (bsize < BLOCK_8X8) { if (xd->ab_index != 0) return; } else { int pl; - const int idx = check_bsize_coverage(pc, xd, mi_row, mi_col, bsize); + const int idx = check_bsize_coverage(pc, mi_row, mi_col, bsize); set_partition_seg_context(pc, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); @@ -332,8 +335,8 @@ static void decode_modes_sb(VP9D_COMP *pbi, int mi_row, int mi_col, } // update partition context - if (bsize >= BLOCK_SIZE_SB8X8 && - (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) { + if (bsize >= BLOCK_8X8 && + (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT)) { set_partition_seg_context(pc, xd, mi_row, mi_col); update_partition_context(xd, subsize, bsize); } @@ -499,7 +502,7 @@ static INTERPOLATIONFILTERTYPE read_interp_filter_type( : vp9_rb_read_literal(rb, 2); } -static void read_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb, +static void read_frame_size(struct vp9_read_bit_buffer *rb, int *width, int *height) { const int w = vp9_rb_read_literal(rb, 16) + 1; const int h = vp9_rb_read_literal(rb, 16) + 1; @@ -507,12 +510,11 @@ static void read_frame_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb, *height = h; } -static void setup_display_size(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { - VP9_COMMON *const cm = &pbi->common; +static void setup_display_size(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) { cm->display_width = cm->width; cm->display_height = cm->height; if (vp9_rb_read_bit(rb)) - read_frame_size(cm, rb, &cm->display_width, &cm->display_height); + read_frame_size(rb, &cm->display_width, &cm->display_height); } static void apply_frame_size(VP9D_COMP *pbi, int width, int height) { @@ -548,10 +550,9 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) { static void setup_frame_size(VP9D_COMP *pbi, struct vp9_read_bit_buffer *rb) { - VP9_COMMON *const cm = &pbi->common; int width, height; - read_frame_size(cm, rb, &width, &height); - setup_display_size(pbi, rb); + read_frame_size(rb, &width, &height); + setup_display_size(&pbi->common, rb); apply_frame_size(pbi, width, height); } @@ -572,21 +573,29 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi, } if (!found) - read_frame_size(cm, rb, &width, &height); + read_frame_size(rb, &width, &height); if (!width || !height) 
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Referenced frame with invalid size"); - setup_display_size(pbi, rb); + setup_display_size(cm, rb); apply_frame_size(pbi, width, height); } static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) { + const int num_threads = pbi->oxcf.max_threads; VP9_COMMON *const pc = &pbi->common; int mi_row, mi_col; if (pbi->do_loopfilter_inline) { + if (num_threads > 1) { + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + lf_data->frame_buffer = &pbi->common.yv12_fb[pbi->common.new_fb_idx]; + lf_data->cm = pc; + lf_data->xd = pbi->mb; + lf_data->y_only = 0; + } vp9_loop_filter_frame_init(pc, &pbi->mb, pbi->mb.lf.filter_level); } @@ -597,21 +606,37 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) { vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context)); for (mi_col = pc->cur_tile_mi_col_start; mi_col < pc->cur_tile_mi_col_end; mi_col += MI_BLOCK_SIZE) { - decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64); + decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_64X64); } if (pbi->do_loopfilter_inline) { - YV12_BUFFER_CONFIG *const fb = - &pbi->common.yv12_fb[pbi->common.new_fb_idx]; // delay the loopfilter by 1 macroblock row. const int lf_start = mi_row - MI_BLOCK_SIZE; if (lf_start < 0) continue; - vp9_loop_filter_rows(fb, pc, &pbi->mb, lf_start, mi_row, 0); + + if (num_threads > 1) { + LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1; + + vp9_worker_sync(&pbi->lf_worker); + lf_data->start = lf_start; + lf_data->stop = mi_row; + pbi->lf_worker.hook = vp9_loop_filter_worker; + vp9_worker_launch(&pbi->lf_worker); + } else { + YV12_BUFFER_CONFIG *const fb = + &pbi->common.yv12_fb[pbi->common.new_fb_idx]; + vp9_loop_filter_rows(fb, pc, &pbi->mb, lf_start, mi_row, 0); + } } } if (pbi->do_loopfilter_inline) { YV12_BUFFER_CONFIG *const fb = &pbi->common.yv12_fb[pbi->common.new_fb_idx]; + if (num_threads > 1) { + // TODO(jzern): since the loop filter is delayed one mb row, this will be + // forced to wait for the last row scheduled in the for loop. 
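The inline loop-filter path in decode_tile() above stays one superblock row behind decoding, since the horizontal edge between two superblock rows can only be filtered once both rows are reconstructed. A simplified view of that schedule, with decode_sb_row() and filter_rows() as hypothetical placeholders rather than functions from this change:

    for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
      decode_sb_row(mi_row);                            /* reconstruct one SB row   */
      if (mi_row - MI_BLOCK_SIZE >= 0)
        filter_rows(mi_row - MI_BLOCK_SIZE, mi_row);    /* filter the previous row  */
    }
    filter_rows(mi_row - MI_BLOCK_SIZE, total_mi_rows); /* filter the delayed tail  */

When max_threads > 1 the per-row filter_rows() step is handed to lf_worker with vp9_worker_launch() instead of running inline, which is why the tail below first needs the vp9_worker_sync() that the TODO refers to.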
+ vp9_worker_sync(&pbi->lf_worker); + } vp9_loop_filter_rows(fb, pc, &pbi->mb, mi_row - MI_BLOCK_SIZE, pc->mi_rows, 0); } @@ -994,7 +1019,6 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { if (!keyframe && !pc->intra_only) { vp9_adapt_mode_probs(pc); - vp9_adapt_mode_context(pc); vp9_adapt_mv_probs(pc, xd->allow_high_precision_mv); } } diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c index 01c1db0b7..002164307 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.c +++ b/libvpx/vp9/decoder/vp9_detokenize.c @@ -15,8 +15,10 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_seg_common.h" +#include "vp9/decoder/vp9_dboolhuff.h" #include "vp9/decoder/vp9_detokenize.h" #include "vp9/decoder/vp9_onyxd_int.h" +#include "vp9/decoder/vp9_treereader.h" #define EOB_CONTEXT_NODE 0 #define ZERO_CONTEXT_NODE 1 @@ -73,7 +75,7 @@ DECLARE_ALIGNED(16, extern const uint8_t, #define WRITE_COEF_CONTINUE(val, token) \ { \ qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \ - dq[c > 0] / (1 + (txfm_size == TX_32X32)); \ + dq[c > 0] / (1 + (tx_size == TX_32X32)); \ INCREMENT_COUNT(token); \ c++; \ continue; \ @@ -88,33 +90,24 @@ DECLARE_ALIGNED(16, extern const uint8_t, static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, vp9_reader *r, int block_idx, PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr, - TX_SIZE txfm_size, const int16_t *dq, + TX_SIZE tx_size, const int16_t *dq, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; ENTROPY_CONTEXT above_ec, left_ec; - int pt, c = 0; - int band; - vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES]; + const int ref = is_inter_block(&xd->mode_info_context->mbmi); + int band, pt, c = 0; + vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] = + fc->coef_probs[tx_size][type][ref]; vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; - uint8_t load_map[COEF_BANDS][PREV_COEF_CONTEXTS] = { - {0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0}, - {0, 0, 0, 0, 0, 0}, - }; - + uint8_t load_map[COEF_BANDS][PREV_COEF_CONTEXTS] = { { 0 } }; vp9_prob *prob; - vp9_coeff_count_model *coef_counts; - const int ref = xd->mode_info_context->mbmi.ref_frame[0] != INTRA_FRAME; + vp9_coeff_count_model *coef_counts = counts->coef[tx_size]; const int16_t *scan, *nb; uint8_t token_cache[1024]; const uint8_t * band_translate; - coef_probs = fc->coef_probs[txfm_size][type][ref]; - coef_counts = counts->coef[txfm_size]; - switch (txfm_size) { + + switch (tx_size) { default: case TX_4X4: { scan = get_scan_4x4(get_tx_type_4x4(type, xd, block_idx)); @@ -125,22 +118,22 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, } case TX_8X8: { scan = get_scan_8x8(get_tx_type_8x8(type, xd)); - above_ec = (A[0] + A[1]) != 0; - left_ec = (L[0] + L[1]) != 0; + above_ec = !!*(uint16_t *)A; + left_ec = !!*(uint16_t *)L; band_translate = vp9_coefband_trans_8x8plus; break; } case TX_16X16: { scan = get_scan_16x16(get_tx_type_16x16(type, xd)); - above_ec = (A[0] + A[1] + A[2] + A[3]) != 0; - left_ec = (L[0] + L[1] + L[2] + L[3]) != 0; + above_ec = !!*(uint32_t *)A; + left_ec = !!*(uint32_t *)L; band_translate = vp9_coefband_trans_8x8plus; break; } case TX_32X32: scan = vp9_default_scan_32x32; - above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0; - left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0; + above_ec = 
!!*(uint64_t *)A; + left_ec = !!*(uint64_t *)L; band_translate = vp9_coefband_trans_8x8plus; break; } @@ -157,7 +150,7 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, pt = get_coef_context(nb, token_cache, c); band = get_coef_band(band_translate, c); prob = coef_probs[band][pt]; - counts->eob_branch[txfm_size][type][ref][band][pt]++; + counts->eob_branch[tx_size][type][ref][band][pt]++; if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) break; @@ -276,7 +269,7 @@ static void decode_block(int plane, int block, const int mod = bw - ss_tx_size - pd->subsampling_x; const int aoff = (off & ((1 << mod) - 1)) << ss_tx_size; const int loff = (off >> mod) << ss_tx_size; - + const int tx_size_in_blocks = 1 << ss_tx_size; ENTROPY_CONTEXT *A = pd->above_context + aoff; ENTROPY_CONTEXT *L = pd->left_context + loff; const int eob = decode_coefs(&arg->pbi->common, xd, arg->r, block, @@ -285,10 +278,11 @@ static void decode_block(int plane, int block, ss_tx_size, pd->dequant, A, L); if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - set_contexts_on_border(xd, bsize, plane, ss_tx_size, eob, aoff, loff, A, L); + set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, eob, aoff, loff, + A, L); } else { int pt; - for (pt = 0; pt < (1 << ss_tx_size); pt++) + for (pt = 0; pt < tx_size_in_blocks; pt++) A[pt] = L[pt] = eob > 0; } pd->eobs[block] = eob; diff --git a/libvpx/vp9/decoder/vp9_detokenize.h b/libvpx/vp9/decoder/vp9_detokenize.h index d46b59635..f98fe8d4c 100644 --- a/libvpx/vp9/decoder/vp9_detokenize.h +++ b/libvpx/vp9/decoder/vp9_detokenize.h @@ -13,6 +13,7 @@ #define VP9_DECODER_VP9_DETOKENIZE_H_ #include "vp9/decoder/vp9_onyxd_int.h" +#include "vp9/decoder/vp9_dboolhuff.h" int vp9_decode_tokens(VP9D_COMP* pbi, vp9_reader *r, BLOCK_SIZE_TYPE bsize); diff --git a/libvpx/vp9/decoder/vp9_idct_blk.c b/libvpx/vp9/decoder/vp9_idct_blk.c index 0217919da..395e636b8 100644 --- a/libvpx/vp9/decoder/vp9_idct_blk.c +++ b/libvpx/vp9/decoder/vp9_idct_blk.c @@ -93,15 +93,11 @@ void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) { if (eob) { if (eob == 1) { // DC only DCT coefficient - int16_t in = input[0]; - int16_t out; - - // Note: the idct1 will need to be modified accordingly whenever - // vp9_short_idct8x8_c() is modified. - vp9_short_idct1_8x8_c(&in, &out); + vp9_short_idct8x8_1_add(input, dest, stride); input[0] = 0; - - vp9_add_constant_residual_8x8(out, dest, stride); + } else if (eob <= 10) { + vp9_short_idct10_8x8_add(input, dest, stride); + vpx_memset(input, 0, 128); } else { vp9_short_idct8x8_add(input, dest, stride); vpx_memset(input, 0, 128); @@ -127,14 +123,11 @@ void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) { if (eob) { if (eob == 1) { /* DC only DCT coefficient. */ - int16_t in = input[0]; - int16_t out; - /* Note: the idct1 will need to be modified accordingly whenever - * vp9_short_idct16x16() is modified. 
*/ - vp9_short_idct1_16x16_c(&in, &out); + vp9_short_idct16x16_1_add(input, dest, stride); input[0] = 0; - - vp9_add_constant_residual_16x16(out, dest, stride); + } else if (eob <= 10) { + vp9_short_idct10_16x16_add(input, dest, stride); + vpx_memset(input, 0, 512); } else { vp9_short_idct16x16_add(input, dest, stride); vpx_memset(input, 0, 512); diff --git a/libvpx/vp9/decoder/vp9_onyxd_if.c b/libvpx/vp9/decoder/vp9_onyxd_if.c index cb7292006..5a01dd790 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_if.c +++ b/libvpx/vp9/decoder/vp9_onyxd_if.c @@ -8,9 +8,9 @@ * be found in the AUTHORS file in the root of the source tree. */ - -#include <stdio.h> #include <assert.h> +#include <limits.h> +#include <stdio.h> #include "vp9/common/vp9_onyxc_int.h" #if CONFIG_POSTPROC @@ -114,7 +114,7 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { if (!pbi) return NULL; - vpx_memset(pbi, 0, sizeof(VP9D_COMP)); + vp9_zero(*pbi); if (setjmp(pbi->common.error.jmp)) { pbi->common.error.setjmp = 0; @@ -141,6 +141,16 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) { pbi->common.error.setjmp = 0; pbi->decoded_key_frame = 0; + if (pbi->oxcf.max_threads > 1) { + vp9_worker_init(&pbi->lf_worker); + pbi->lf_worker.data1 = vpx_malloc(sizeof(LFWorkerData)); + pbi->lf_worker.hook = (VP9WorkerHook)vp9_loop_filter_worker; + if (pbi->lf_worker.data1 == NULL || !vp9_worker_reset(&pbi->lf_worker)) { + vp9_remove_decompressor(pbi); + return NULL; + } + } + return pbi; } @@ -154,6 +164,8 @@ void vp9_remove_decompressor(VP9D_PTR ptr) { vpx_free(pbi->common.last_frame_seg_map); vp9_remove_common(&pbi->common); + vp9_worker_end(&pbi->lf_worker); + vpx_free(pbi->lf_worker.data1); vpx_free(pbi); } diff --git a/libvpx/vp9/decoder/vp9_onyxd_int.h b/libvpx/vp9/decoder/vp9_onyxd_int.h index 476006616..a051971a1 100644 --- a/libvpx/vp9/decoder/vp9_onyxd_int.h +++ b/libvpx/vp9/decoder/vp9_onyxd_int.h @@ -14,10 +14,8 @@ #include "./vpx_config.h" #include "vp9/common/vp9_onyxc_int.h" - -#include "vp9/decoder/vp9_idct_blk.h" #include "vp9/decoder/vp9_onyxd.h" -#include "vp9/decoder/vp9_treereader.h" +#include "vp9/decoder/vp9_thread.h" typedef struct VP9Decompressor { DECLARE_ALIGNED(16, MACROBLOCKD, mb); @@ -40,6 +38,7 @@ typedef struct VP9Decompressor { int initial_height; int do_loopfilter_inline; // apply loopfilter to available rows immediately + VP9Worker lf_worker; } VP9D_COMP; #endif // VP9_DECODER_VP9_TREEREADER_H_ diff --git a/libvpx/vp9/decoder/vp9_thread.c b/libvpx/vp9/decoder/vp9_thread.c new file mode 100644 index 000000000..dc3b68196 --- /dev/null +++ b/libvpx/vp9/decoder/vp9_thread.c @@ -0,0 +1,248 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. 
+// ----------------------------------------------------------------------------- +// +// Multi-threaded worker +// +// Original source: +// http://git.chromium.org/webm/libwebp.git +// 100644 blob eff8f2a8c20095aade3c292b0e9292dac6cb3587 src/utils/thread.c + + +#include <assert.h> +#include <string.h> // for memset() +#include "./vp9_thread.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#if CONFIG_MULTITHREAD + +#if defined(_WIN32) + +//------------------------------------------------------------------------------ +// simplistic pthread emulation layer + +#include <process.h> + +// _beginthreadex requires __stdcall +#define THREADFN unsigned int __stdcall +#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val) + +static int pthread_create(pthread_t* const thread, const void* attr, + unsigned int (__stdcall *start)(void*), void* arg) { + (void)attr; + *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ + 0, /* unsigned stack_size */ + start, + arg, + 0, /* unsigned initflag */ + NULL); /* unsigned *thrdaddr */ + if (*thread == NULL) return 1; + SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); + return 0; +} + +static int pthread_join(pthread_t thread, void** value_ptr) { + (void)value_ptr; + return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 || + CloseHandle(thread) == 0); +} + +// Mutex +static int pthread_mutex_init(pthread_mutex_t* const mutex, void* mutexattr) { + (void)mutexattr; + InitializeCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_lock(pthread_mutex_t* const mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_unlock(pthread_mutex_t* const mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_destroy(pthread_mutex_t* const mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +// Condition +static int pthread_cond_destroy(pthread_cond_t* const condition) { + int ok = 1; + ok &= (CloseHandle(condition->waiting_sem_) != 0); + ok &= (CloseHandle(condition->received_sem_) != 0); + ok &= (CloseHandle(condition->signal_event_) != 0); + return !ok; +} + +static int pthread_cond_init(pthread_cond_t* const condition, void* cond_attr) { + (void)cond_attr; + condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL); + condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL); + condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL); + if (condition->waiting_sem_ == NULL || + condition->received_sem_ == NULL || + condition->signal_event_ == NULL) { + pthread_cond_destroy(condition); + return 1; + } + return 0; +} + +static int pthread_cond_signal(pthread_cond_t* const condition) { + int ok = 1; + if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) { + // a thread is waiting in pthread_cond_wait: allow it to be notified + ok = SetEvent(condition->signal_event_); + // wait until the event is consumed so the signaler cannot consume + // the event via its own pthread_cond_wait. 
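These emulated primitives only need to support the mutex-protected predicate loop used by thread_loop() and change_state() further down; both semaphores are created with a maximum count of 1, so the emulation assumes at most one thread waits on a condition at a time. For reference, the waiting pattern they are written for (a sketch, with the mutex, condition and flag assumed to be initialized elsewhere):

    static void wait_until_ready(pthread_mutex_t *mutex, pthread_cond_t *cond,
                                 const int *ready) {
      pthread_mutex_lock(mutex);
      while (!*ready)                     /* re-check the predicate after wakeup  */
        pthread_cond_wait(cond, mutex);   /* releases the mutex while blocked,
                                             re-acquires it before returning      */
      pthread_mutex_unlock(mutex);
    }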
+ ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) != + WAIT_OBJECT_0); + } + return !ok; +} + +static int pthread_cond_wait(pthread_cond_t* const condition, + pthread_mutex_t* const mutex) { + int ok; + // note that there is a consumer available so the signal isn't dropped in + // pthread_cond_signal + if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) + return 1; + // now unlock the mutex so pthread_cond_signal may be issued + pthread_mutex_unlock(mutex); + ok = (WaitForSingleObject(condition->signal_event_, INFINITE) == + WAIT_OBJECT_0); + ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL); + pthread_mutex_lock(mutex); + return !ok; +} + +#else // _WIN32 +# define THREADFN void* +# define THREAD_RETURN(val) val +#endif + +//------------------------------------------------------------------------------ + +static THREADFN thread_loop(void *ptr) { // thread loop + VP9Worker* const worker = (VP9Worker*)ptr; + int done = 0; + while (!done) { + pthread_mutex_lock(&worker->mutex_); + while (worker->status_ == OK) { // wait in idling mode + pthread_cond_wait(&worker->condition_, &worker->mutex_); + } + if (worker->status_ == WORK) { + if (worker->hook) { + worker->had_error |= !worker->hook(worker->data1, worker->data2); + } + worker->status_ = OK; + } else if (worker->status_ == NOT_OK) { // finish the worker + done = 1; + } + // signal to the main thread that we're done (for Sync()) + pthread_cond_signal(&worker->condition_); + pthread_mutex_unlock(&worker->mutex_); + } + return THREAD_RETURN(NULL); // Thread is finished +} + +// main thread state control +static void change_state(VP9Worker* const worker, + VP9WorkerStatus new_status) { + // no-op when attempting to change state on a thread that didn't come up + if (worker->status_ < OK) return; + + pthread_mutex_lock(&worker->mutex_); + // wait for the worker to finish + while (worker->status_ != OK) { + pthread_cond_wait(&worker->condition_, &worker->mutex_); + } + // assign new status and release the working thread if needed + if (new_status != OK) { + worker->status_ = new_status; + pthread_cond_signal(&worker->condition_); + } + pthread_mutex_unlock(&worker->mutex_); +} + +#endif + +//------------------------------------------------------------------------------ + +void vp9_worker_init(VP9Worker* const worker) { + memset(worker, 0, sizeof(*worker)); + worker->status_ = NOT_OK; +} + +int vp9_worker_sync(VP9Worker* const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, OK); +#endif + assert(worker->status_ <= OK); + return !worker->had_error; +} + +int vp9_worker_reset(VP9Worker* const worker) { + int ok = 1; + worker->had_error = 0; + if (worker->status_ < OK) { +#if CONFIG_MULTITHREAD + if (pthread_mutex_init(&worker->mutex_, NULL) || + pthread_cond_init(&worker->condition_, NULL)) { + return 0; + } + pthread_mutex_lock(&worker->mutex_); + ok = !pthread_create(&worker->thread_, NULL, thread_loop, worker); + if (ok) worker->status_ = OK; + pthread_mutex_unlock(&worker->mutex_); +#else + worker->status_ = OK; +#endif + } else if (worker->status_ > OK) { + ok = vp9_worker_sync(worker); + } + assert(!ok || (worker->status_ == OK)); + return ok; +} + +void vp9_worker_launch(VP9Worker* const worker) { +#if CONFIG_MULTITHREAD + change_state(worker, WORK); +#else + if (worker->hook) + worker->had_error |= !worker->hook(worker->data1, worker->data2); +#endif +} + +void vp9_worker_end(VP9Worker* const worker) { + if (worker->status_ >= OK) { +#if CONFIG_MULTITHREAD + change_state(worker, NOT_OK); + 
pthread_join(worker->thread_, NULL); + pthread_mutex_destroy(&worker->mutex_); + pthread_cond_destroy(&worker->condition_); +#else + worker->status_ = NOT_OK; +#endif + } + assert(worker->status_ == NOT_OK); +} + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git a/libvpx/vp9/decoder/vp9_thread.h b/libvpx/vp9/decoder/vp9_thread.h new file mode 100644 index 000000000..a8f7e046a --- /dev/null +++ b/libvpx/vp9/decoder/vp9_thread.h @@ -0,0 +1,93 @@ +// Copyright 2013 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. +// ----------------------------------------------------------------------------- +// +// Multi-threaded worker +// +// Original source: +// http://git.chromium.org/webm/libwebp.git +// 100644 blob 13a61a4c84194c3374080cbf03d881d3cd6af40d src/utils/thread.h + + +#ifndef VP9_DECODER_VP9_THREAD_H_ +#define VP9_DECODER_VP9_THREAD_H_ + +#include "vpx_config.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#if CONFIG_MULTITHREAD + +#if defined(_WIN32) + +#include <windows.h> +typedef HANDLE pthread_t; +typedef CRITICAL_SECTION pthread_mutex_t; +typedef struct { + HANDLE waiting_sem_; + HANDLE received_sem_; + HANDLE signal_event_; +} pthread_cond_t; + +#else + +#include <pthread.h> + +#endif /* _WIN32 */ +#endif /* CONFIG_MULTITHREAD */ + +// State of the worker thread object +typedef enum { + NOT_OK = 0, // object is unusable + OK, // ready to work + WORK // busy finishing the current task +} VP9WorkerStatus; + +// Function to be called by the worker thread. Takes two opaque pointers as +// arguments (data1 and data2), and should return false in case of error. +typedef int (*VP9WorkerHook)(void*, void*); + +// Synchronize object used to launch job in the worker thread +typedef struct { +#if CONFIG_MULTITHREAD + pthread_mutex_t mutex_; + pthread_cond_t condition_; + pthread_t thread_; +#endif + VP9WorkerStatus status_; + VP9WorkerHook hook; // hook to call + void* data1; // first argument passed to 'hook' + void* data2; // second argument passed to 'hook' + int had_error; // return value of the last call to 'hook' +} VP9Worker; + +// Must be called first, before any other method. +void vp9_worker_init(VP9Worker* const worker); +// Must be called to initialize the object and spawn the thread. Re-entrant. +// Will potentially launch the thread. Returns false in case of error. +int vp9_worker_reset(VP9Worker* const worker); +// Makes sure the previous work is finished. Returns true if worker->had_error +// was not set and no error condition was triggered by the working thread. +int vp9_worker_sync(VP9Worker* const worker); +// Triggers the thread to call hook() with data1 and data2 argument. These +// hook/data1/data2 can be changed at any time before calling this function, +// but not be changed afterward until the next call to vp9_worker_sync(). +void vp9_worker_launch(VP9Worker* const worker); +// Kill the thread and terminate the object. To use the object again, one +// must call vp9_worker_reset() again. 
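Taken together, the interface declared in this header is meant to be driven as init, reset, then any number of hook/launch/sync rounds, then end. A minimal usage sketch; my_hook, run_one_job and the NULL payloads are illustrative, only the VP9Worker type and the vp9_worker_* calls come from this header:

    /* assumes #include "vp9/decoder/vp9_thread.h" */
    static int my_hook(void *data1, void *data2) {
      (void)data1;
      (void)data2;
      return 1;                          /* non-zero means the job succeeded      */
    }

    static int run_one_job(void) {
      VP9Worker worker;
      int ok;
      vp9_worker_init(&worker);          /* must be called first                  */
      if (!vp9_worker_reset(&worker))    /* creates sync objects and the thread   */
        return 0;
      worker.hook = my_hook;
      worker.data1 = NULL;               /* first argument handed to the hook     */
      worker.data2 = NULL;               /* second argument handed to the hook    */
      vp9_worker_launch(&worker);        /* runs hook(data1, data2), asynchronously
                                            when CONFIG_MULTITHREAD is enabled    */
      /* ... other work on the calling thread ... */
      ok = vp9_worker_sync(&worker);     /* waits for the hook, 0 if it failed    */
      vp9_worker_end(&worker);           /* stops and joins the thread            */
      return ok;
    }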
+void vp9_worker_end(VP9Worker* const worker); + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif /* VP9_DECODER_VP9_THREAD_H_ */ diff --git a/libvpx/vp9/decoder/vp9_treereader.h b/libvpx/vp9/decoder/vp9_treereader.h index 4535688ea..710cc4cd0 100644 --- a/libvpx/vp9/decoder/vp9_treereader.h +++ b/libvpx/vp9/decoder/vp9_treereader.h @@ -15,7 +15,6 @@ #include "vp9/common/vp9_treecoder.h" #include "vp9/decoder/vp9_dboolhuff.h" -#define vp9_read_prob(r) ((vp9_prob)vp9_read_literal(r, 8)) #define vp9_read_and_apply_sign(r, value) (vp9_read_bit(r) ? -(value) : (value)) // Intent of tree data structure is to make decoding trivial. diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index ad0f6c531..98ef42074 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -44,16 +44,16 @@ unsigned __int64 Sectionbits[500]; int intra_mode_stats[VP9_INTRA_MODES] [VP9_INTRA_MODES] [VP9_INTRA_MODES]; -vp9_coeff_stats tree_update_hist[TX_SIZE_MAX_SB][BLOCK_TYPES]; +vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES]; extern unsigned int active_section; #endif #ifdef MODE_STATS -int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB]; -int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; -int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; +int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES]; +int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1]; +int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2]; int64_t switchable_interp_stats[VP9_SWITCHABLE_FILTERS+1] [VP9_SWITCHABLE_FILTERS]; @@ -70,17 +70,17 @@ void init_switchable_interp_stats() { static void update_tx_count_stats(VP9_COMMON *cm) { int i, j; for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - for (j = 0; j < TX_SIZE_MAX_SB; j++) { + for (j = 0; j < TX_SIZES; j++) { tx_count_32x32p_stats[i][j] += cm->fc.tx_count_32x32p[i][j]; } } for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) { + for (j = 0; j < TX_SIZES - 1; j++) { tx_count_16x16p_stats[i][j] += cm->fc.tx_count_16x16p[i][j]; } } for (i = 0; i < TX_SIZE_CONTEXTS; i++) { - for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) { + for (j = 0; j < TX_SIZES - 2; j++) { tx_count_8x8p_stats[i][j] += cm->fc.tx_count_8x8p[i][j]; } } @@ -103,30 +103,30 @@ void write_tx_count_stats() { fclose(fp); printf( - "vp9_default_tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB] = {\n"); + "vp9_default_tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZES] = {\n"); for (i = 0; i < TX_SIZE_CONTEXTS; i++) { printf(" { "); - for (j = 0; j < TX_SIZE_MAX_SB; j++) { + for (j = 0; j < TX_SIZES; j++) { printf("%"PRId64", ", tx_count_32x32p_stats[i][j]); } printf("},\n"); } printf("};\n"); printf( - "vp9_default_tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB-1] = {\n"); + "vp9_default_tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZES-1] = {\n"); for (i = 0; i < TX_SIZE_CONTEXTS; i++) { printf(" { "); - for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) { + for (j = 0; j < TX_SIZES - 1; j++) { printf("%"PRId64", ", tx_count_16x16p_stats[i][j]); } printf("},\n"); } printf("};\n"); printf( - "vp9_default_tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB-2] = {\n"); + "vp9_default_tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZES-2] = {\n"); for (i = 0; i < TX_SIZE_CONTEXTS; i++) { printf(" { "); - for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) { + for (j = 0; j < TX_SIZES - 2; j++) { printf("%"PRId64", ", 
tx_count_8x8p_stats[i][j]); } printf("},\n"); @@ -169,7 +169,6 @@ void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb, static void update_mode( vp9_writer *w, int n, - const struct vp9_token tok[/* n */], vp9_tree tree, vp9_prob Pnew[/* n-1 */], vp9_prob Pcur[/* n-1 */], @@ -194,20 +193,19 @@ static void update_mbintra_mode_probs(VP9_COMP* const cpi, unsigned int bct[VP9_INTRA_MODES - 1][2]; for (j = 0; j < BLOCK_SIZE_GROUPS; j++) - update_mode(bc, VP9_INTRA_MODES, vp9_intra_mode_encodings, - vp9_intra_mode_tree, pnew, + update_mode(bc, VP9_INTRA_MODES, vp9_intra_mode_tree, pnew, cm->fc.y_mode_prob[j], bct, (unsigned int *)cpi->y_mode_count[j]); } -static void write_selected_txfm_size(const VP9_COMP *cpi, TX_SIZE tx_size, - BLOCK_SIZE_TYPE bsize, vp9_writer *w) { +static void write_selected_tx_size(const VP9_COMP *cpi, TX_SIZE tx_size, + BLOCK_SIZE_TYPE bsize, vp9_writer *w) { const MACROBLOCKD *const xd = &cpi->mb.e_mbd; const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs); vp9_write(w, tx_size != TX_4X4, tx_probs[0]); - if (bsize >= BLOCK_SIZE_MB16X16 && tx_size != TX_4X4) { + if (bsize >= BLOCK_16X16 && tx_size != TX_4X4) { vp9_write(w, tx_size != TX_8X8, tx_probs[1]); - if (bsize >= BLOCK_SIZE_SB32X32 && tx_size != TX_8X8) + if (bsize >= BLOCK_32X32 && tx_size != TX_8X8) vp9_write(w, tx_size != TX_16X16, tx_probs[2]); } } @@ -265,12 +263,17 @@ static void update_switchable_interp_probs(VP9_COMP *const cpi, static void update_inter_mode_probs(VP9_COMMON *pc, vp9_writer* const bc) { int i, j; - for (i = 0; i < INTER_MODE_CONTEXTS; i++) { - for (j = 0; j < VP9_INTER_MODES - 1; j++) { + for (i = 0; i < INTER_MODE_CONTEXTS; ++i) { + unsigned int branch_ct[VP9_INTER_MODES - 1][2]; + vp9_prob new_prob[VP9_INTER_MODES - 1]; + + vp9_tree_probs_from_distribution(vp9_inter_mode_tree, + new_prob, branch_ct, + pc->counts.inter_mode[i], NEARESTMV); + + for (j = 0; j < VP9_INTER_MODES - 1; ++j) vp9_cond_prob_diff_update(bc, &pc->fc.inter_mode_probs[i][j], - VP9_MODE_UPDATE_PROB, - pc->counts.inter_mode[i][j]); - } + VP9_MODE_UPDATE_PROB, branch_ct[j]); } } @@ -393,8 +396,7 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) { // the reference frame is fully coded by the segment } -static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, - vp9_writer *bc, int mi_row, int mi_col) { +static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { VP9_COMMON *const pc = &cpi->common; const nmv_context *nmvc = &pc->fc.nmvc; MACROBLOCK *const x = &cpi->mb; @@ -406,6 +408,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, const int segment_id = mi->segment_id; int skip_coeff; const BLOCK_SIZE_TYPE bsize = mi->sb_type; + const int allow_hp = xd->allow_high_precision_mv; x->partition_info = x->pi + (m - pc->mi); @@ -434,7 +437,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, if (bsize >= BLOCK_SIZE_SB8X8 && pc->tx_mode == TX_MODE_SELECT && !(rf != INTRA_FRAME && (skip_coeff || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) { - write_selected_txfm_size(cpi, mi->txfm_size, bsize, bc); + write_selected_tx_size(cpi, mi->txfm_size, bsize, bc); } if (rf == INTRA_FRAME) { @@ -443,18 +446,17 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, #endif if (bsize >= BLOCK_SIZE_SB8X8) { - const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - const int bsl = MIN(bwl, bhl); - write_intra_mode(bc, mode, pc->fc.y_mode_prob[MIN(3, bsl)]); + write_intra_mode(bc, mode, 
pc->fc.y_mode_prob[size_group_lookup[bsize]]); } else { int idx, idy; - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - for (idy = 0; idy < 2; idy += num_4x4_blocks_high) + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { const MB_PREDICTION_MODE bm = m->bmi[idy * 2 + idx].as_mode; write_intra_mode(bc, bm, pc->fc.y_mode_prob[0]); } + } } write_intra_mode(bc, mi->uv_mode, pc->fc.uv_mode_prob[mode]); } else { @@ -470,7 +472,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) { if (bsize >= BLOCK_SIZE_SB8X8) { write_sb_mv_ref(bc, mode, mv_ref_p); - vp9_accum_mv_refs(&cpi->common, mode, mi->mb_mode_context[rf]); + ++pc->counts.inter_mode[mi->mb_mode_context[rf]] + [inter_mode_offset(mode)]; } } @@ -487,8 +490,8 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, int j; MB_PREDICTION_MODE blockmode; int_mv blockmv; - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { @@ -496,19 +499,21 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, blockmode = x->partition_info->bmi[j].mode; blockmv = m->bmi[j].as_mv[0]; write_sb_mv_ref(bc, blockmode, mv_ref_p); - vp9_accum_mv_refs(&cpi->common, blockmode, mi->mb_mode_context[rf]); + ++pc->counts.inter_mode[mi->mb_mode_context[rf]] + [inter_mode_offset(blockmode)]; + if (blockmode == NEWMV) { #ifdef ENTROPY_STATS active_section = 11; #endif vp9_encode_mv(cpi, bc, &blockmv.as_mv, &mi->best_mv.as_mv, - nmvc, xd->allow_high_precision_mv); + nmvc, allow_hp); if (mi->ref_frame[1] > INTRA_FRAME) vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv, &mi->best_second_mv.as_mv, - nmvc, xd->allow_high_precision_mv); + nmvc, allow_hp); } } } @@ -516,21 +521,18 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, #ifdef ENTROPY_STATS active_section = 5; #endif - vp9_encode_mv(cpi, bc, - &mi->mv[0].as_mv, &mi->best_mv.as_mv, - nmvc, xd->allow_high_precision_mv); + vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, &mi->best_mv.as_mv, + nmvc, allow_hp); if (mi->ref_frame[1] > INTRA_FRAME) - vp9_encode_mv(cpi, bc, - &mi->mv[1].as_mv, &mi->best_second_mv.as_mv, - nmvc, xd->allow_high_precision_mv); + vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, &mi->best_second_mv.as_mv, + nmvc, allow_hp); } } } -static void write_mb_modes_kf(const VP9_COMP *cpi, - MODE_INFO *m, - vp9_writer *bc, int mi_row, int mi_col) { +static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO *m, + vp9_writer *bc) { const VP9_COMMON *const c = &cpi->common; const MACROBLOCKD *const xd = &cpi->mb.e_mbd; const int ym = m->mbmi.mode; @@ -543,7 +545,7 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, write_skip_coeff(cpi, segment_id, m, bc); if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8 && c->tx_mode == TX_MODE_SELECT) - write_selected_txfm_size(cpi, m->mbmi.txfm_size, m->mbmi.sb_type, bc); + write_selected_tx_size(cpi, m->mbmi.txfm_size, m->mbmi.sb_type, bc); if (m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { const 
MB_PREDICTION_MODE A = above_block_mode(m, 0, mis); @@ -552,11 +554,11 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, write_intra_mode(bc, ym, vp9_kf_y_mode_prob[A][L]); } else { int idx, idy; - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[m->mbmi.sb_type]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type]; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[m->mbmi.sb_type]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[m->mbmi.sb_type]; for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { - int i = idy * 2 + idx; + const int i = idy * 2 + idx; const MB_PREDICTION_MODE A = above_block_mode(m, i, mis); const MB_PREDICTION_MODE L = (xd->left_available || idx) ? left_block_mode(m, i) : DC_PRED; @@ -586,12 +588,12 @@ static void write_modes_b(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, 1 << mi_height_log2(m->mbmi.sb_type), mi_col, 1 << mi_width_log2(m->mbmi.sb_type)); if ((cm->frame_type == KEY_FRAME) || cm->intra_only) { - write_mb_modes_kf(cpi, m, bc, mi_row, mi_col); + write_mb_modes_kf(cpi, m, bc); #ifdef ENTROPY_STATS active_section = 8; #endif } else { - pack_inter_mode_mvs(cpi, m, bc, mi_row, mi_col); + pack_inter_mode_mvs(cpi, m, bc); #ifdef ENTROPY_STATS active_section = 1; #endif @@ -625,7 +627,7 @@ static void write_modes_sb(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc, if (bsize >= BLOCK_SIZE_SB8X8) { int pl; - const int idx = check_bsize_coverage(cm, xd, mi_row, mi_col, bsize); + const int idx = check_bsize_coverage(cm, mi_row, mi_col, bsize); set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); // encode the partition information @@ -692,8 +694,7 @@ static void write_modes(VP9_COMP *cpi, vp9_writer* const bc, vp9_zero(c->left_seg_context); for (mi_col = c->cur_tile_mi_col_start; mi_col < c->cur_tile_mi_col_end; mi_col += MI_BLOCK_SIZE, m += MI_BLOCK_SIZE) - write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col, - BLOCK_SIZE_SB64X64); + write_modes_sb(cpi, m, bc, tok, tok_end, mi_row, mi_col, BLOCK_64X64); } } @@ -726,12 +727,12 @@ static void print_prob_tree(vp9_coeff_probs *coef_probs, int block_types) { fclose(f); } -static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE txfm_size) { - vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[txfm_size]; - vp9_coeff_count *coef_counts = cpi->coef_counts[txfm_size]; +static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) { + vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[tx_size]; + vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size]; unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = - cpi->common.counts.eob_branch[txfm_size]; - vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[txfm_size]; + cpi->common.counts.eob_branch[tx_size]; + vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size]; vp9_prob full_probs[ENTROPY_NODES]; int i, j, k, l; @@ -756,9 +757,9 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE txfm_size) { if (!cpi->dummy_packing) { int t; for (t = 0; t < MAX_ENTROPY_TOKENS; ++t) - context_counters[txfm_size][i][j][k][l][t] += + context_counters[tx_size][i][j][k][l][t] += coef_counts[i][j][k][l][t]; - context_counters[txfm_size][i][j][k][l][MAX_ENTROPY_TOKENS] += + context_counters[tx_size][i][j][k][l][MAX_ENTROPY_TOKENS] += eob_branch_ct[i][j][k][l]; } #endif @@ -1036,15 +1037,15 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { // Probabilities if (cm->tx_mode == 
TX_MODE_SELECT) { int i, j; - unsigned int ct_8x8p[TX_SIZE_MAX_SB - 3][2]; - unsigned int ct_16x16p[TX_SIZE_MAX_SB - 2][2]; - unsigned int ct_32x32p[TX_SIZE_MAX_SB - 1][2]; + unsigned int ct_8x8p[TX_SIZES - 3][2]; + unsigned int ct_16x16p[TX_SIZES - 2][2]; + unsigned int ct_32x32p[TX_SIZES - 1][2]; for (i = 0; i < TX_SIZE_CONTEXTS; i++) { tx_counts_to_branch_counts_8x8(cm->counts.tx.p8x8[i], ct_8x8p); - for (j = 0; j < TX_SIZE_MAX_SB - 3; j++) + for (j = 0; j < TX_SIZES - 3; j++) vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p8x8[i][j], VP9_MODE_UPDATE_PROB, ct_8x8p[j]); } @@ -1052,14 +1053,14 @@ static void encode_txfm_probs(VP9_COMP *cpi, vp9_writer *w) { for (i = 0; i < TX_SIZE_CONTEXTS; i++) { tx_counts_to_branch_counts_16x16(cm->counts.tx.p16x16[i], ct_16x16p); - for (j = 0; j < TX_SIZE_MAX_SB - 2; j++) + for (j = 0; j < TX_SIZES - 2; j++) vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p16x16[i][j], VP9_MODE_UPDATE_PROB, ct_16x16p[j]); } for (i = 0; i < TX_SIZE_CONTEXTS; i++) { tx_counts_to_branch_counts_32x32(cm->counts.tx.p32x32[i], ct_32x32p); - for (j = 0; j < TX_SIZE_MAX_SB - 1; j++) + for (j = 0; j < TX_SIZES - 1; j++) vp9_cond_prob_diff_update(w, &cm->fc.tx_probs.p32x32[i][j], VP9_MODE_UPDATE_PROB, ct_32x32p[j]); } @@ -1422,7 +1423,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) { for (i = 0; i < NUM_PARTITION_CONTEXTS; ++i) { vp9_prob pnew[PARTITION_TYPES - 1]; unsigned int bct[PARTITION_TYPES - 1][2]; - update_mode(&header_bc, PARTITION_TYPES, vp9_partition_encodings, + update_mode(&header_bc, PARTITION_TYPES, vp9_partition_tree, pnew, fc->partition_prob[cm->frame_type][i], bct, (unsigned int *)cpi->partition_count[i]); diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index 4b49b17a2..3e377cf6f 100644 --- a/libvpx/vp9/encoder/vp9_block.h +++ b/libvpx/vp9/encoder/vp9_block.h @@ -47,7 +47,7 @@ typedef struct { int hybrid_pred_diff; int comp_pred_diff; int single_pred_diff; - int64_t txfm_rd_diff[NB_TXFM_MODES]; + int64_t tx_rd_diff[TX_MODES]; int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; // Bit flag for each mode whether it has high error in comparison to others. @@ -72,6 +72,11 @@ struct macroblock_plane { int16_t zbin_extra; }; +/* The [2] dimension is for whether we skip the EOB node (i.e. if previous + * coefficient in this block was zero) or not. */ +typedef unsigned int vp9_coeff_cost[BLOCK_TYPES][REF_TYPES][COEF_BANDS][2] + [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS]; + typedef struct macroblock MACROBLOCK; struct macroblock { struct macroblock_plane plane[MAX_MB_PLANE]; @@ -97,6 +102,7 @@ struct macroblock { int mv_best_ref_index[MAX_REF_FRAMES]; unsigned int max_mv_context[MAX_REF_FRAMES]; + unsigned int source_variance; int nmvjointcost[MV_JOINTS]; int nmvcosts[2][MV_VALS]; @@ -133,7 +139,7 @@ struct macroblock { unsigned char *active_ptr; // note that token_costs is the cost when eob node is skipped - vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][2]; + vp9_coeff_cost token_costs[TX_SIZES]; int optimize; diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index 798adc1f3..66eae41da 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -60,11 +60,28 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); * Eventually this should be replaced by custom no-reference routines, * which will be faster. 
*/ -static const uint8_t VP9_VAR_OFFS[16] = {128, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 128, 128, 128, 128, 128, 128}; +static const uint8_t VP9_VAR_OFFS[64] = { + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128 +}; + +static unsigned int get_sb_variance(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE_TYPE bs) { + unsigned int var, sse; + var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf, + x->plane[0].src.stride, + VP9_VAR_OFFS, 0, &sse); + return var >> num_pels_log2_lookup[bs]; +} // Original activity measure from Tim T's code. -static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) { +static unsigned int tt_activity_measure(MACROBLOCK *x) { unsigned int act; unsigned int sse; /* TODO: This could also be done over smaller areas (8x8), but that would @@ -106,7 +123,7 @@ static unsigned int mb_activity_measure(VP9_COMP *cpi, MACROBLOCK *x, mb_activity = alt_activity_measure(cpi, x, use_dc_pred); } else { // Original activity measure from Tim T's code. - mb_activity = tt_activity_measure(cpi, x); + mb_activity = tt_activity_measure(x); } if (mb_activity < VP9_ACTIVITY_AVG_MIN) @@ -323,8 +340,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, int mb_mode_index = ctx->best_mode_index; const int mis = cpi->common.mode_info_stride; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; assert(mi->mbmi.mode < MB_MODE_COUNT); assert(mb_mode_index < MAX_MODES); @@ -345,13 +362,13 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } // FIXME(rbultje) I'm pretty sure this should go to the end of this block // (i.e. 
after the output_enabled) - if (bsize < BLOCK_SIZE_SB32X32) { - if (bsize < BLOCK_SIZE_MB16X16) - ctx->txfm_rd_diff[ALLOW_16X16] = ctx->txfm_rd_diff[ALLOW_8X8]; - ctx->txfm_rd_diff[ALLOW_32X32] = ctx->txfm_rd_diff[ALLOW_16X16]; + if (bsize < BLOCK_32X32) { + if (bsize < BLOCK_16X16) + ctx->tx_rd_diff[ALLOW_16X16] = ctx->tx_rd_diff[ALLOW_8X8]; + ctx->tx_rd_diff[ALLOW_32X32] = ctx->tx_rd_diff[ALLOW_16X16]; } - if (mbmi->ref_frame[0] != INTRA_FRAME && mbmi->sb_type < BLOCK_SIZE_SB8X8) { + if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) { *x->partition_info = ctx->partition_info; mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; @@ -362,9 +379,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, return; if (!vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - for (i = 0; i < NB_TXFM_MODES; i++) { - cpi->rd_tx_select_diff[i] += ctx->txfm_rd_diff[i]; - } + for (i = 0; i < TX_MODES; i++) + cpi->rd_tx_select_diff[i] += ctx->tx_rd_diff[i]; } if (cpi->common.frame_type == KEY_FRAME) { @@ -395,7 +411,7 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, } else { // Note how often each mode chosen as best cpi->mode_chosen_counts[mb_mode_index]++; - if (mbmi->ref_frame[0] != INTRA_FRAME + if (is_inter_block(mbmi) && (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) { int_mv best_mv, best_second_mv; const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0]; @@ -465,6 +481,7 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, const int mb_row = mi_row >> 1; const int mb_col = mi_col >> 1; const int idx_map = mb_row * cm->mb_cols + mb_col; + const struct segmentation *const seg = &xd->seg; int i; // entropy context structures @@ -514,16 +531,16 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, x->rdmult = cpi->RDMULT; /* segment ID */ - if (xd->seg.enabled) { - uint8_t *map = xd->seg.update_map ? cpi->segmentation_map - : cm->last_frame_seg_map; + if (seg->enabled) { + uint8_t *map = seg->update_map ? cpi->segmentation_map + : cm->last_frame_seg_map; mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col); vp9_mb_init_quantizer(cpi, x); - if (xd->seg.enabled && cpi->seg0_cnt > 0 - && !vp9_segfeature_active(&xd->seg, 0, SEG_LVL_REF_FRAME) - && vp9_segfeature_active(&xd->seg, 1, SEG_LVL_REF_FRAME)) { + if (seg->enabled && cpi->seg0_cnt > 0 + && !vp9_segfeature_active(seg, 0, SEG_LVL_REF_FRAME) + && vp9_segfeature_active(seg, 1, SEG_LVL_REF_FRAME)) { cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt; } else { const int y = mb_row & ~3; @@ -537,8 +554,11 @@ static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs; } + + x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id]; } else { mbmi->segment_id = 0; + x->encode_breakout = cpi->oxcf.encode_breakout; } } @@ -552,12 +572,17 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, x->rd_search = 1; - if (bsize < BLOCK_SIZE_SB8X8) + if (bsize < BLOCK_SIZE_SB8X8) { + // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 + // there is nothing to be done. 
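The source_variance value assigned a few lines below comes from get_sb_variance(), which runs the block through the per-size variance function against the flat 128-valued VP9_VAR_OFFS buffer and then normalizes by the pixel count. The arithmetic, written out as a plain scalar helper (hypothetical; the encoder actually dispatches through cpi->fn_ptr[bs].vf):

    /* Per-pixel variance of a w x h block around the flat value 128. */
    static unsigned int block_variance_vs_gray(const uint8_t *src, int stride,
                                               int w, int h) {
      int64_t sum = 0, sse = 0;
      int r, c;
      for (r = 0; r < h; ++r) {
        for (c = 0; c < w; ++c) {
          const int d = src[r * stride + c] - 128;  /* deviation from flat gray */
          sum += d;
          sse += d * d;
        }
      }
      /* variance = SSE - sum^2 / n, then divided once more by n per pixel */
      return (unsigned int)((sse - (sum * sum) / (w * h)) / (w * h));
    }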
if (xd->ab_index != 0) return; + } set_offsets(cpi, mi_row, mi_col, bsize); xd->mode_info_context->mbmi.sb_type = bsize; + + x->source_variance = get_sb_variance(cpi, x, bsize); if (cpi->oxcf.tuning == VP8_TUNE_SSIM) vp9_activity_masking(cpi, x); @@ -571,12 +596,12 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, bsize, ctx, best_rd); } -static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; +static void update_stats(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO * const mbmi = &mi->mbmi; + MB_MODE_INFO *const mbmi = &mi->mbmi; if (cm->frame_type != KEY_FRAME) { const int seg_ref_active = vp9_segfeature_active(&xd->seg, mbmi->segment_id, @@ -612,38 +637,38 @@ static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) { } // TODO(jingning): the variables used here are little complicated. need further -// refactoring on organizing the the temporary buffers, when recursive +// refactoring on organizing the temporary buffers, when recursive // partition down to 4x4 block size is enabled. static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD * const xd = &x->e_mbd; switch (bsize) { - case BLOCK_SIZE_SB64X64: + case BLOCK_64X64: return &x->sb64_context; - case BLOCK_SIZE_SB64X32: + case BLOCK_64X32: return &x->sb64x32_context[xd->sb_index]; - case BLOCK_SIZE_SB32X64: + case BLOCK_32X64: return &x->sb32x64_context[xd->sb_index]; - case BLOCK_SIZE_SB32X32: + case BLOCK_32X32: return &x->sb32_context[xd->sb_index]; - case BLOCK_SIZE_SB32X16: + case BLOCK_32X16: return &x->sb32x16_context[xd->sb_index][xd->mb_index]; - case BLOCK_SIZE_SB16X32: + case BLOCK_16X32: return &x->sb16x32_context[xd->sb_index][xd->mb_index]; - case BLOCK_SIZE_MB16X16: + case BLOCK_16X16: return &x->mb_context[xd->sb_index][xd->mb_index]; - case BLOCK_SIZE_SB16X8: + case BLOCK_16X8: return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index]; - case BLOCK_SIZE_SB8X16: + case BLOCK_8X16: return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index]; - case BLOCK_SIZE_SB8X8: + case BLOCK_8X8: return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index]; - case BLOCK_SIZE_SB8X4: + case BLOCK_8X4: return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index]; - case BLOCK_SIZE_SB4X8: + case BLOCK_4X8: return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index]; - case BLOCK_SIZE_AB4X4: + case BLOCK_4X4: return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index]; default: assert(0); @@ -655,13 +680,13 @@ static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD *xd = &x->e_mbd; switch (bsize) { - case BLOCK_SIZE_SB64X64: + case BLOCK_64X64: return &x->sb64_partitioning; - case BLOCK_SIZE_SB32X32: + case BLOCK_32X32: return &x->sb_partitioning[xd->sb_index]; - case BLOCK_SIZE_MB16X16: + case BLOCK_16X16: return &x->mb_partitioning[xd->sb_index][xd->mb_index]; - case BLOCK_SIZE_SB8X8: + case BLOCK_8X8: return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index]; default: assert(0); @@ -674,12 +699,12 @@ static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], BLOCK_SIZE_TYPE bsize) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * 
const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; int p; - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int mi_width = num_8x8_blocks_wide_lookup[bsize]; int mi_height = num_8x8_blocks_high_lookup[bsize]; for (p = 0; p < MAX_MB_PLANE; p++) { @@ -705,12 +730,12 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], BLOCK_SIZE_TYPE bsize) { - VP9_COMMON * const cm = &cpi->common; - MACROBLOCK * const x = &cpi->mb; - MACROBLOCKD * const xd = &x->e_mbd; + const VP9_COMMON *const cm = &cpi->common; + const MACROBLOCK *const x = &cpi->mb; + const MACROBLOCKD *const xd = &x->e_mbd; int p; - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int mi_width = num_8x8_blocks_wide_lookup[bsize]; int mi_height = num_8x8_blocks_high_lookup[bsize]; @@ -746,15 +771,18 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, if (sub_index != -1) *(get_sb_index(xd, bsize)) = sub_index; - if (bsize < BLOCK_SIZE_SB8X8) + if (bsize < BLOCK_SIZE_SB8X8) { + // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 + // there is nothing to be done. if (xd->ab_index > 0) return; + } set_offsets(cpi, mi_row, mi_col, bsize); update_state(cpi, get_block_context(x, bsize), bsize, output_enabled); encode_superblock(cpi, tp, output_enabled, mi_row, mi_col, bsize); if (output_enabled) { - update_stats(cpi, mi_row, mi_col); + update_stats(cpi); (*tp)->token = EOSB_TOKEN; (*tp)++; @@ -776,7 +804,7 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - c1 = BLOCK_SIZE_AB4X4; + c1 = BLOCK_4X4; if (bsize >= BLOCK_SIZE_SB8X8) { set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); @@ -858,7 +886,7 @@ static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m, int bhl = b_height_log2(bsize); int bsl = (bwl > bhl ? bwl : bhl); - int bs = (1 << bsl) / 2; // + int bs = (1 << bsl) / 2; // Block size in units of 8 pels. 
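As a worked example of the new comment above: for a 32x32 block, b_width_log2() and b_height_log2() are both 3 (block dimensions are counted in 4x4 units here), so bsl is 3 and bs = (1 << 3) / 2 = 4, i.e. the loop below walks a 4 x 4 patch of MODE_INFO cells, each covering 8 pels, for the full 32-pel extent.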
MODE_INFO *m2 = m + mi_row * mis + mi_col; for (row = 0; row < bs; row++) { for (col = 0; col < bs; col++) { @@ -906,28 +934,28 @@ typedef enum { static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size, vt_node *node) { int i; switch (block_size) { - case BLOCK_SIZE_SB64X64: { + case BLOCK_64X64: { v64x64 *vt = (v64x64 *) data; node->vt = &vt->vt; for (i = 0; i < 4; i++) node->split[i] = &vt->split[i].vt.none; break; } - case BLOCK_SIZE_SB32X32: { + case BLOCK_32X32: { v32x32 *vt = (v32x32 *) data; node->vt = &vt->vt; for (i = 0; i < 4; i++) node->split[i] = &vt->split[i].vt.none; break; } - case BLOCK_SIZE_MB16X16: { + case BLOCK_16X16: { v16x16 *vt = (v16x16 *) data; node->vt = &vt->vt; for (i = 0; i < 4; i++) node->split[i] = &vt->split[i].vt.none; break; } - case BLOCK_SIZE_SB8X8: { + case BLOCK_8X8: { v8x8 *vt = (v8x8 *) data; node->vt = &vt->vt; for (i = 0; i < 4; i++) @@ -1066,8 +1094,7 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, int dp; int pixels_wide = 64, pixels_high = 64; - vpx_memset(&vt, 0, sizeof(vt)); - + vp9_zero(vt); set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64); if (xd->mb_to_right_edge < 0) @@ -1087,7 +1114,8 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, dp = 64; if (cm->frame_type != KEY_FRAME) { int_mv nearest_mv, near_mv; - YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[0]; + const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, LAST_FRAME)]; + YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx]; YV12_BUFFER_CONFIG *second_ref_fb = NULL; setup_pre_planes(xd, 0, ref_fb, mi_row, mi_col, @@ -1103,7 +1131,6 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_SB64X64); d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; - } // Fill in the entire tree of 8x8 variances for splits. @@ -1130,32 +1157,32 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, // values. for (i = 0; i < 4; i++) { for (j = 0; j < 4; j++) { - fill_variance_tree(&vt.split[i].split[j], BLOCK_SIZE_MB16X16); + fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16); } - fill_variance_tree(&vt.split[i], BLOCK_SIZE_SB32X32); + fill_variance_tree(&vt.split[i], BLOCK_32X32); } - fill_variance_tree(&vt, BLOCK_SIZE_SB64X64); + fill_variance_tree(&vt, BLOCK_64X64); // Now go through the entire structure, splitting every block size until // we get to one that's got a variance lower than our threshold, or we // hit 8x8. 
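The fill_variance_tree() calls above build each parent node from its four children by summing their error accumulators instead of rescanning pixels; the cascade that follows then keeps splitting any block whose variance stays above the threshold. Ignoring the v64x64/v32x32 node types and the fixed-point scaling the real code uses, the merge step amounts to:

    typedef struct { int64_t sum; int64_t sse; int count; } var_acc;  /* illustrative */

    static void merge_children(var_acc *parent, const var_acc child[4]) {
      int i;
      parent->sum = 0;
      parent->sse = 0;
      parent->count = 0;
      for (i = 0; i < 4; ++i) {
        parent->sum   += child[i].sum;    /* sum of (source - prediction) diffs */
        parent->sse   += child[i].sse;    /* sum of squared differences         */
        parent->count += child[i].count;  /* number of pixels covered           */
      }
    }

    static int64_t node_variance(const var_acc *a) {
      return (a->sse - (a->sum * a->sum) / a->count) / a->count;
    }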
- if (!set_vt_partitioning(cpi, &vt, m, BLOCK_SIZE_SB64X64, mi_row, mi_col, + if (!set_vt_partitioning(cpi, &vt, m, BLOCK_64X64, mi_row, mi_col, 4)) { for (i = 0; i < 4; ++i) { const int x32_idx = ((i & 1) << 2); const int y32_idx = ((i >> 1) << 2); - if (!set_vt_partitioning(cpi, &vt.split[i], m, BLOCK_SIZE_SB32X32, + if (!set_vt_partitioning(cpi, &vt.split[i], m, BLOCK_32X32, (mi_row + y32_idx), (mi_col + x32_idx), 2)) { for (j = 0; j < 4; ++j) { const int x16_idx = ((j & 1) << 1); const int y16_idx = ((j >> 1) << 1); if (!set_vt_partitioning(cpi, &vt.split[i].split[j], m, - BLOCK_SIZE_MB16X16, + BLOCK_16X16, (mi_row + y32_idx + y16_idx), (mi_col + x32_idx + x16_idx), 1)) { for (k = 0; k < 4; ++k) { const int x8_idx = (k & 1); const int y8_idx = (k >> 1); - set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis, + set_block_size(cm, m, BLOCK_8X8, mis, (mi_row + y32_idx + y16_idx + y8_idx), (mi_col + x32_idx + x16_idx + x8_idx)); } @@ -1165,6 +1192,7 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, } } } + static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize, int *rate, int64_t *dist, int do_recon) { @@ -1173,8 +1201,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, MACROBLOCKD *xd = &cpi->mb.e_mbd; const int mis = cm->mode_info_stride; int bsl = b_width_log2(bsize); - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int ms = num_4x4_blocks_wide / 2; int mh = num_4x4_blocks_high / 2; int bss = (1 << bsl) / 4; @@ -1191,7 +1219,7 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int64_t none_dist = INT_MAX; int chosen_rate = INT_MAX; int64_t chosen_dist = INT_MAX; - BLOCK_SIZE_TYPE sub_subsize = BLOCK_SIZE_AB4X4; + BLOCK_SIZE_TYPE sub_subsize = BLOCK_4X4; int splits_below = 0; BLOCK_SIZE_TYPE bs_type = m->mbmi.sb_type; @@ -1203,6 +1231,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, subsize = get_subsize(bsize, partition); if (bsize < BLOCK_SIZE_SB8X8) { + // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 + // there is nothing to be done. if (xd->ab_index != 0) { *rate = 0; *dist = 0; @@ -1213,6 +1243,10 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, } save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + x->fast_ms = 0; + x->pred_mv.as_int = 0; + x->subblock_ref = 0; + if (cpi->sf.adjust_partitioning_from_last_frame) { // Check if any of the sub blocks are further split. if (partition == PARTITION_SPLIT && subsize > BLOCK_SIZE_SB8X8) { @@ -1422,9 +1456,59 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, *dist = chosen_dist; } +static BLOCK_SIZE_TYPE min_partition_size[BLOCK_SIZE_TYPES] = + { BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, + BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, BLOCK_8X8, + BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16 }; +static BLOCK_SIZE_TYPE max_partition_size[BLOCK_SIZE_TYPES] = + { BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, + BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, + BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, BLOCK_64X64 }; + + +// Look at neighboring blocks and set a min and max partition size based on +// what they chose. 
+static void rd_auto_partition_range(VP9_COMP *cpi, + BLOCK_SIZE_TYPE * min_block_size, + BLOCK_SIZE_TYPE * max_block_size) { + MACROBLOCKD *const xd = &cpi->mb.e_mbd; + const MODE_INFO *const mi = xd->mode_info_context; + const MB_MODE_INFO *const above_mbmi = &mi[-xd->mode_info_stride].mbmi; + const MB_MODE_INFO *const left_mbmi = &mi[-1].mbmi; + const int left_in_image = xd->left_available && left_mbmi->mb_in_image; + const int above_in_image = xd->up_available && above_mbmi->mb_in_image; + + // Frequency check + if (cpi->sf.auto_min_max_partition_count <= 0) { + cpi->sf.auto_min_max_partition_count = + cpi->sf.auto_min_max_partition_interval; + *min_block_size = BLOCK_4X4; + *max_block_size = BLOCK_64X64; + return; + } else { + --cpi->sf.auto_min_max_partition_count; + } + + // Check for edge cases + if (!left_in_image && !above_in_image) { + *min_block_size = BLOCK_4X4; + *max_block_size = BLOCK_64X64; + } else if (!left_in_image) { + *min_block_size = min_partition_size[above_mbmi->sb_type]; + *max_block_size = max_partition_size[above_mbmi->sb_type]; + } else if (!above_in_image) { + *min_block_size = min_partition_size[left_mbmi->sb_type]; + *max_block_size = max_partition_size[left_mbmi->sb_type]; + } else { + *min_block_size = + min_partition_size[MIN(left_mbmi->sb_type, above_mbmi->sb_type)]; + *max_block_size = + max_partition_size[MAX(left_mbmi->sb_type, above_mbmi->sb_type)]; + } +} // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are -// unlikely to be selected depending on previously rate-distortion optimization +// unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize, int *rate, @@ -1444,20 +1528,22 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, (void) *tp_orig; - if (bsize < BLOCK_SIZE_SB8X8) + if (bsize < BLOCK_SIZE_SB8X8) { + // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 + // there is nothing to be done. if (xd->ab_index != 0) { *rate = 0; *dist = 0; return; } + } assert(mi_height_log2(bsize) == mi_width_log2(bsize)); save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); // PARTITION_SPLIT - if (!cpi->sf.use_partitions_greater_than - || (cpi->sf.use_partitions_greater_than - && bsize > cpi->sf.greater_than_block_size)) { + if (!cpi->sf.auto_min_max_partition_size || + bsize >= cpi->sf.min_partition_size) { if (bsize > BLOCK_SIZE_SB8X8) { int r4 = 0; int64_t d4 = 0, sum_rd = 0; @@ -1500,41 +1586,39 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } } + // Use 4 subblocks' motion estimation results to speed up current + // partition's checking. x->fast_ms = 0; x->pred_mv.as_int = 0; x->subblock_ref = 0; - // Use 4 subblocks' motion estimation results to speed up current - // partition's checking. - if (cpi->sf.using_small_partition_info) { + if (cpi->sf.using_small_partition_info && + (!cpi->sf.auto_min_max_partition_size || + (bsize <= cpi->sf.max_partition_size && + bsize >= cpi->sf.min_partition_size))) { // Only use 8x8 result for non HD videos. // int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 
1 : 0; int use_8x8 = 1; if (cm->frame_type && !cpi->is_src_frame_alt_ref && - ((use_8x8 && bsize == BLOCK_SIZE_MB16X16) || - bsize == BLOCK_SIZE_SB32X32 || bsize == BLOCK_SIZE_SB64X64)) { + ((use_8x8 && bsize == BLOCK_16X16) || + bsize == BLOCK_32X32 || bsize == BLOCK_64X64)) { int ref0 = 0, ref1 = 0, ref2 = 0, ref3 = 0; + PICK_MODE_CONTEXT *block_context = NULL; - if (bsize == BLOCK_SIZE_MB16X16) { - ref0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi. - ref_frame[0]; - ref1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi. - ref_frame[0]; - ref2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi. - ref_frame[0]; - ref3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi. - ref_frame[0]; - } else if (bsize == BLOCK_SIZE_SB32X32) { - ref0 = x->mb_context[xd->sb_index][0].mic.mbmi.ref_frame[0]; - ref1 = x->mb_context[xd->sb_index][1].mic.mbmi.ref_frame[0]; - ref2 = x->mb_context[xd->sb_index][2].mic.mbmi.ref_frame[0]; - ref3 = x->mb_context[xd->sb_index][3].mic.mbmi.ref_frame[0]; + if (bsize == BLOCK_16X16) { + block_context = x->sb8x8_context[xd->sb_index][xd->mb_index]; + } else if (bsize == BLOCK_32X32) { + block_context = x->mb_context[xd->sb_index]; } else if (bsize == BLOCK_SIZE_SB64X64) { - ref0 = x->sb32_context[0].mic.mbmi.ref_frame[0]; - ref1 = x->sb32_context[1].mic.mbmi.ref_frame[0]; - ref2 = x->sb32_context[2].mic.mbmi.ref_frame[0]; - ref3 = x->sb32_context[3].mic.mbmi.ref_frame[0]; + block_context = x->sb32_context; + } + + if (block_context) { + ref0 = block_context[0].mic.mbmi.ref_frame[0]; + ref1 = block_context[1].mic.mbmi.ref_frame[0]; + ref2 = block_context[2].mic.mbmi.ref_frame[0]; + ref3 = block_context[3].mic.mbmi.ref_frame[0]; } // Currently, only consider 4 inter ref frames. @@ -1544,42 +1628,14 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int d01, d23, d02, d13; // motion vector distance between 2 blocks // Get each subblock's motion vectors. - if (bsize == BLOCK_SIZE_MB16X16) { - mvr0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.mv[0]. - as_mv.row; - mvc0 = x->sb8x8_context[xd->sb_index][xd->mb_index][0].mic.mbmi.mv[0]. - as_mv.col; - mvr1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.mv[0]. - as_mv.row; - mvc1 = x->sb8x8_context[xd->sb_index][xd->mb_index][1].mic.mbmi.mv[0]. - as_mv.col; - mvr2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.mv[0]. - as_mv.row; - mvc2 = x->sb8x8_context[xd->sb_index][xd->mb_index][2].mic.mbmi.mv[0]. - as_mv.col; - mvr3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.mv[0]. - as_mv.row; - mvc3 = x->sb8x8_context[xd->sb_index][xd->mb_index][3].mic.mbmi.mv[0]. 
- as_mv.col; - } else if (bsize == BLOCK_SIZE_SB32X32) { - mvr0 = x->mb_context[xd->sb_index][0].mic.mbmi.mv[0].as_mv.row; - mvc0 = x->mb_context[xd->sb_index][0].mic.mbmi.mv[0].as_mv.col; - mvr1 = x->mb_context[xd->sb_index][1].mic.mbmi.mv[0].as_mv.row; - mvc1 = x->mb_context[xd->sb_index][1].mic.mbmi.mv[0].as_mv.col; - mvr2 = x->mb_context[xd->sb_index][2].mic.mbmi.mv[0].as_mv.row; - mvc2 = x->mb_context[xd->sb_index][2].mic.mbmi.mv[0].as_mv.col; - mvr3 = x->mb_context[xd->sb_index][3].mic.mbmi.mv[0].as_mv.row; - mvc3 = x->mb_context[xd->sb_index][3].mic.mbmi.mv[0].as_mv.col; - } else if (bsize == BLOCK_SIZE_SB64X64) { - mvr0 = x->sb32_context[0].mic.mbmi.mv[0].as_mv.row; - mvc0 = x->sb32_context[0].mic.mbmi.mv[0].as_mv.col; - mvr1 = x->sb32_context[1].mic.mbmi.mv[0].as_mv.row; - mvc1 = x->sb32_context[1].mic.mbmi.mv[0].as_mv.col; - mvr2 = x->sb32_context[2].mic.mbmi.mv[0].as_mv.row; - mvc2 = x->sb32_context[2].mic.mbmi.mv[0].as_mv.col; - mvr3 = x->sb32_context[3].mic.mbmi.mv[0].as_mv.row; - mvc3 = x->sb32_context[3].mic.mbmi.mv[0].as_mv.col; - } + mvr0 = block_context[0].mic.mbmi.mv[0].as_mv.row; + mvc0 = block_context[0].mic.mbmi.mv[0].as_mv.col; + mvr1 = block_context[1].mic.mbmi.mv[0].as_mv.row; + mvc1 = block_context[1].mic.mbmi.mv[0].as_mv.col; + mvr2 = block_context[2].mic.mbmi.mv[0].as_mv.row; + mvc2 = block_context[2].mic.mbmi.mv[0].as_mv.col; + mvr3 = block_context[3].mic.mbmi.mv[0].as_mv.row; + mvc3 = block_context[3].mic.mbmi.mv[0].as_mv.col; // Adjust sign if ref is alt_ref if (cm->ref_frame_sign_bias[ref0]) { @@ -1631,9 +1687,8 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } } - if (!cpi->sf.use_partitions_less_than - || (cpi->sf.use_partitions_less_than - && bsize <= cpi->sf.less_than_block_size)) { + if (!cpi->sf.auto_min_max_partition_size || + bsize <= cpi->sf.max_partition_size) { int larger_is_better = 0; // PARTITION_NONE if ((mi_row + (ms >> 1) < cm->mi_rows) && @@ -1804,8 +1859,7 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, } // Examines 64x64 block and chooses a best reference frame -static void rd_pick_reference_frame(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, - int mi_col, int *rate, int64_t *dist) { +static void rd_pick_reference_frame(VP9_COMP *cpi, int mi_row, int mi_col) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD * const xd = &x->e_mbd; @@ -1836,23 +1890,7 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, cpi->set_ref_frame_mask = 0; } - *rate = r; - *dist = d; - // RDCOST(x->rdmult, x->rddiv, r, d) - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, BLOCK_SIZE_SB64X64); - - /*if (srate < INT_MAX && sdist < INT_MAX) - encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64); - - if (bsize == BLOCK_SIZE_SB64X64) { - assert(tp_orig < *tp); - assert(srate < INT_MAX); - assert(sdist < INT_MAX); - } else { - assert(tp_orig == *tp); - } - */ } static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, @@ -1877,10 +1915,8 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, else cpi->unused_mode_skip_mask = 0xFFFFFFFFFFFFFE00; - if (cpi->sf.reference_masking) { - rd_pick_reference_frame(cpi, tp, mi_row, mi_col, - &dummy_rate, &dummy_dist); - } + if (cpi->sf.reference_masking) + rd_pick_reference_frame(cpi, mi_row, mi_col); if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning || cpi->sf.use_one_partition_size_always ) { @@ -1888,6 +1924,7 @@ static void encode_sb_row(VP9_COMP 
*cpi, int mi_row, TOKENEXTRA **tp, MODE_INFO *m = cm->mi + idx_str; MODE_INFO *p = cm->prev_mi + idx_str; + cpi->mb.source_variance = UINT_MAX; if (cpi->sf.use_one_partition_size_always) { set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64); set_partitioning(cpi, m, cpi->sf.always_this_block_size); @@ -1904,6 +1941,12 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, || cpi->common.show_frame == 0 || cpi->common.frame_type == KEY_FRAME || cpi->is_src_frame_alt_ref) { + // If required set upper and lower partition size limits + if (cpi->sf.auto_min_max_partition_size) { + rd_auto_partition_range(cpi, + &cpi->sf.min_partition_size, + &cpi->sf.max_partition_size); + } rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } else { @@ -1913,6 +1956,12 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, } } } else { + // If required set upper and lower partition size limits + if (cpi->sf.auto_min_max_partition_size) { + rd_auto_partition_range(cpi, &cpi->sf.min_partition_size, + &cpi->sf.max_partition_size); + } + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, &dummy_rate, &dummy_dist, 1, INT64_MAX); } @@ -2086,7 +2135,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { } vpx_usec_timer_mark(&emr_timer); - cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer); + cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); } if (cpi->sf.skip_encode_sb) { @@ -2203,13 +2252,13 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, int n; assert(bwl < bsl && bhl < bsl); - if (bsize == BLOCK_SIZE_SB64X64) { - subsize = BLOCK_SIZE_SB32X32; - } else if (bsize == BLOCK_SIZE_SB32X32) { - subsize = BLOCK_SIZE_MB16X16; + if (bsize == BLOCK_64X64) { + subsize = BLOCK_32X32; + } else if (bsize == BLOCK_32X32) { + subsize = BLOCK_16X16; } else { - assert(bsize == BLOCK_SIZE_MB16X16); - subsize = BLOCK_SIZE_SB8X8; + assert(bsize == BLOCK_16X16); + subsize = BLOCK_8X8; } for (n = 0; n < 4; n++) { @@ -2267,7 +2316,7 @@ static void select_tx_mode(VP9_COMP *cpi) { } else { unsigned int total = 0; int i; - for (i = 0; i < TX_SIZE_MAX_SB; ++i) + for (i = 0; i < TX_SIZES; ++i) total += cpi->txfm_stepdown_count[i]; if (total) { double fraction = (double)cpi->txfm_stepdown_count[0] / total; @@ -2376,12 +2425,12 @@ void vp9_encode_frame(VP9_COMP *cpi) { (cpi->rd_filter_threshes[frame_type][i] + diff) / 2; } - for (i = 0; i < NB_TXFM_MODES; ++i) { + for (i = 0; i < TX_MODES; ++i) { int64_t pd = cpi->rd_tx_select_diff[i]; int diff; if (i == TX_MODE_SELECT) pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, - 2048 * (TX_SIZE_MAX_SB - 1), 0); + 2048 * (TX_SIZES - 1), 0); diff = (int) (pd / cpi->common.MBs); cpi->rd_tx_select_threshes[frame_type][i] += diff; cpi->rd_tx_select_threshes[frame_type][i] /= 2; @@ -2527,7 +2576,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, // Increase zbin size to suppress noise cpi->zbin_mode_boost = 0; if (cpi->zbin_mode_boost_enabled) { - if (mbmi->ref_frame[0] != INTRA_FRAME) { + if (is_inter_block(mbmi)) { if (mbmi->mode == ZEROMV) { if (mbmi->ref_frame[0] != LAST_FRAME) cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST; @@ -2600,7 +2649,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, if (output_enabled) { if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_SIZE_SB8X8 && - !(mbmi->ref_frame[0] != INTRA_FRAME && + !(is_inter_block(mbmi) && (mbmi->mb_skip_coeff || 
vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP)))) { const uint8_t context = vp9_get_pred_context_tx_size(xd); @@ -2609,14 +2658,14 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, int x, y; TX_SIZE sz = (cm->tx_mode == TX_MODE_SELECT) ? TX_32X32 : cm->tx_mode; // The new intra coding scheme requires no change of transform size - if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { - if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32) + if (is_inter_block(&mi->mbmi)) { + if (sz == TX_32X32 && bsize < BLOCK_32X32) sz = TX_16X16; - if (sz == TX_16X16 && bsize < BLOCK_SIZE_MB16X16) + if (sz == TX_16X16 && bsize < BLOCK_16X16) sz = TX_8X8; - if (sz == TX_8X8 && bsize < BLOCK_SIZE_SB8X8) + if (sz == TX_8X8 && bsize < BLOCK_8X8) sz = TX_4X4; - } else if (bsize >= BLOCK_SIZE_SB8X8) { + } else if (bsize >= BLOCK_8X8) { sz = mbmi->txfm_size; } else { sz = TX_4X4; diff --git a/libvpx/vp9/encoder/vp9_encodeintra.c b/libvpx/vp9/encoder/vp9_encodeintra.c index d49e53258..edbd2d909 100644 --- a/libvpx/vp9/encoder/vp9_encodeintra.c +++ b/libvpx/vp9/encoder/vp9_encodeintra.c @@ -21,7 +21,7 @@ int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred) { x->skip_encode = 0; mbmi->mode = DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; - mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_SIZE_MB16X16 ? + mbmi->txfm_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; vp9_encode_intra_block_y(&cpi->common, x, mbmi->sb_type); return vp9_get_mb_ss(x->plane[0].src_diff); diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index 66e35a991..40b0a4e5a 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -47,6 +47,27 @@ static void inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, xd->inv_txm4x4_add(dqcoeff, dest, stride); } +static void inverse_transform_b_8x8_add(int eob, + int16_t *dqcoeff, uint8_t *dest, + int stride) { + if (eob <= 1) + vp9_short_idct8x8_1_add(dqcoeff, dest, stride); + else if (eob <= 10) + vp9_short_idct10_8x8_add(dqcoeff, dest, stride); + else + vp9_short_idct8x8_add(dqcoeff, dest, stride); +} + +static void inverse_transform_b_16x16_add(int eob, + int16_t *dqcoeff, uint8_t *dest, + int stride) { + if (eob <= 1) + vp9_short_idct16x16_1_add(dqcoeff, dest, stride); + else if (eob <= 10) + vp9_short_idct10_16x16_add(dqcoeff, dest, stride); + else + vp9_short_idct16x16_add(dqcoeff, dest, stride); +} static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; @@ -120,12 +141,12 @@ static int trellis_get_coeff_context(const int16_t *scan, return pt; } -static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, +static void optimize_b(MACROBLOCK *mb, int plane, int block, BLOCK_SIZE_TYPE bsize, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, TX_SIZE tx_size) { - const int ref = mb->e_mbd.mode_info_context->mbmi.ref_frame[0] != INTRA_FRAME; MACROBLOCKD *const xd = &mb->e_mbd; + const int ref = is_inter_block(&xd->mode_info_context->mbmi); vp9_token_state tokens[1025][2]; unsigned best_index[1025][2]; const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, @@ -214,10 +235,10 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, band = get_coef_band(band_translate, i + 1); pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += - mb->token_costs[tx_size][type][ref][0][band][pt] + mb->token_costs[tx_size][type][ref][band][0][pt] [tokens[next][0].token]; 
rate1 += - mb->token_costs[tx_size][type][ref][0][band][pt] + mb->token_costs[tx_size][type][ref][band][0][pt] [tokens[next][1].token]; } UPDATE_RD_COST(); @@ -265,12 +286,12 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, band = get_coef_band(band_translate, i + 1); if (t0 != DCT_EOB_TOKEN) { pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += mb->token_costs[tx_size][type][ref][!x][band][pt] + rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt] [tokens[next][0].token]; } if (t1 != DCT_EOB_TOKEN) { pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); - rate1 += mb->token_costs[tx_size][type][ref][!x][band][pt] + rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt] [tokens[next][1].token]; } } @@ -303,12 +324,12 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, /* Update the cost of each path if we're past the EOB token. */ if (t0 != DCT_EOB_TOKEN) { tokens[next][0].rate += - mb->token_costs[tx_size][type][ref][1][band][0][t0]; + mb->token_costs[tx_size][type][ref][band][1][0][t0]; tokens[next][0].token = ZERO_TOKEN; } if (t1 != DCT_EOB_TOKEN) { tokens[next][1].rate += - mb->token_costs[tx_size][type][ref][1][band][0][t1]; + mb->token_costs[tx_size][type][ref][band][1][0][t1]; tokens[next][1].token = ZERO_TOKEN; } best_index[i][0] = best_index[i][1] = 0; @@ -325,8 +346,8 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, error1 = tokens[next][1].error; t0 = tokens[next][0].token; t1 = tokens[next][1].token; - rate0 += mb->token_costs[tx_size][type][ref][0][band][pt][t0]; - rate1 += mb->token_costs[tx_size][type][ref][0][band][pt][t1]; + rate0 += mb->token_costs[tx_size][type][ref][band][0][pt][t0]; + rate1 += mb->token_costs[tx_size][type][ref][band][0][pt][t1]; UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; final_eob = i0 - 1; @@ -351,7 +372,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, } void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *mb, + int ss_txfrm_size, MACROBLOCK *mb, struct optimize_ctx *ctx) { MACROBLOCKD *const xd = &mb->e_mbd; int x, y; @@ -359,51 +380,61 @@ void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, // find current entropy context txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y); - optimize_b(cm, mb, plane, block, bsize, + optimize_b(mb, plane, block, bsize, &ctx->ta[plane][x], &ctx->tl[plane][y], ss_txfrm_size / 2); } static void optimize_block(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *arg) { const struct encode_b_args* const args = arg; - vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, args->x, - args->ctx); + vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->x, args->ctx); } -void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, - struct optimize_ctx *ctx) { - int p; - - for (p = 0; p < MAX_MB_PLANE; p++) { - const struct macroblockd_plane* const plane = &xd->plane[p]; - const int bwl = b_width_log2(bsize) - plane->subsampling_x; - const int bhl = b_height_log2(bsize) - plane->subsampling_y; - const MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - const TX_SIZE tx_size = p ? 
get_uv_tx_size(mbmi) - : mbmi->txfm_size; - int i, j; - - for (i = 0; i < 1 << bwl; i += 1 << tx_size) { - int c = 0; - ctx->ta[p][i] = 0; - for (j = 0; j < 1 << tx_size && !c; j++) { - c = ctx->ta[p][i] |= plane->above_context[i + j]; - } - } - for (i = 0; i < 1 << bhl; i += 1 << tx_size) { - int c = 0; - ctx->tl[p][i] = 0; - for (j = 0; j < 1 << tx_size && !c; j++) { - c = ctx->tl[p][i] |= plane->left_context[i + j]; - } - } +void optimize_init_b(int plane, BLOCK_SIZE_TYPE bsize, void *arg) { + const struct encode_b_args* const args = arg; + const MACROBLOCKD *xd = &args->x->e_mbd; + const struct macroblockd_plane* const pd = &xd->plane[plane]; + const int bwl = b_width_log2(bsize) - pd->subsampling_x; + const int bhl = b_height_log2(bsize) - pd->subsampling_y; + const int bw = 1 << bwl, bh = 1 << bhl; + const MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->txfm_size; + int i; + + switch (tx_size) { + case TX_4X4: + vpx_memcpy(args->ctx->ta[plane], pd->above_context, + sizeof(ENTROPY_CONTEXT) * bw); + vpx_memcpy(args->ctx->tl[plane], pd->left_context, + sizeof(ENTROPY_CONTEXT) * bh); + break; + case TX_8X8: + for (i = 0; i < bw; i += 2) + args->ctx->ta[plane][i] = !!*(uint16_t *)&pd->above_context[i]; + for (i = 0; i < bh; i += 2) + args->ctx->tl[plane][i] = !!*(uint16_t *)&pd->left_context[i]; + break; + case TX_16X16: + for (i = 0; i < bw; i += 4) + args->ctx->ta[plane][i] = !!*(uint32_t *)&pd->above_context[i]; + for (i = 0; i < bh; i += 4) + args->ctx->tl[plane][i] = !!*(uint32_t *)&pd->left_context[i]; + break; + case TX_32X32: + for (i = 0; i < bw; i += 8) + args->ctx->ta[plane][i] = !!*(uint64_t *)&pd->above_context[i]; + for (i = 0; i < bh; i += 8) + args->ctx->tl[plane][i] = !!*(uint64_t *)&pd->left_context[i]; + break; + default: + assert(0); } } void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { struct optimize_ctx ctx; struct encode_b_args arg = {cm, x, &ctx}; - vp9_optimize_init(&x->e_mbd, bsize, &ctx); + optimize_init_b(0, bsize, &arg); foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0, optimize_block, &arg); } @@ -411,7 +442,10 @@ void vp9_optimize_sbuv(VP9_COMMON *const cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { struct optimize_ctx ctx; struct encode_b_args arg = {cm, x, &ctx}; - vp9_optimize_init(&x->e_mbd, bsize, &ctx); + int i; + for (i = 1; i < MAX_MB_PLANE; ++i) + optimize_init_b(i, bsize, &arg); + foreach_transformed_block_uv(&x->e_mbd, bsize, optimize_block, &arg); } @@ -504,7 +538,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, xform_quant(plane, block, bsize, ss_txfrm_size, arg); if (x->optimize) - vp9_optimize_b(plane, block, bsize, ss_txfrm_size, args->cm, x, args->ctx); + vp9_optimize_b(plane, block, bsize, ss_txfrm_size, x, args->ctx); if (x->skip_encode) return; @@ -516,10 +550,12 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize, vp9_short_idct32x32_add(dqcoeff, dst, pd->dst.stride); break; case TX_16X16: - vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride); + inverse_transform_b_16x16_add(pd->eobs[block], dqcoeff, dst, + pd->dst.stride); break; case TX_8X8: - vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride); + inverse_transform_b_8x8_add(pd->eobs[block], dqcoeff, dst, + pd->dst.stride); break; case TX_4X4: // this is like vp9_short_idct4x4 but has a special case around eob<=1 @@ -553,7 +589,7 @@ void vp9_encode_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { vp9_subtract_sby(x, bsize); if 
(x->optimize) - vp9_optimize_init(xd, bsize, &ctx); + optimize_init_b(0, bsize, &arg); foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg); } @@ -564,8 +600,11 @@ void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { struct encode_b_args arg = {cm, x, &ctx}; vp9_subtract_sbuv(x, bsize); - if (x->optimize) - vp9_optimize_init(xd, bsize, &ctx); + if (x->optimize) { + int i; + for (i = 1; i < MAX_MB_PLANE; ++i) + optimize_init_b(i, bsize, &arg); + } foreach_transformed_block_uv(xd, bsize, encode_block, &arg); } @@ -576,8 +615,12 @@ void vp9_encode_sb(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { struct encode_b_args arg = {cm, x, &ctx}; vp9_subtract_sb(x, bsize); - if (x->optimize) - vp9_optimize_init(xd, bsize, &ctx); + + if (x->optimize) { + int i; + for (i = 0; i < MAX_MB_PLANE; ++i) + optimize_init_b(i, bsize, &arg); + } foreach_transformed_block(xd, bsize, encode_block, &arg); } @@ -610,7 +653,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, // if (x->optimize) // vp9_optimize_b(plane, block, bsize, ss_txfrm_size, - // args->cm, x, args->ctx); + // x, args->ctx); switch (tx_size) { case TX_32X32: @@ -661,7 +704,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, pd->dequant, p->zbin_extra, eob, scan, iscan); if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) - vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride); + inverse_transform_b_16x16_add(*eob, dqcoeff, dst, pd->dst.stride); else vp9_short_iht16x16_add(dqcoeff, dst, pd->dst.stride, tx_type); } @@ -690,7 +733,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, pd->dequant, p->zbin_extra, eob, scan, iscan); if (!x->skip_encode && *eob) { if (tx_type == DCT_DCT) - vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride); + inverse_transform_b_8x8_add(*eob, dqcoeff, dst, pd->dst.stride); else vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type); } @@ -699,11 +742,11 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, tx_type = get_tx_type_4x4(pd->plane_type, xd, block); scan = get_scan_4x4(tx_type); iscan = get_iscan_4x4(tx_type); - if (mbmi->sb_type < BLOCK_SIZE_SB8X8 && plane == 0) { + if (mbmi->sb_type < BLOCK_8X8 && plane == 0) mode = xd->mode_info_context->bmi[block].as_mode; - } else { + else mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; - } + xoff = 4 * (block & twmask); yoff = 4 * (block >> twl); dst = pd->dst.buf + yoff * pd->dst.stride + xoff; @@ -725,8 +768,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
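[Editor's note — not part of the patch] The eob-based dispatch added earlier in this file (inverse_transform_b_8x8_add / inverse_transform_b_16x16_add) exists because a block with eob <= 1 has only its DC coefficient left after quantization, so the full NxN inverse transform collapses to adding one constant to every output pixel. A schematic version with made-up scaling; the real vp9_short_idct8x8_1_add / vp9_short_idct16x16_1_add kernels apply the exact DCT scale and rounding.

  /* Hypothetical sketch: DC-only inverse transform as a constant add. */
  static unsigned char clip_pixel_sketch(int v) {
    return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }

  static void dc_only_add_sketch(int dc, unsigned char *dest, int stride, int n) {
    const int offset = (dc + 8) >> 4;   /* schematic scale/round only */
    int r, c;
    for (r = 0; r < n; ++r)
      for (c = 0; c < n; ++c)
        dest[r * stride + c] = clip_pixel_sketch(dest[r * stride + c] + offset);
  }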
- inverse_transform_b_4x4_add(xd, *eob, dqcoeff, - dst, pd->dst.stride); + inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride); else vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type); } diff --git a/libvpx/vp9/encoder/vp9_encodemb.h b/libvpx/vp9/encoder/vp9_encodemb.h index defaa48a3..f647fd979 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.h +++ b/libvpx/vp9/encoder/vp9_encodemb.h @@ -33,10 +33,8 @@ struct encode_b_args { struct optimize_ctx *ctx; }; -void vp9_optimize_init(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, - struct optimize_ctx *ctx); void vp9_optimize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, - int ss_txfrm_size, VP9_COMMON *cm, MACROBLOCK *x, + int ss_txfrm_size, MACROBLOCK *x, struct optimize_ctx *ctx); void vp9_optimize_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_optimize_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); diff --git a/libvpx/vp9/encoder/vp9_encodemv.c b/libvpx/vp9/encoder/vp9_encodemv.c index 2f5e16ccf..1c6fa3a3d 100644 --- a/libvpx/vp9/encoder/vp9_encodemv.c +++ b/libvpx/vp9/encoder/vp9_encodemv.c @@ -478,7 +478,7 @@ void vp9_update_nmv_count(VP9_COMP *cpi, MACROBLOCK *x, const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; int idx, idy; - if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { + if (mbmi->sb_type < BLOCK_8X8) { PARTITION_INFO *pi = x->partition_info; for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index ec2e361ee..6ba2a4fc9 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -347,17 +347,17 @@ static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *r xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset; switch (xd->mode_info_context->mbmi.sb_type) { - case BLOCK_SIZE_SB8X8: + case BLOCK_8X8: vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, (unsigned int *)(best_motion_err)); break; - case BLOCK_SIZE_SB16X8: + case BLOCK_16X8: vp9_mse16x8(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, (unsigned int *)(best_motion_err)); break; - case BLOCK_SIZE_SB8X16: + case BLOCK_8X16: vp9_mse8x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride, (unsigned int *)(best_motion_err)); @@ -403,13 +403,13 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, // override the default variance function to use MSE switch (xd->mode_info_context->mbmi.sb_type) { - case BLOCK_SIZE_SB8X8: + case BLOCK_8X8: v_fn_ptr.vf = vp9_mse8x8; break; - case BLOCK_SIZE_SB16X8: + case BLOCK_16X8: v_fn_ptr.vf = vp9_mse16x8; break; - case BLOCK_SIZE_SB8X16: + case BLOCK_8X16: v_fn_ptr.vf = vp9_mse8x16; break; default: @@ -549,15 +549,15 @@ void vp9_first_pass(VP9_COMP *cpi) { if (mb_col * 2 + 1 < cm->mi_cols) { if (mb_row * 2 + 1 < cm->mi_rows) { - xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_MB16X16; + xd->mode_info_context->mbmi.sb_type = BLOCK_16X16; } else { - xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB16X8; + xd->mode_info_context->mbmi.sb_type = BLOCK_16X8; } } else { if (mb_row * 2 + 1 < cm->mi_rows) { - xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB8X16; + xd->mode_info_context->mbmi.sb_type = BLOCK_8X16; } else { - xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB8X8; + xd->mode_info_context->mbmi.sb_type = 
BLOCK_8X8; } } xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME; @@ -1282,7 +1282,6 @@ static int detect_flash(VP9_COMP *cpi, int offset) { // Update the motion related elements to the GF arf boost calculation static void accumulate_frame_motion_stats( - VP9_COMP *cpi, FIRSTPASS_STATS *this_frame, double *this_frame_mv_in_out, double *mv_in_out_accumulator, @@ -1377,7 +1376,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, break; // Update the motion related elements to the boost calculation - accumulate_frame_motion_stats(cpi, &this_frame, + accumulate_frame_motion_stats(&this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); @@ -1413,7 +1412,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, break; // Update the motion related elements to the boost calculation - accumulate_frame_motion_stats(cpi, &this_frame, + accumulate_frame_motion_stats(&this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); @@ -1665,7 +1664,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { flash_detected = detect_flash(cpi, 0); // Update the motion related elements to the boost calculation - accumulate_frame_motion_stats(cpi, &next_frame, + accumulate_frame_motion_stats(&next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); @@ -2139,8 +2138,7 @@ void vp9_second_pass(VP9_COMP *cpi) { adjust_active_maxq(cpi->active_worst_quality, tmp_q); } #endif - - vpx_memset(&this_frame, 0, sizeof(FIRSTPASS_STATS)); + vp9_zero(this_frame); if (EOF == input_stats(cpi, &this_frame)) return; @@ -2318,7 +2316,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double kf_group_coded_err = 0.0; double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - vpx_memset(&next_frame, 0, sizeof(next_frame)); // assure clean + vp9_zero(next_frame); vp9_clear_system_state(); // __asm emms; start_position = cpi->twopass.stats_in; diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c index 7d6db071d..154d31af6 100644 --- a/libvpx/vp9/encoder/vp9_mbgraph.c +++ b/libvpx/vp9/encoder/vp9_mbgraph.c @@ -63,7 +63,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, } vp9_set_mbmode_and_mvs(x, NEWMV, dst_mv); - vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_SIZE_MB16X16); + vp9_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16); best_err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].dst.buf, xd->plane[0].dst.stride, INT_MAX); @@ -77,9 +77,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi, return best_err; } -static int do_16x16_motion_search(VP9_COMP *cpi, - int_mv *ref_mv, int_mv *dst_mv, - int buf_mb_y_offset, int mb_y_offset, +static int do_16x16_motion_search(VP9_COMP *cpi, int_mv *ref_mv, int_mv *dst_mv, int mb_row, int mb_col) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -118,9 +116,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, return err; } -static int do_16x16_zerozero_search(VP9_COMP *cpi, - int_mv *dst_mv, - int buf_mb_y_offset, int mb_y_offset) { +static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) { MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; unsigned int err; @@ -210,7 +206,6 @@ static void update_mbgraph_mb_stats g_motion_error = do_16x16_motion_search(cpi, prev_golden_ref_mv, &stats->ref[GOLDEN_FRAME].m.mv, - 
mb_y_offset, gld_y_offset, mb_row, mb_col); stats->ref[GOLDEN_FRAME].err = g_motion_error; } else { @@ -224,8 +219,7 @@ static void update_mbgraph_mb_stats xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset; xd->plane[0].pre[0].stride = alt_ref->y_stride; a_motion_error = do_16x16_zerozero_search(cpi, - &stats->ref[ALTREF_FRAME].m.mv, - mb_y_offset, arf_y_offset); + &stats->ref[ALTREF_FRAME].m.mv); stats->ref[ALTREF_FRAME].err = a_motion_error; } else { @@ -248,8 +242,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, int_mv arf_top_mv, gld_top_mv; MODE_INFO mi_local; - // Make sure the mi context starts in a consistent state. - memset(&mi_local, 0, sizeof(mi_local)); + vp9_zero(mi_local); // Set up limit values for motion vectors to prevent them extending outside the UMV borders arf_top_mv.as_int = 0; @@ -262,7 +255,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->plane[0].pre[0].stride = buf->y_stride; xd->plane[1].dst.stride = buf->uv_stride; xd->mode_info_context = &mi_local; - mi_local.mbmi.sb_type = BLOCK_SIZE_MB16X16; + mi_local.mbmi.sb_type = BLOCK_16X16; mi_local.mbmi.ref_frame[0] = LAST_FRAME; mi_local.mbmi.ref_frame[1] = NONE; diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c index 0be98913e..88beee791 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libvpx/vp9/encoder/vp9_mcomp.c @@ -58,7 +58,7 @@ int vp9_init_search_range(VP9_COMP *cpi, int size) { } int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], - int weight, int ishp) { + int weight) { MV v; v.row = mv->as_mv.row - ref->as_mv.row; v.col = mv->as_mv.col - ref->as_mv.col; @@ -68,7 +68,7 @@ int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], } static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], - int error_per_bit, int ishp) { + int error_per_bit) { if (mvcost) { MV v; v.row = mv->as_mv.row - ref->as_mv.row; @@ -269,7 +269,6 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, int maxc, minc, maxr, minr; int y_stride; int offset; - int usehp = xd->allow_high_precision_mv; uint8_t *y = xd->plane[0].pre[0].buf + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + @@ -300,8 +299,7 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, // calculate central point error besterr = vfp->vf(y, y_stride, z, src_stride, sse1); *distortion = besterr; - besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, - error_per_bit, xd->allow_high_precision_mv); + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); // TODO: Each subsequent iteration checks at least one point in // common with the last iteration could be 2 ( if diag selected) @@ -371,13 +369,7 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, tc = bc; } - if (xd->allow_high_precision_mv) { - usehp = vp9_use_mv_hp(&ref_mv->as_mv); - } else { - usehp = 0; - } - - if (usehp) { + if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv)) { hstep >>= 1; while (--eighthiters) { CHECK_BETTER(left, tr, tc - hstep); @@ -451,7 +443,6 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, int maxc, minc, maxr, minr; int y_stride; int offset; - int usehp = xd->allow_high_precision_mv; DECLARE_ALIGNED_ARRAY(16, uint8_t, comp_pred, 64 * 64); uint8_t *y = xd->plane[0].pre[0].buf + @@ -490,8 +481,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); *distortion = besterr; - besterr += mv_err_cost(bestmv, 
ref_mv, mvjcost, mvcost, - error_per_bit, xd->allow_high_precision_mv); + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); // Each subsequent iteration checks at least one point in // common with the last iteration could be 2 ( if diag selected) @@ -561,13 +551,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, tc = bc; } - if (xd->allow_high_precision_mv) { - usehp = vp9_use_mv_hp(&ref_mv->as_mv); - } else { - usehp = 0; - } - - if (usehp) { + if (xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv)) { hstep >>= 1; while (--eighthiters) { CHECK_BETTER(left, tr, tc - hstep); @@ -638,7 +622,6 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, int thismse; int y_stride; MACROBLOCKD *xd = &x->e_mbd; - int usehp = xd->allow_high_precision_mv; uint8_t *y = xd->plane[0].pre[0].buf + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + @@ -654,15 +637,14 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, // calculate central point error bestmse = vfp->vf(y, y_stride, z, src_stride, sse1); *distortion = bestmse; - bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); // go left then right and check error this_mv.as_mv.row = startmv.as_mv.row; this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse); - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); if (left < bestmse) { *bestmv = this_mv; @@ -674,7 +656,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, this_mv.as_mv.col += 8; thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse); right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit, xd->allow_high_precision_mv); + error_per_bit); if (right < bestmse) { *bestmv = this_mv; @@ -687,8 +669,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, this_mv.as_mv.col = startmv.as_mv.col; this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse); - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); if (up < bestmse) { *bestmv = this_mv; @@ -699,8 +680,8 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, this_mv.as_mv.row += 8; thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); if (down < bestmse) { *bestmv = this_mv; @@ -742,8 +723,8 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, break; } - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); if (diag < bestmse) { *bestmv = this_mv; @@ -784,8 +765,8 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, src_stride, &sse); } - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); if (left < bestmse) { *bestmv = this_mv; @@ -799,7 +780,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK 
*x, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, src_stride, &sse); right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit, xd->allow_high_precision_mv); + error_per_bit); if (right < bestmse) { *bestmv = this_mv; @@ -822,8 +803,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, z, src_stride, &sse); } - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); if (up < bestmse) { *bestmv = this_mv; @@ -835,8 +815,9 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, this_mv.as_mv.row += 4; thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); + if (down < bestmse) { *bestmv = this_mv; @@ -923,8 +904,8 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, break; } - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); if (diag < bestmse) { *bestmv = this_mv; @@ -933,12 +914,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, *sse1 = sse; } - if (x->e_mbd.allow_high_precision_mv) { - usehp = vp9_use_mv_hp(&ref_mv->as_mv); - } else { - usehp = 0; - } - if (!usehp) + if (!(xd->allow_high_precision_mv && vp9_use_mv_hp(&ref_mv->as_mv))) return bestmse; /* Now do 1/8th pixel */ @@ -968,8 +944,8 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, z, src_stride, &sse); } - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); if (left < bestmse) { *bestmv = this_mv; @@ -982,7 +958,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, src_stride, &sse); right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit, xd->allow_high_precision_mv); + error_per_bit); if (right < bestmse) { *bestmv = this_mv; @@ -1005,8 +981,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, SP(this_mv.as_mv.col), SP(7), z, src_stride, &sse); } - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); if (up < bestmse) { *bestmv = this_mv; @@ -1019,8 +994,8 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, thismse = vfp->svf(y, y_stride, SP(this_mv.as_mv.col), SP(this_mv.as_mv.row), z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); if (down < bestmse) { *bestmv = this_mv; @@ -1107,8 +1082,8 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, break; } - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); if (diag < bestmse) { *bestmv = this_mv; @@ -1153,15 +1128,14 @@ int vp9_find_best_half_pixel_step(MACROBLOCK *x, // calculate central point error bestmse = vfp->vf(y, y_stride, z, src_stride, sse1); *distortion = 
bestmse; - bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + bestmse += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit); // go left then right and check error this_mv.as_mv.row = startmv.as_mv.row; this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, src_stride, &sse); - left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + left = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); if (left < bestmse) { *bestmv = this_mv; @@ -1173,7 +1147,7 @@ int vp9_find_best_half_pixel_step(MACROBLOCK *x, this_mv.as_mv.col += 8; thismse = vfp->svf_halfpix_h(y, y_stride, z, src_stride, &sse); right = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, - error_per_bit, xd->allow_high_precision_mv); + error_per_bit); if (right < bestmse) { *bestmv = this_mv; @@ -1186,8 +1160,7 @@ int vp9_find_best_half_pixel_step(MACROBLOCK *x, this_mv.as_mv.col = startmv.as_mv.col; this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, src_stride, &sse); - up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + up = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); if (up < bestmse) { *bestmv = this_mv; @@ -1198,8 +1171,8 @@ int vp9_find_best_half_pixel_step(MACROBLOCK *x, this_mv.as_mv.row += 8; thismse = vfp->svf_halfpix_v(y, y_stride, z, src_stride, &sse); - down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + down = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); if (down < bestmse) { *bestmv = this_mv; @@ -1238,8 +1211,8 @@ int vp9_find_best_half_pixel_step(MACROBLOCK *x, break; } - diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit, - xd->allow_high_precision_mv); + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, + error_per_bit); if (diag < bestmse) { *bestmv = this_mv; @@ -1326,7 +1299,8 @@ int vp9_hex_search fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; // adjust ref_mv to make sure it is within MV range - clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); + clamp_mv(&ref_mv->as_mv, + x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); br = ref_mv->as_mv.row; bc = ref_mv->as_mv.col; @@ -1482,7 +1456,8 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); + clamp_mv(&ref_mv->as_mv, + x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); ref_row = ref_mv->as_mv.row; ref_col = ref_mv->as_mv.col; *num00 = 0; @@ -1580,11 +1555,9 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x, if (bestsad == INT_MAX) return INT_MAX; - return - fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, - xd->allow_high_precision_mv); + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, + (unsigned int *)(&thissad)) + mv_err_cost(&this_mv, center_mv, mvjcost, + mvcost, x->errorperbit); } int vp9_diamond_search_sadx4(MACROBLOCK *x, @@ -1624,7 +1597,8 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; 
fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; - clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); + clamp_mv(&ref_mv->as_mv, + x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); ref_row = ref_mv->as_mv.row; ref_col = ref_mv->as_mv.col; *num00 = 0; @@ -1754,11 +1728,9 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x, if (bestsad == INT_MAX) return INT_MAX; - return - fn_ptr->vf(what, what_stride, best_address, in_what_stride, - (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, - xd->allow_high_precision_mv); + return fn_ptr->vf(what, what_stride, best_address, in_what_stride, + (unsigned int *)(&thissad)) + mv_err_cost(&this_mv, + center_mv, mvjcost, mvcost, x->errorperbit); } /* do_refine: If last step (1-away) of n-step search doesn't pick the center @@ -1914,8 +1886,7 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, - xd->allow_high_precision_mv); + mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -2042,8 +2013,7 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, - xd->allow_high_precision_mv); + mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -2197,8 +2167,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv, return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, - xd->allow_high_precision_mv); + mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -2274,8 +2243,7 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, - xd->allow_high_precision_mv); + mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -2381,8 +2349,7 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, - xd->allow_high_precision_mv); + mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); else return INT_MAX; } @@ -2472,12 +2439,10 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, if (bestsad < INT_MAX) { // FIXME(rbultje, yunqing): add full-pixel averaging variance functions // so we don't have to use the subpixel with xoff=0,yoff=0 here. 
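[Editor's note — not part of the patch] Every search routine touched in this file scores a candidate the same way: prediction error (SAD or variance) plus a rate term for coding the motion-vector difference. The high-precision flag is no longer threaded through mv_err_cost()/vp9_mv_bit_cost(); instead, sub-pel precision only gates the extra 1/8-pel refinement via xd->allow_high_precision_mv && vp9_use_mv_hp(), as the hunks above show. A schematic of the cost combination; the exact bit-cost tables and rounding live in vp9_mv_bit_cost(), and the shift below is illustrative, not the library's constant.

  /* Hypothetical sketch: candidate score = error + lambda-weighted MV rate. */
  static int candidate_score_sketch(unsigned int pred_error,
                                    int mv_bit_cost, int error_per_bit) {
    const int mv_rate_term = (mv_bit_cost * error_per_bit + 4096) >> 13;
    return (int)pred_error + mv_rate_term;
  }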
- int besterr = fn_ptr->svaf(best_address, in_what_stride, 0, 0, + return fn_ptr->svaf(best_address, in_what_stride, 0, 0, what, what_stride, (unsigned int *)(&thissad), second_pred) + - mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, - xd->allow_high_precision_mv); - return besterr; + mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); } else { return INT_MAX; } diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h index c13ea7597..097d33c65 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.h +++ b/libvpx/vp9/encoder/vp9_mcomp.h @@ -25,7 +25,7 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv); int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, - int *mvcost[2], int weight, int ishp); + int *mvcost[2], int weight); void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); void vp9_init3smotion_compensation(MACROBLOCK *x, int stride); diff --git a/libvpx/vp9/encoder/vp9_onyx_if.c b/libvpx/vp9/encoder/vp9_onyx_if.c index e5f1a5c2c..db039959f 100644 --- a/libvpx/vp9/encoder/vp9_onyx_if.c +++ b/libvpx/vp9/encoder/vp9_onyx_if.c @@ -243,16 +243,17 @@ void vp9_initialize_enc() { static void setup_features(VP9_COMP *cpi) { MACROBLOCKD *xd = &cpi->mb.e_mbd; - struct loopfilter *lf = &xd->lf; + struct loopfilter *const lf = &xd->lf; + struct segmentation *const seg = &xd->seg; // Set up default state for MB feature flags - xd->seg.enabled = 0; + seg->enabled = 0; - xd->seg.update_map = 0; - xd->seg.update_data = 0; - vpx_memset(xd->seg.tree_probs, 255, sizeof(xd->seg.tree_probs)); + seg->update_map = 0; + seg->update_data = 0; + vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs)); - vp9_clearall_segfeatures(&xd->seg); + vp9_clearall_segfeatures(seg); lf->mode_ref_delta_enabled = 0; lf->mode_ref_delta_update = 0; @@ -324,6 +325,7 @@ static int compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) { static void configure_static_seg_features(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &cpi->mb.e_mbd; + struct segmentation *seg = &xd->seg; int high_q = (int)(cpi->avg_q > 48.0); int qi_delta; @@ -332,26 +334,26 @@ static void configure_static_seg_features(VP9_COMP *cpi) { if (cm->frame_type == KEY_FRAME) { // Clear down the global segmentation map vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - xd->seg.update_map = 0; - xd->seg.update_data = 0; + seg->update_map = 0; + seg->update_data = 0; cpi->static_mb_pct = 0; // Disable segmentation vp9_disable_segmentation((VP9_PTR)cpi); // Clear down the segment features. - vp9_clearall_segfeatures(&xd->seg); + vp9_clearall_segfeatures(seg); } else if (cpi->refresh_alt_ref_frame) { // If this is an alt ref frame // Clear down the global segmentation map vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - xd->seg.update_map = 0; - xd->seg.update_data = 0; + seg->update_map = 0; + seg->update_data = 0; cpi->static_mb_pct = 0; // Disable segmentation and individual segment features by default vp9_disable_segmentation((VP9_PTR)cpi); - vp9_clearall_segfeatures(&xd->seg); + vp9_clearall_segfeatures(seg); // Scan frames from current to arf frame. // This function re-enables segmentation if appropriate. @@ -359,45 +361,45 @@ static void configure_static_seg_features(VP9_COMP *cpi) { // If segmentation was enabled set those features needed for the // arf itself. 
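[Editor's note — not part of the patch] The segment features configured just below act as deltas (seg->abs_delta = SEGMENT_DELTADATA), so the ALT_Q value derived from compute_qdelta(cpi, avg_q, avg_q * 0.875) nudges segment 1 toward a quantizer roughly 12.5% below the recent average for the ARF. A schematic of how such a delta is consumed on the decode/quantize side; the real helpers are vp9_segfeature_active()/vp9_get_segdata(), and the 0..255 clamp assumes 8-bit VP9's q-index range.

  /* Hypothetical sketch: applying a SEG_LVL_ALT_Q delta in delta-data mode. */
  static int segment_qindex_sketch(int base_qindex, int alt_q_delta) {
    int q = base_qindex + alt_q_delta;   /* delta, not absolute, in this mode */
    if (q < 0) q = 0;
    if (q > 255) q = 255;                /* assumed 0..255 q-index range */
    return q;
  }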
- if (xd->seg.enabled) { - xd->seg.update_map = 1; - xd->seg.update_data = 1; + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875)); - vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2)); - vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_LF, -2); + vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2)); + vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); - vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_Q); - vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_LF); + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); // Where relevant assume segment data is delta data - xd->seg.abs_delta = SEGMENT_DELTADATA; + seg->abs_delta = SEGMENT_DELTADATA; } - } else if (xd->seg.enabled) { + } else if (seg->enabled) { // All other frames if segmentation has been enabled // First normal frame in a valid gf or alt ref group if (cpi->frames_since_golden == 0) { // Set up segment features for normal frames in an arf group if (cpi->source_alt_ref_active) { - xd->seg.update_map = 0; - xd->seg.update_data = 1; - xd->seg.abs_delta = SEGMENT_DELTADATA; + seg->update_map = 0; + seg->update_data = 1; + seg->abs_delta = SEGMENT_DELTADATA; qi_delta = compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 1.125)); - vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2)); - vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_Q); + vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2)); + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); - vp9_set_segdata(&xd->seg, 1, SEG_LVL_ALT_LF, -2); - vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_ALT_LF); + vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); + vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_LF); // Segment coding disabled for compred testing if (high_q || (cpi->static_mb_pct == 100)) { - vp9_set_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); - vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_SKIP); + vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP); } } else { // Disable segmentation and clear down features if alt ref @@ -407,10 +409,10 @@ static void configure_static_seg_features(VP9_COMP *cpi) { vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols); - xd->seg.update_map = 0; - xd->seg.update_data = 0; + seg->update_map = 0; + seg->update_data = 0; - vp9_clearall_segfeatures(&xd->seg); + vp9_clearall_segfeatures(seg); } } else if (cpi->is_src_frame_alt_ref) { // Special case where we are coding over the top of a previous @@ -418,28 +420,28 @@ static void configure_static_seg_features(VP9_COMP *cpi) { // Segment coding disabled for compred testing // Enable ref frame features for segment 0 as well - vp9_enable_segfeature(&xd->seg, 0, SEG_LVL_REF_FRAME); - vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME); + vp9_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME); // All mbs should use ALTREF_FRAME - vp9_clear_segdata(&xd->seg, 0, SEG_LVL_REF_FRAME); - vp9_set_segdata(&xd->seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); - vp9_clear_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME); - vp9_set_segdata(&xd->seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_clear_segdata(seg, 0, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME); + vp9_clear_segdata(seg, 1, SEG_LVL_REF_FRAME); + vp9_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME); // Skip all MBs 
if high Q (0,0 mv and skip coeffs) if (high_q) { - vp9_enable_segfeature(&xd->seg, 0, SEG_LVL_SKIP); - vp9_enable_segfeature(&xd->seg, 1, SEG_LVL_SKIP); + vp9_enable_segfeature(seg, 0, SEG_LVL_SKIP); + vp9_enable_segfeature(seg, 1, SEG_LVL_SKIP); } // Enable data update - xd->seg.update_data = 1; + seg->update_data = 1; } else { // All other frames. // No updates.. leave things as they are. - xd->seg.update_map = 0; - xd->seg.update_data = 0; + seg->update_map = 0; + seg->update_data = 0; } } } @@ -718,7 +720,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->reduce_first_step_size = 0; sf->auto_mv_step_size = 0; sf->max_step_search_steps = MAX_MVSEARCH_STEPS; - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4; + sf->comp_inter_joint_search_thresh = BLOCK_4X4; sf->adaptive_rd_thresh = 0; sf->use_lastframe_partitioning = 0; sf->tx_size_search_method = USE_FULL_RD; @@ -731,10 +733,13 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_one_partition_size_always = 0; sf->less_rectangular_check = 0; sf->use_square_partition_only = 0; - sf->use_partitions_less_than = 0; - sf->less_than_block_size = BLOCK_SIZE_MB16X16; - sf->use_partitions_greater_than = 0; - sf->greater_than_block_size = BLOCK_SIZE_SB8X8; + sf->auto_min_max_partition_size = 0; + sf->auto_min_max_partition_interval = 0; + sf->auto_min_max_partition_count = 0; + // sf->use_max_partition_size = 0; + sf->max_partition_size = BLOCK_64X64; + // sf->use_min_partition_size = 0; + sf->min_partition_size = BLOCK_4X4; sf->adjust_partitioning_from_last_frame = 0; sf->last_partitioning_redo_frequency = 4; sf->disable_splitmv = 0; @@ -745,8 +750,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_uv_intra_rd_estimate = 0; sf->using_small_partition_info = 0; // Skip any mode not chosen at size < X for all sizes > X - // Hence BLOCK_SIZE_SB64X64 (skip is off) - sf->unused_mode_skip_lvl = BLOCK_SIZE_SB64X64; + // Hence BLOCK_64X64 (skip is off) + sf->unused_mode_skip_lvl = BLOCK_64X64; #if CONFIG_MULTIPLE_ARF // Switch segmentation off. @@ -769,8 +774,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { #endif sf->use_avoid_tested_higherror = 1; sf->adaptive_rd_thresh = 1; - sf->last_chroma_intra_mode = TM_PRED; - if (speed == 1) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; sf->less_rectangular_check = 1; @@ -784,14 +787,20 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->common.show_frame == 0); sf->disable_splitmv = (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; - sf->unused_mode_skip_lvl = BLOCK_SIZE_SB32X32; + sf->unused_mode_skip_lvl = BLOCK_32X32; sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | - FLAG_SKIP_COMP_BESTINTRA; - sf->last_chroma_intra_mode = H_PRED; + FLAG_SKIP_COMP_BESTINTRA | + FLAG_SKIP_INTRA_LOWVAR; + sf->use_uv_intra_rd_estimate = 1; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; sf->auto_mv_step_size = 1; + + sf->auto_min_max_partition_size = 1; + // sf->use_max_partition_size = 1; + // sf->use_min_partition_size = 1; + sf->auto_min_max_partition_interval = 1; } if (speed == 2) { sf->adjust_thresholds_by_speed = 1; @@ -801,7 +810,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_lastframe_partitioning = 1; sf->adjust_partitioning_from_last_frame = 1; sf->last_partitioning_redo_frequency = 3; - sf->unused_mode_skip_lvl = BLOCK_SIZE_SB32X32; + sf->unused_mode_skip_lvl = BLOCK_32X32; sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || cpi->common.intra_only || cpi->common.show_frame == 0) ? 
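Note: the configure_static_seg_features() hunks above switch every access from xd->seg to a local struct segmentation pointer while keeping the same enable-feature / set-data pattern. The sketch below is a minimal, self-contained illustration of that pattern -- a per-segment bitmask of enabled features plus a per-segment data table -- using made-up names (segmentation_sketch, enable_segfeature, set_segdata, segfeature_active) and sizes, not the actual vp9_seg_common.h layout.

#include <string.h>

#define MAX_SEGMENTS 8
enum { SEG_LVL_ALT_Q = 0, SEG_LVL_ALT_LF, SEG_LVL_REF_FRAME, SEG_LVL_SKIP, SEG_LVL_MAX };

struct segmentation_sketch {
  unsigned char enabled, update_map, update_data, abs_delta;
  unsigned int feature_mask[MAX_SEGMENTS];            /* one bit per feature */
  short feature_data[MAX_SEGMENTS][SEG_LVL_MAX];      /* one value per feature */
};

static void enable_segfeature(struct segmentation_sketch *seg, int id, int f) {
  seg->feature_mask[id] |= 1u << f;
}

static void set_segdata(struct segmentation_sketch *seg, int id, int f, int v) {
  seg->feature_data[id][f] = (short)v;
}

static int segfeature_active(const struct segmentation_sketch *seg, int id, int f) {
  return seg->enabled && (seg->feature_mask[id] & (1u << f));
}

static void clearall_segfeatures(struct segmentation_sketch *seg) {
  memset(seg->feature_mask, 0, sizeof(seg->feature_mask));
  memset(seg->feature_data, 0, sizeof(seg->feature_data));
}

int main(void) {
  struct segmentation_sketch seg;
  memset(&seg, 0, sizeof(seg));
  seg.enabled = 1;
  seg.update_map = 1;
  seg.update_data = 1;
  /* Mirrors the alt-ref group hunk above: a quantizer delta and a loop-filter
   * delta stored on segment 1.  The real encoder derives the q delta from
   * compute_qdelta() rather than using a constant. */
  enable_segfeature(&seg, 1, SEG_LVL_ALT_Q);
  set_segdata(&seg, 1, SEG_LVL_ALT_Q, -8);
  enable_segfeature(&seg, 1, SEG_LVL_ALT_LF);
  set_segdata(&seg, 1, SEG_LVL_ALT_LF, -2);
  if (!segfeature_active(&seg, 1, SEG_LVL_ALT_Q))
    return 1;
  clearall_segfeatures(&seg);
  return 0;
}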
@@ -810,11 +819,13 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_COMP_REFMISMATCH; + FLAG_SKIP_COMP_REFMISMATCH | + FLAG_SKIP_INTRA_LOWVAR | + FLAG_EARLY_TERMINATE; sf->last_chroma_intra_mode = DC_PRED; + sf->use_uv_intra_rd_estimate = 1; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; - sf->use_uv_intra_rd_estimate = 1; sf->using_small_partition_info = 1; sf->disable_splitmv = (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; @@ -831,7 +842,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_COMP_REFMISMATCH; + FLAG_SKIP_COMP_REFMISMATCH | + FLAG_SKIP_INTRA_LOWVAR | + FLAG_EARLY_TERMINATE; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; sf->disable_splitmv = 1; @@ -840,7 +853,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { if (speed == 4) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; sf->use_one_partition_size_always = 1; - sf->always_this_block_size = BLOCK_SIZE_MB16X16; + sf->always_this_block_size = BLOCK_16X16; sf->tx_size_search_method = ((cpi->common.frame_type == KEY_FRAME || cpi->common.intra_only || cpi->common.show_frame == 0) ? @@ -849,7 +862,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->mode_search_skip_flags = FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | - FLAG_SKIP_COMP_REFMISMATCH; + FLAG_SKIP_COMP_REFMISMATCH | + FLAG_SKIP_INTRA_LOWVAR | + FLAG_EARLY_TERMINATE; sf->use_rd_breakout = 1; sf->optimize_coefficients = 0; sf->auto_mv_step_size = 1; @@ -861,15 +876,15 @@ void vp9_set_speed_features(VP9_COMP *cpi) { /* if (speed == 2) { sf->first_step = 0; - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; - sf->use_partitions_less_than = 1; - sf->less_than_block_size = BLOCK_SIZE_MB16X16; + sf->comp_inter_joint_search_thresh = BLOCK_8X8; + sf->use_max_partition_size = 1; + sf->max_partition_size = BLOCK_16X16; } if (speed == 3) { sf->first_step = 0; - sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; - sf->use_partitions_greater_than = 1; - sf->greater_than_block_size = BLOCK_SIZE_SB8X8; + sf->comp_inter_joint_search_thresh = BLOCK_B8X8; + sf->use_min_partition_size = 1; + sf->min_partition_size = BLOCK_8X8; } */ @@ -1383,7 +1398,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cm = &cpi->common; - vpx_memset(cpi, 0, sizeof(VP9_COMP)); + vp9_zero(*cpi); if (setjmp(cm->error.jmp)) { VP9_PTR ptr = ctx.ptr; @@ -1833,7 +1848,10 @@ void vp9_remove_compressor(VP9_PTR *ptr) { { printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); - printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000); + printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, + cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, + cpi->time_compress_data / 1000, + (cpi->time_receive_data + cpi->time_compress_data) / 1000); } #endif @@ -2406,8 +2424,9 @@ static void update_reference_frames(VP9_COMP * const cpi) { static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { MACROBLOCKD *xd = &cpi->mb.e_mbd; + struct loopfilter *lf = &xd->lf; if (xd->lossless) { - xd->lf.filter_level = 0; + 
lf->filter_level = 0; } else { struct vpx_usec_timer timer; @@ -2421,9 +2440,9 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer); } - if (xd->lf.filter_level > 0) { - vp9_set_alt_lf_level(cpi, xd->lf.filter_level); - vp9_loop_filter_frame(cm, xd, xd->lf.filter_level, 0); + if (lf->filter_level > 0) { + vp9_set_alt_lf_level(cpi, lf->filter_level); + vp9_loop_filter_frame(cm, xd, lf->filter_level, 0); } vp9_extend_frame_inner_borders(cm->frame_to_show, @@ -2513,6 +2532,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, SPEED_FEATURES *sf = &cpi->sf; unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height); + struct segmentation *seg = &xd->seg; #if RESET_FOREACH_FILTER int q_low0; int q_high0; @@ -2612,9 +2632,9 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, setup_features(cpi); // If segmentation is enabled force a map update for key frames - if (xd->seg.enabled) { - xd->seg.update_map = 1; - xd->seg.update_data = 1; + if (seg->enabled) { + seg->update_map = 1; + seg->update_data = 1; } // The alternate reference frame cannot be active for a key frame @@ -2818,7 +2838,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, } #endif loop_count = 0; - vpx_memset(cpi->rd_tx_select_threshes, 0, sizeof(cpi->rd_tx_select_threshes)); + vp9_zero(cpi->rd_tx_select_threshes); if (cm->frame_type != KEY_FRAME) { /* TODO: Decide this more intelligently */ @@ -3173,7 +3193,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, if (!cpi->common.error_resilient_mode && !cpi->common.frame_parallel_decoding_mode) { vp9_adapt_mode_probs(&cpi->common); - vp9_adapt_mode_context(&cpi->common); vp9_adapt_mv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); } } @@ -3994,7 +4013,7 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, unsigned int threshold[MAX_SEGMENTS]) { VP9_COMP *cpi = (VP9_COMP *) comp; signed char feature_data[SEG_LVL_MAX][MAX_SEGMENTS]; - MACROBLOCKD *xd = &cpi->mb.e_mbd; + struct segmentation *seg = &cpi->mb.e_mbd.seg; int i; if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols) @@ -4021,14 +4040,14 @@ int vp9_set_roimap(VP9_PTR comp, unsigned char *map, unsigned int rows, // Enable the loop and quant changes in the feature mask for (i = 0; i < MAX_SEGMENTS; i++) { if (delta_q[i]) - vp9_enable_segfeature(&xd->seg, i, SEG_LVL_ALT_Q); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q); else - vp9_disable_segfeature(&xd->seg, i, SEG_LVL_ALT_Q); + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q); if (delta_lf[i]) - vp9_enable_segfeature(&xd->seg, i, SEG_LVL_ALT_LF); + vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF); else - vp9_disable_segfeature(&xd->seg, i, SEG_LVL_ALT_LF); + vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF); } // Initialise the feature data structure diff --git a/libvpx/vp9/encoder/vp9_onyx_int.h b/libvpx/vp9/encoder/vp9_onyx_int.h index 0798927bd..c258829c2 100644 --- a/libvpx/vp9/encoder/vp9_onyx_int.h +++ b/libvpx/vp9/encoder/vp9_onyx_int.h @@ -77,7 +77,7 @@ typedef struct { // 0 = ZERO_MV, MV signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; - vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES]; + vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES]; vp9_prob y_mode_prob[4][VP9_INTRA_MODES - 1]; vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; @@ -145,6 +145,8 @@ typedef struct { // const MODE_DEFINITION vp9_mode_order[MAX_MODES] used in the rd code. 
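Note: several hunks in this change replace raw vpx_memset(ptr, 0, sizeof(...)) calls with vp9_zero(obj) (the compressor struct, rd_tx_select_threshes, and later the BEST_SEG_INFO reset). Presumably vp9_zero is a sizeof-based wrapper; the macro below is a sketch of that idea rather than a quote of the libvpx header.

#include <string.h>

/* Sketch of a sizeof-based zeroing macro in the spirit of vp9_zero(): taking
 * the object rather than a pointer lets sizeof pick up the whole type, which
 * avoids the memset(ptr, 0, sizeof(ptr)) class of mistake the raw calls were
 * exposed to. */
#define zero_struct(obj) memset(&(obj), 0, sizeof(obj))

struct rd_threshes { int v[4][8]; };

int main(void) {
  struct rd_threshes threshes;
  zero_struct(threshes);   /* same effect as memset(&threshes, 0, sizeof(threshes)) */
  return threshes.v[3][7];
}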
typedef enum { THR_NEARESTMV, + THR_DC, + THR_NEARESTA, THR_NEARESTG, THR_NEWMV, @@ -152,8 +154,6 @@ typedef enum { THR_NEARMV, THR_COMP_NEARESTGA, - THR_DC, - THR_NEWG, THR_NEWA, THR_NEARA, @@ -224,6 +224,10 @@ typedef enum { // skips oblique intra modes at angles 27, 63, 117, 153 if the best // intra so far is not one of the neighboring directions FLAG_SKIP_INTRA_DIRMISMATCH = 16, + + // skips intra modes other than DC_PRED if the source variance + // is small + FLAG_SKIP_INTRA_LOWVAR = 32, } MODE_SEARCH_SKIP_LOGIC; typedef struct { @@ -258,10 +262,13 @@ typedef struct { int unused_mode_skip_lvl; int reference_masking; BLOCK_SIZE_TYPE always_this_block_size; - int use_partitions_greater_than; - BLOCK_SIZE_TYPE greater_than_block_size; - int use_partitions_less_than; - BLOCK_SIZE_TYPE less_than_block_size; + int auto_min_max_partition_size; + int auto_min_max_partition_interval; + int auto_min_max_partition_count; + BLOCK_SIZE_TYPE min_partition_size; + BLOCK_SIZE_TYPE max_partition_size; + // int use_min_partition_size; // not used in code + // int use_max_partition_size; int adjust_partitioning_from_last_frame; int last_partitioning_redo_frequency; int disable_splitmv; @@ -370,9 +377,9 @@ typedef struct VP9_COMP { unsigned int single_ref_count[REF_CONTEXTS][2][2]; unsigned int comp_ref_count[REF_CONTEXTS][2]; - int64_t rd_tx_select_diff[NB_TXFM_MODES]; + int64_t rd_tx_select_diff[TX_MODES]; // FIXME(rbultje) can this overflow? - int rd_tx_select_threshes[4][NB_TXFM_MODES]; + int rd_tx_select_threshes[4][TX_MODES]; int64_t rd_filter_diff[VP9_SWITCHABLE_FILTERS + 1]; int64_t rd_filter_threshes[4][VP9_SWITCHABLE_FILTERS + 1]; @@ -457,9 +464,9 @@ typedef struct VP9_COMP { nmv_context_counts NMVcount; - vp9_coeff_count coef_counts[TX_SIZE_MAX_SB][BLOCK_TYPES]; - vp9_coeff_probs_model frame_coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES]; - vp9_coeff_stats frame_branch_ct[TX_SIZE_MAX_SB][BLOCK_TYPES]; + vp9_coeff_count coef_counts[TX_SIZES][BLOCK_TYPES]; + vp9_coeff_probs_model frame_coef_probs[TX_SIZES][BLOCK_TYPES]; + vp9_coeff_stats frame_branch_ct[TX_SIZES][BLOCK_TYPES]; int gfu_boost; int last_boost; @@ -527,7 +534,7 @@ typedef struct VP9_COMP { uint64_t time_receive_data; uint64_t time_compress_data; uint64_t time_pick_lpf; - uint64_t time_encode_mb_row; + uint64_t time_encode_sb_row; struct twopass_rc { unsigned int section_intra_rating; @@ -619,7 +626,7 @@ typedef struct VP9_COMP { unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1] [VP9_SWITCHABLE_FILTERS]; - unsigned int txfm_stepdown_count[TX_SIZE_MAX_SB]; + unsigned int txfm_stepdown_count[TX_SIZES]; int initial_width; int initial_height; diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index 843cf3f03..2d932500e 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -54,6 +54,8 @@ DECLARE_ALIGNED(16, extern const uint8_t, const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { {NEARESTMV, LAST_FRAME, NONE}, + {DC_PRED, INTRA_FRAME, NONE}, + {NEARESTMV, ALTREF_FRAME, NONE}, {NEARESTMV, GOLDEN_FRAME, NONE}, {NEWMV, LAST_FRAME, NONE}, @@ -61,8 +63,6 @@ const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { {NEARMV, LAST_FRAME, NONE}, {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME}, - {DC_PRED, INTRA_FRAME, NONE}, - {NEWMV, GOLDEN_FRAME, NONE}, {NEWMV, ALTREF_FRAME, NONE}, {NEARMV, ALTREF_FRAME, NONE}, @@ -109,7 +109,7 @@ static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] = #define MAX_RD_THRESH_FREQ_FACT 32 #define MAX_RD_THRESH_FREQ_INC 1 -static void 
fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES][2], +static void fill_token_costs(vp9_coeff_cost *c, vp9_coeff_probs_model (*p)[BLOCK_TYPES]) { int i, j, k, l; TX_SIZE t; @@ -120,12 +120,12 @@ static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES][2], for (l = 0; l < PREV_COEF_CONTEXTS; l++) { vp9_prob probs[ENTROPY_NODES]; vp9_model_to_full_probs(p[t][i][j][k][l], probs); - vp9_cost_tokens((int *)c[t][i][j][0][k][l], probs, + vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs, vp9_coef_tree); - vp9_cost_tokens_skip((int *)c[t][i][j][1][k][l], probs, + vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs, vp9_coef_tree); - assert(c[t][i][j][0][k][l][DCT_EOB_TOKEN] == - c[t][i][j][1][k][l][DCT_EOB_TOKEN]); + assert(c[t][i][j][k][0][l][DCT_EOB_TOKEN] == + c[t][i][j][k][1][l][DCT_EOB_TOKEN]); } } @@ -453,7 +453,7 @@ static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, int *out_rate_sum, int64_t *out_dist_sum, int *out_skip) { int t = 4, j, k; - BLOCK_SIZE_TYPE bs = BLOCK_SIZE_AB4X4; + BLOCK_SIZE_TYPE bs = BLOCK_4X4; struct macroblock_plane *const p = &x->plane[0]; struct macroblockd_plane *const pd = &xd->plane[0]; const int width = plane_block_width(bsize, pd); @@ -513,14 +513,19 @@ int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, return error; } -static const int16_t band_counts[TX_SIZE_MAX_SB][8] = { - { 1, 2, 3, 4, 3, 16 - 13 }, - { 1, 2, 3, 4, 11, 64 - 21 }, - { 1, 2, 3, 4, 11, 256 - 21 }, - { 1, 2, 3, 4, 11, 1024 - 21 }, +/* The trailing '0' is a terminator which is used inside cost_coeffs() to + * decide whether to include cost of a trailing EOB node or not (i.e. we + * can skip this if the last coefficient in this transform block, e.g. the + * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block, + * were non-zero). 
*/ +static const int16_t band_counts[TX_SIZES][8] = { + { 1, 2, 3, 4, 3, 16 - 13, 0 }, + { 1, 2, 3, 4, 11, 64 - 21, 0 }, + { 1, 2, 3, 4, 11, 256 - 21, 0 }, + { 1, 2, 3, 4, 11, 1024 - 21, 0 }, }; -static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, +static INLINE int cost_coeffs(MACROBLOCK *mb, int plane, int block, PLANE_TYPE type, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L, TX_SIZE tx_size, @@ -528,11 +533,11 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, MACROBLOCKD *const xd = &mb->e_mbd; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; int pt, c, cost; - const int16_t *band_count = band_counts[tx_size]; + const int16_t *band_count = &band_counts[tx_size][1]; const int eob = xd->plane[plane].eobs[block]; const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); const int ref = mbmi->ref_frame[0] != INTRA_FRAME; - unsigned int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS] + unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref]; ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L; uint8_t token_cache[1024]; @@ -552,13 +557,14 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, cost = token_costs[0][0][pt][DCT_EOB_TOKEN]; c = 0; } else { - int v, prev_t, band = 1, band_left = band_count[1]; + int v, prev_t, band_left = *band_count++; // dc token v = qcoeff_ptr[0]; prev_t = vp9_dct_value_tokens_ptr[v].token; - cost = token_costs[0][0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; + cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v]; token_cache[0] = vp9_pt_energy_class[prev_t]; + ++token_costs; // ac tokens for (c = 1; c < eob; c++) { @@ -568,18 +574,19 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, v = qcoeff_ptr[rc]; t = vp9_dct_value_tokens_ptr[v].token; pt = get_coef_context(nb, token_cache, c); - cost += token_costs[!prev_t][band][pt][t] + vp9_dct_value_cost_ptr[v]; + cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v]; token_cache[rc] = vp9_pt_energy_class[t]; prev_t = t; if (!--band_left) { - band_left = band_count[++band]; + band_left = *band_count++; + ++token_costs; } } // eob token - if (band < 6) { + if (band_left) { pt = get_coef_context(nb, token_cache, c); - cost += token_costs[0][band][pt][DCT_EOB_TOKEN]; + cost += (*token_costs)[0][pt][DCT_EOB_TOKEN]; } } @@ -639,7 +646,7 @@ static void rate_block(int plane, int block, BLOCK_SIZE_TYPE bsize, txfrm_block_to_raster_xy(xd, bsize, plane, block, args->tx_size * 2, &x_idx, &y_idx); - args->rate += cost_coeffs(args->cm, args->x, plane, block, + args->rate += cost_coeffs(args->x, plane, block, xd->plane[plane].plane_type, args->t_above + x_idx, args->t_left + y_idx, args->tx_size, args->scan, args->nb); @@ -831,7 +838,7 @@ static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x, int64_t ref_best_rd, BLOCK_SIZE_TYPE bs) { const TX_SIZE max_txfm_size = TX_32X32 - - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); + - (bs < BLOCK_32X32) - (bs < BLOCK_16X16); VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; @@ -859,25 +866,25 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int (*r)[2], int *rate, int64_t *d, int64_t *distortion, int *s, int *skip, - int64_t txfm_cache[NB_TXFM_MODES], + int64_t tx_cache[TX_MODES], BLOCK_SIZE_TYPE bs) { - const TX_SIZE max_txfm_size = TX_32X32 - - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); + const 
TX_SIZE max_tx_size = TX_32X32 + - (bs < BLOCK_32X32) - (bs < BLOCK_16X16); VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); - int64_t rd[TX_SIZE_MAX_SB][2]; + int64_t rd[TX_SIZES][2]; int n, m; int s0, s1; const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs); - for (n = TX_4X4; n <= max_txfm_size; n++) { + for (n = TX_4X4; n <= max_tx_size; n++) { r[n][1] = r[n][0]; if (r[n][0] == INT_MAX) continue; - for (m = 0; m <= n - (n == max_txfm_size); m++) { + for (m = 0; m <= n - (n == max_tx_size); m++) { if (m == n) r[n][1] += vp9_cost_zero(tx_probs[m]); else @@ -889,7 +896,7 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); - for (n = TX_4X4; n <= max_txfm_size; n++) { + for (n = TX_4X4; n <= max_tx_size; n++) { if (d[n] == INT64_MAX) { rd[n][0] = rd[n][1] = INT64_MAX; continue; @@ -902,13 +909,13 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, } } - if (max_txfm_size == TX_32X32 && + if (max_tx_size == TX_32X32 && (cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && rd[TX_32X32][1] < rd[TX_4X4][1]))) { mbmi->txfm_size = TX_32X32; - } else if (max_txfm_size >= TX_16X16 && + } else if (max_tx_size >= TX_16X16 && (cm->tx_mode == ALLOW_16X16 || cm->tx_mode == ALLOW_32X32 || (cm->tx_mode == TX_MODE_SELECT && @@ -928,34 +935,34 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, *rate = r[mbmi->txfm_size][cm->tx_mode == TX_MODE_SELECT]; *skip = s[mbmi->txfm_size]; - txfm_cache[ONLY_4X4] = rd[TX_4X4][0]; - txfm_cache[ALLOW_8X8] = rd[TX_8X8][0]; - txfm_cache[ALLOW_16X16] = rd[MIN(max_txfm_size, TX_16X16)][0]; - txfm_cache[ALLOW_32X32] = rd[MIN(max_txfm_size, TX_32X32)][0]; - if (max_txfm_size == TX_32X32 && + tx_cache[ONLY_4X4] = rd[TX_4X4][0]; + tx_cache[ALLOW_8X8] = rd[TX_8X8][0]; + tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0]; + tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0]; + if (max_tx_size == TX_32X32 && rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && rd[TX_32X32][1] < rd[TX_4X4][1]) - txfm_cache[TX_MODE_SELECT] = rd[TX_32X32][1]; - else if (max_txfm_size >= TX_16X16 && + tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1]; + else if (max_tx_size >= TX_16X16 && rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1]) - txfm_cache[TX_MODE_SELECT] = rd[TX_16X16][1]; + tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1]; else - txfm_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ? + tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ? 
rd[TX_4X4][1] : rd[TX_8X8][1]; - if (max_txfm_size == TX_32X32 && + if (max_tx_size == TX_32X32 && rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] && rd[TX_32X32][1] < rd[TX_4X4][1]) { cpi->txfm_stepdown_count[0]++; - } else if (max_txfm_size >= TX_16X16 && + } else if (max_tx_size >= TX_16X16 && rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++; + cpi->txfm_stepdown_count[max_tx_size - TX_16X16]++; } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) { - cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++; + cpi->txfm_stepdown_count[max_tx_size - TX_8X8]++; } else { - cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++; + cpi->txfm_stepdown_count[max_tx_size - TX_4X4]++; } } @@ -967,16 +974,16 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE_TYPE bs, int *model_used) { const TX_SIZE max_txfm_size = TX_32X32 - - (bs < BLOCK_SIZE_SB32X32) - (bs < BLOCK_SIZE_MB16X16); + - (bs < BLOCK_32X32) - (bs < BLOCK_16X16); VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd); - int64_t rd[TX_SIZE_MAX_SB][2]; + int64_t rd[TX_SIZES][2]; int n, m; int s0, s1; - double scale_rd[TX_SIZE_MAX_SB] = {1.73, 1.44, 1.20, 1.00}; - // double scale_r[TX_SIZE_MAX_SB] = {2.82, 2.00, 1.41, 1.00}; + double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00}; + // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00}; const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs); @@ -1065,11 +1072,11 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x, static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, int64_t *psse, BLOCK_SIZE_TYPE bs, - int64_t txfm_cache[NB_TXFM_MODES], + int64_t txfm_cache[TX_MODES], int64_t ref_best_rd) { VP9_COMMON *const cm = &cpi->common; - int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB]; - int64_t d[TX_SIZE_MAX_SB], sse[TX_SIZE_MAX_SB]; + int r[TX_SIZES][2], s[TX_SIZES]; + int64_t d[TX_SIZES], sse[TX_SIZES]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; @@ -1080,7 +1087,7 @@ static void super_block_yrd(VP9_COMP *cpi, if (cpi->sf.tx_size_search_method == USE_LARGESTALL || (cpi->sf.tx_size_search_method != USE_FULL_RD && mbmi->ref_frame[0] == INTRA_FRAME)) { - vpx_memset(txfm_cache, 0, NB_TXFM_MODES * sizeof(int64_t)); + vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t)); choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse, ref_best_rd, bs); if (psse) @@ -1090,49 +1097,47 @@ static void super_block_yrd(VP9_COMP *cpi, if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER && mbmi->ref_frame[0] > INTRA_FRAME) { - int model_used[TX_SIZE_MAX_SB] = {1, 1, 1, 1}; - if (bs >= BLOCK_SIZE_SB32X32) { - if (model_used[TX_32X32]) { + int model_used[TX_SIZES] = {1, 1, 1, 1}; + if (bs >= BLOCK_32X32) { + if (model_used[TX_32X32]) model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); - } else { + else super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], &sse[TX_32X32], INT64_MAX, bs, TX_32X32); - } } - if (bs >= BLOCK_SIZE_MB16X16) { - if (model_used[TX_16X16]) { + if (bs >= BLOCK_16X16) { + if (model_used[TX_16X16]) model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); - } else { + else super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], 
&d[TX_16X16], &s[TX_16X16], &sse[TX_16X16], INT64_MAX, bs, TX_16X16); - } } - if (model_used[TX_8X8]) { + if (model_used[TX_8X8]) model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); - } else { + else super_block_yrd_for_txfm(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], &sse[TX_8X8], INT64_MAX, bs, TX_8X8); - } - if (model_used[TX_4X4]) { + + if (model_used[TX_4X4]) model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); - } else { + else super_block_yrd_for_txfm(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], &sse[TX_4X4], INT64_MAX, bs, TX_4X4); - } + choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s, skip, sse, ref_best_rd, bs, model_used); } else { - if (bs >= BLOCK_SIZE_SB32X32) + if (bs >= BLOCK_32X32) super_block_yrd_for_txfm(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], &sse[TX_32X32], ref_best_rd, bs, TX_32X32); - if (bs >= BLOCK_SIZE_MB16X16) + if (bs >= BLOCK_16X16) super_block_yrd_for_txfm(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], &sse[TX_16X16], ref_best_rd, bs, TX_16X16); @@ -1174,28 +1179,30 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, int64_t *bestdistortion, - BLOCK_SIZE_TYPE bsize) { + BLOCK_SIZE_TYPE bsize, + int64_t rd_thresh) { MB_PREDICTION_MODE mode; MACROBLOCKD *xd = &x->e_mbd; - int64_t best_rd = INT64_MAX; + int64_t best_rd = rd_thresh; int rate = 0; int64_t distortion; - VP9_COMMON *const cm = &cpi->common; struct macroblock_plane *p = &x->plane[0]; struct macroblockd_plane *pd = &xd->plane[0]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; - uint8_t *src, *dst; + uint8_t *src_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib, + p->src.buf, src_stride); + uint8_t *dst_init = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, ib, + pd->dst.buf, dst_stride); int16_t *src_diff, *coeff; ENTROPY_CONTEXT ta[2], tempa[2]; ENTROPY_CONTEXT tl[2], templ[2]; TX_TYPE tx_type = DCT_DCT; - TX_TYPE best_tx_type = DCT_DCT; - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy, block; - DECLARE_ALIGNED(16, int16_t, best_dqcoeff[4][16]); + uint8_t best_dst[8 * 8]; assert(ib < 4); @@ -1223,17 +1230,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { int64_t ssz; const int16_t *scan; + uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride; + uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride; block = ib + idy * 2 + idx; xd->mode_info_context->bmi[block].as_mode = mode; - src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - p->src.buf, src_stride); src_diff = raster_block_offset_int16(xd, BLOCK_SIZE_SB8X8, 0, block, p->src_diff); coeff = BLOCK_OFFSET(x->plane[0].coeff, block, 16); - dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - pd->dst.buf, dst_stride); - vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), + vp9_predict_intra_block(xd, block, 1, TX_4X4, mode, x->skip_encode ? src : dst, x->skip_encode ? 
src_stride : dst_stride, @@ -1252,12 +1257,14 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, } scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block)); - ratey += cost_coeffs(cm, x, 0, block, PLANE_TYPE_Y_WITH_DC, + ratey += cost_coeffs(x, 0, block, PLANE_TYPE_Y_WITH_DC, tempa + idx, templ + idy, TX_4X4, scan, vp9_get_coef_neighbors_handle(scan)); distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block, 16), 16, &ssz) >> 2; + if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd) + goto next; if (tx_type != DCT_DCT) vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block, 16), @@ -1277,61 +1284,40 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, *bestdistortion = distortion; best_rd = this_rd; *best_mode = mode; - best_tx_type = tx_type; vpx_memcpy(a, tempa, sizeof(tempa)); vpx_memcpy(l, templ, sizeof(templ)); - for (idy = 0; idy < num_4x4_blocks_high; ++idy) { - for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { - block = ib + idy * 2 + idx; - vpx_memcpy(best_dqcoeff[idy * 2 + idx], - BLOCK_OFFSET(pd->dqcoeff, block, 16), - sizeof(best_dqcoeff[0])); - } - } + for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) + vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride, + num_4x4_blocks_wide * 4); } + next: + {} } - if (x->skip_encode) + if (best_rd >= rd_thresh || x->skip_encode) return best_rd; - for (idy = 0; idy < num_4x4_blocks_high; ++idy) { - for (idx = 0; idx < num_4x4_blocks_wide; ++idx) { - block = ib + idy * 2 + idx; - xd->mode_info_context->bmi[block].as_mode = *best_mode; - src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - p->src.buf, src_stride); - dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, block, - pd->dst.buf, dst_stride); - - vp9_predict_intra_block(xd, block, b_width_log2(BLOCK_SIZE_SB8X8), TX_4X4, - *best_mode, - x->skip_encode ? src : dst, - x->skip_encode ? src_stride : dst_stride, - dst, dst_stride); - // inverse transform - if (best_tx_type != DCT_DCT) - vp9_short_iht4x4_add(best_dqcoeff[idy * 2 + idx], dst, - dst_stride, best_tx_type); - else - xd->inv_txm4x4_add(best_dqcoeff[idy * 2 + idx], dst, - dst_stride); - } - } + for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) + vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8, + num_4x4_blocks_wide * 4); return best_rd; } -static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, - int *Rate, int *rate_y, - int64_t *Distortion, int64_t best_rd) { +static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi, + MACROBLOCK * const mb, + int * const rate, + int * const rate_y, + int64_t * const distortion, + int64_t best_rd) { int i, j; MACROBLOCKD *const xd = &mb->e_mbd; BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; int idx, idy; int cost = 0; - int64_t distortion = 0; + int64_t total_distortion = 0; int tot_rate_y = 0; int64_t total_rd = 0; ENTROPY_CONTEXT t_above[4], t_left[4]; @@ -1343,12 +1329,13 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, bmode_costs = mb->mbmode_cost; + // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block. 
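Note: the rd_pick_intra4x4block() changes above make the search threshold-driven: best_rd now starts at the caller-supplied rd_thresh, the per-block loop bails out with goto next as soon as the accumulated RDCOST can no longer win, and a return value still >= rd_thresh tells the caller nothing useful was found. A stripped-down sketch of that control flow, with a made-up cost model (rd_cost_so_far) standing in for the rate/distortion accumulation:

#include <stdint.h>

#define NUM_MODES 10

static int64_t rd_cost_so_far(int mode, int step) {
  return (int64_t)(mode + 1) * (step + 1) * 25;     /* pretend partial RD cost */
}

static int64_t pick_best_mode(int64_t rd_thresh, int *best_mode) {
  int64_t best_rd = rd_thresh;                      /* nothing beats this yet */
  int mode;
  for (mode = 0; mode < NUM_MODES; ++mode) {
    int64_t this_rd = 0;
    int step;
    for (step = 0; step < 4; ++step) {
      this_rd = rd_cost_so_far(mode, step);
      if (this_rd >= best_rd)
        goto next;                                  /* cannot beat best_rd */
    }
    best_rd = this_rd;
    *best_mode = mode;
  next:
    ;
  }
  return best_rd;                                   /* == rd_thresh on failure */
}

int main(void) {
  int best_mode = -1;
  return pick_best_mode(1000, &best_mode) < 1000 ? 0 : 1;
}

The following hunk threads the same idea through the caller, handing each sub-block search the budget that is still left (best_rd - total_rd) and giving up on the whole 8x8 block as soon as one sub-block cannot fit inside it.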
for (idy = 0; idy < 2; idy += num_4x4_blocks_high) { for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) { const int mis = xd->mode_info_stride; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry); - int64_t UNINITIALIZED_IS_SAFE(d); + int64_t UNINITIALIZED_IS_SAFE(d), this_rd; i = idy * 2 + idx; if (cpi->common.frame_type == KEY_FRAME) { @@ -1359,11 +1346,16 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, bmode_costs = mb->y_mode_costs[A][L]; } - total_rd += rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs, - t_above + idx, t_left + idy, - &r, &ry, &d, bsize); + this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs, + t_above + idx, t_left + idy, + &r, &ry, &d, bsize, + best_rd - total_rd); + if (this_rd >= best_rd - total_rd) + return INT64_MAX; + + total_rd += this_rd; cost += r; - distortion += d; + total_distortion += d; tot_rate_y += ry; mic->bmi[i].as_mode = best_mode; @@ -1377,19 +1369,19 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, } } - *Rate = cost; + *rate = cost; *rate_y = tot_rate_y; - *Distortion = distortion; + *distortion = total_distortion; xd->mode_info_context->mbmi.mode = mic->bmi[3].as_mode; - return RDCOST(mb->rdmult, mb->rddiv, cost, distortion); + return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion); } static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize, - int64_t txfm_cache[NB_TXFM_MODES], + int64_t tx_cache[TX_MODES], int64_t best_rd) { MB_PREDICTION_MODE mode; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); @@ -1400,14 +1392,13 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int i; int *bmode_costs = x->mbmode_cost; - if (cpi->sf.tx_size_search_method == USE_FULL_RD) { - for (i = 0; i < NB_TXFM_MODES; i++) - txfm_cache[i] = INT64_MAX; - } + if (cpi->sf.tx_size_search_method == USE_FULL_RD) + for (i = 0; i < TX_MODES; i++) + tx_cache[i] = INT64_MAX; - /* Y Search for 32x32 intra prediction mode */ + /* Y Search for intra prediction mode */ for (mode = DC_PRED; mode <= TM_PRED; mode++) { - int64_t local_txfm_cache[NB_TXFM_MODES]; + int64_t local_tx_cache[TX_MODES]; MODE_INFO *const mic = xd->mode_info_context; const int mis = xd->mode_info_stride; @@ -1421,7 +1412,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, x->e_mbd.mode_info_context->mbmi.mode = mode; super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, - bsize, local_txfm_cache, best_rd); + bsize, local_tx_cache, best_rd); if (this_rate_tokenonly == INT_MAX) continue; @@ -1440,11 +1431,11 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) { - for (i = 0; i < NB_TXFM_MODES; i++) { - int64_t adj_rd = this_rd + local_txfm_cache[i] - - local_txfm_cache[cpi->common.tx_mode]; - if (adj_rd < txfm_cache[i]) { - txfm_cache[i] = adj_rd; + for (i = 0; i < TX_MODES; i++) { + const int64_t adj_rd = this_rd + local_tx_cache[i] - + local_tx_cache[cpi->common.tx_mode]; + if (adj_rd < tx_cache[i]) { + tx_cache[i] = adj_rd; } } } @@ -1537,8 +1528,6 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x, x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED]; this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion); - x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED; - return this_rd; } @@ -1609,8 
+1598,8 @@ static int labels2mode(MACROBLOCK *x, int i, MB_MODE_INFO * mbmi = &mic->mbmi; int cost = 0, thismvcost = 0; int idx, idy; - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type]; /* We have to be careful retrieving previously-encoded motion vectors. Ones from this macroblock have to be pulled from the BLOCKD array @@ -1623,12 +1612,11 @@ static int labels2mode(MACROBLOCK *x, int i, case NEWMV: this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int; thismvcost = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost, - 102, xd->allow_high_precision_mv); + 102); if (mbmi->ref_frame[1] > 0) { this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int; thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv, - mvjcost, mvcost, 102, - xd->allow_high_precision_mv); + mvjcost, mvcost, 102); } break; case NEARESTMV: @@ -1678,11 +1666,12 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl) { int k; - VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - const int width = plane_block_width(bsize, &xd->plane[0]); - const int height = plane_block_height(bsize, &xd->plane[0]); + struct macroblockd_plane *const pd = &xd->plane[0]; + MODE_INFO *const mi = xd->mode_info_context; + const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const int width = plane_block_width(bsize, pd); + const int height = plane_block_height(bsize, pd); int idx, idy; const int src_stride = x->plane[0].src.stride; uint8_t* const src = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, @@ -1692,39 +1681,33 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, x->plane[0].src_diff); int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, i); uint8_t* const pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - xd->plane[0].pre[0].buf, - xd->plane[0].pre[0].stride); + pd->pre[0].buf, + pd->pre[0].stride); uint8_t* const dst = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - xd->plane[0].dst.buf, - xd->plane[0].dst.stride); + pd->dst.buf, + pd->dst.stride); int64_t thisdistortion = 0, thissse = 0; int thisrate = 0; - vp9_build_inter_predictor(pre, - xd->plane[0].pre[0].stride, - dst, - xd->plane[0].dst.stride, - &xd->mode_info_context->bmi[i].as_mv[0], + vp9_build_inter_predictor(pre, pd->pre[0].stride, + dst, pd->dst.stride, + &mi->bmi[i].as_mv[0].as_mv, &xd->scale_factor[0], - width, height, 0, &xd->subpix, - MV_PRECISION_Q3); + width, height, 0, &xd->subpix, MV_PRECISION_Q3); - if (xd->mode_info_context->mbmi.ref_frame[1] > 0) { + if (mi->mbmi.ref_frame[1] > 0) { uint8_t* const second_pre = raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, - xd->plane[0].pre[1].buf, - xd->plane[0].pre[1].stride); - vp9_build_inter_predictor(second_pre, xd->plane[0].pre[1].stride, - dst, xd->plane[0].dst.stride, - &xd->mode_info_context->bmi[i].as_mv[1], + pd->pre[1].buf, pd->pre[1].stride); + vp9_build_inter_predictor(second_pre, pd->pre[1].stride, + dst, pd->dst.stride, + &mi->bmi[i].as_mv[1].as_mv, &xd->scale_factor[1], - width, height, 1, - &xd->subpix, MV_PRECISION_Q3); + width, height, 1, &xd->subpix, MV_PRECISION_Q3); } - vp9_subtract_block(height, width, src_diff, 8, - src, src_stride, - dst, xd->plane[0].dst.stride); + vp9_subtract_block(height, 
width, src_diff, 8, src, src_stride, + dst, pd->dst.stride); k = i; for (idy = 0; idy < height / 4; ++idy) { @@ -1737,11 +1720,10 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, coeff = BLOCK_OFFSET(x->plane[0].coeff, 16, k); x->fwd_txm4x4(src_diff, coeff, 16); x->quantize_b_4x4(x, k, DCT_DCT, 16); - thisdistortion += vp9_block_error(coeff, - BLOCK_OFFSET(xd->plane[0].dqcoeff, - k, 16), 16, &ssz); + thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k, 16), + 16, &ssz); thissse += ssz; - thisrate += cost_coeffs(cm, x, 0, k, PLANE_TYPE_Y_WITH_DC, + thisrate += cost_coeffs(x, 0, k, PLANE_TYPE_Y_WITH_DC, ta + (k & 1), tl + (k >> 1), TX_4X4, vp9_default_scan_4x4, @@ -1836,8 +1818,8 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, int label_mv_thresh; int segmentyrate = 0; BLOCK_SIZE_TYPE bsize = mbmi->sb_type; - int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; - int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; + const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; + const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; vp9_variance_fn_ptr_t *v_fn_ptr; ENTROPY_CONTEXT t_above[2], t_left[2]; BEST_SEG_INFO *bsi = bsi_buf + filter_idx; @@ -1871,12 +1853,12 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, &frame_mv[NEARESTMV][mbmi->ref_frame[0]], &frame_mv[NEARMV][mbmi->ref_frame[0]], - i, 0); + i, 0, mi_row, mi_col); if (mbmi->ref_frame[1] > 0) vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, &frame_mv[NEARESTMV][mbmi->ref_frame[1]], &frame_mv[NEARMV][mbmi->ref_frame[1]], - i, 1); + i, 1, mi_row, mi_col); // search for the best motion vector on this segment for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { @@ -1984,7 +1966,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // Should we do a full search (best quality only) if (cpi->compressor_speed == 0) { /* Check if mvp_full is within the range. */ - clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, + clamp_mv(&mvp_full.as_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); thissme = cpi->full_search_sad(x, &mvp_full, @@ -2204,7 +2186,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi = &mi->mbmi; int mode_idx; - vpx_memset(bsi, 0, sizeof(*bsi)); + vp9_zero(*bsi); bsi->segment_rd = best_rd; bsi->ref_mv = best_ref_mv; @@ -2358,7 +2340,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int_mv *ref_mv, int_mv *second_ref_mv, int64_t comp_pred_diff[NB_PREDICTION_TYPES], - int64_t txfm_size_diff[NB_TXFM_MODES], + int64_t tx_size_diff[TX_MODES], int64_t best_filter_diff[VP9_SWITCHABLE_FILTERS + 1]) { MACROBLOCKD *const xd = &x->e_mbd; @@ -2380,7 +2362,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, // FIXME(rbultje) does this memcpy the whole array? 
I believe sizeof() // doesn't actually work this way - memcpy(ctx->txfm_rd_diff, txfm_size_diff, sizeof(ctx->txfm_rd_diff)); + memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff)); memcpy(ctx->best_filter_diff, best_filter_diff, sizeof(*best_filter_diff) * (VP9_SWITCHABLE_FILTERS + 1)); } @@ -2444,7 +2426,7 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, xd->prev_mode_info_context, frame_type, mbmi->ref_mvs[frame_type], - cpi->common.ref_frame_sign_bias); + cpi->common.ref_frame_sign_bias, mi_row, mi_col); // Candidate refinement carried out at encoder and decoder vp9_find_best_ref_mvs(xd, @@ -2469,7 +2451,7 @@ static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { return scaled_ref_frame; } -static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) { +static INLINE int get_switchable_rate(MACROBLOCK *x) { MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; @@ -2575,7 +2557,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } *rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv, x->nmvjointcost, x->mvcost, - 96, xd->allow_high_precision_mv); + 96); if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) @@ -2663,7 +2645,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, vp9_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride, second_pred, pw, - &frame_mv[refs[!id]], + &frame_mv[refs[!id]].as_mv, &xd->scale_factor[!id], pw, ph, 0, &xd->subpix, MV_PRECISION_Q3); @@ -2730,12 +2712,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]], &mbmi->ref_mvs[refs[0]][0], - x->nmvjointcost, x->mvcost, 96, - x->e_mbd.allow_high_precision_mv); + x->nmvjointcost, x->mvcost, 96); *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]], &mbmi->ref_mvs[refs[1]][0], - x->nmvjointcost, x->mvcost, 96, - x->e_mbd.allow_high_precision_mv); + x->nmvjointcost, x->mvcost, 96); vpx_free(second_pred); } @@ -2775,46 +2755,36 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int orig_dst_stride[MAX_MB_PLANE]; int rs = 0; - switch (this_mode) { + if (this_mode == NEWMV) { int rate_mv; - case NEWMV: - if (is_comp_pred) { - // Initialize mv using single prediction mode result. - frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; - frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; + if (is_comp_pred) { + // Initialize mv using single prediction mode result. 
+ frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; + frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; - if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { - joint_motion_search(cpi, x, bsize, frame_mv, - mi_row, mi_col, single_newmv, &rate_mv); - } else { - rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]], - &mbmi->ref_mvs[refs[0]][0], - x->nmvjointcost, x->mvcost, 96, - x->e_mbd.allow_high_precision_mv); - rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]], - &mbmi->ref_mvs[refs[1]][0], - x->nmvjointcost, x->mvcost, 96, - x->e_mbd.allow_high_precision_mv); - } - if (frame_mv[refs[0]].as_int == INVALID_MV || - frame_mv[refs[1]].as_int == INVALID_MV) - return INT64_MAX; - *rate2 += rate_mv; + if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + joint_motion_search(cpi, x, bsize, frame_mv, + mi_row, mi_col, single_newmv, &rate_mv); } else { - int_mv tmp_mv; - single_motion_search(cpi, x, bsize, mi_row, mi_col, - &tmp_mv, &rate_mv); - *rate2 += rate_mv; - frame_mv[refs[0]].as_int = - xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int; - single_newmv[refs[0]].as_int = tmp_mv.as_int; + rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]], + &mbmi->ref_mvs[refs[0]][0], + x->nmvjointcost, x->mvcost, 96); + rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]], + &mbmi->ref_mvs[refs[1]][0], + x->nmvjointcost, x->mvcost, 96); } - break; - case NEARMV: - case NEARESTMV: - case ZEROMV: - default: - break; + if (frame_mv[refs[0]].as_int == INVALID_MV || + frame_mv[refs[1]].as_int == INVALID_MV) + return INT64_MAX; + *rate2 += rate_mv; + } else { + int_mv tmp_mv; + single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); + *rate2 += rate_mv; + frame_mv[refs[0]].as_int = + xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int; + single_newmv[refs[0]].as_int = tmp_mv.as_int; + } } // if we're near/nearest and mv == 0,0, compare to zeromv @@ -2856,10 +2826,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < num_refs; ++i) { cur_mv[i] = frame_mv[refs[i]]; // Clip "next_nearest" so that it does not extend to far out of image - if (this_mode == NEWMV) - assert(!clamp_mv2(&cur_mv[i], xd)); - else - clamp_mv2(&cur_mv[i], xd); + if (this_mode != NEWMV) + clamp_mv2(&cur_mv[i].as_mv, xd); if (mv_check_bounds(x, &cur_mv[i])) return INT64_MAX; @@ -2918,7 +2886,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, const int is_intpel_interp = intpel_mv; mbmi->interp_filter = filter; vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); - rs = get_switchable_rate(cm, x); + rs = get_switchable_rate(x); rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); if (interpolating_intpel_seen && is_intpel_interp) { @@ -2995,11 +2963,11 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, xd->plane[i].dst.stride = orig_dst_stride[i]; } } - // Set the appripriate filter + // Set the appropriate filter mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ? cm->mcomp_filter_type : *best_filter; vp9_setup_interp_filters(xd, mbmi->interp_filter, cm); - rs = (cm->mcomp_filter_type == SWITCHABLE ? get_switchable_rate(cm, x) : 0); + rs = cm->mcomp_filter_type == SWITCHABLE ? 
get_switchable_rate(x) : 0; if (pred_exists) { if (best_needs_copy) { @@ -3033,55 +3001,82 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } if (cpi->common.mcomp_filter_type == SWITCHABLE) - *rate2 += get_switchable_rate(cm, x); - - if (cpi->active_map_enabled && x->active_ptr[0] == 0) - x->skip = 1; - else if (x->encode_breakout) { - const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]); - const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize, &xd->plane[1]); - - unsigned int var, sse; - int threshold = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1] >> 4); - - - if (threshold < x->encode_breakout) - threshold = x->encode_breakout; - - var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, - xd->plane[0].dst.buf, xd->plane[0].dst.stride, - &sse); - - if ((int)sse < threshold) { - unsigned int q2dc = xd->plane[0].dequant[0]; - // If there is no codeable 2nd order dc - // or a very small uniform pixel change change - if ((sse - var < q2dc * q2dc >> 4) || - (sse / 2 > var && sse - var < 64)) { - // Check u and v to make sure skip is ok - int sse2; - unsigned int sse2u, sse2v; - var = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, - x->plane[1].src.stride, - xd->plane[1].dst.buf, - xd->plane[1].dst.stride, &sse2u); - var = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, - x->plane[2].src.stride, - xd->plane[2].dst.buf, - xd->plane[2].dst.stride, &sse2v); - sse2 = sse2u + sse2v; - - if (sse2 * 2 < threshold) { - x->skip = 1; - *distortion = sse + sse2; - *rate2 = 500; - - // for best yrd calculation - *rate_uv = 0; - *distortion_uv = sse2; - - *disable_skip = 1; - this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); + *rate2 += get_switchable_rate(x); + + if (!is_comp_pred) { + if (cpi->active_map_enabled && x->active_ptr[0] == 0) + x->skip = 1; + else if (x->encode_breakout) { + const BLOCK_SIZE_TYPE y_size = get_plane_block_size(bsize, &xd->plane[0]); + const BLOCK_SIZE_TYPE uv_size = get_plane_block_size(bsize, + &xd->plane[1]); + unsigned int var, sse; + // Skipping threshold for ac. + unsigned int thresh_ac; + // The encode_breakout input + unsigned int encode_breakout = x->encode_breakout << 4; + + // Calculate threshold according to dequant value. + thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9; + + // Set a maximum for threshold to avoid big PSNR loss in low bitrate case. + if (thresh_ac > 36000) + thresh_ac = 36000; + + // Use encode_breakout input if it is bigger than internal threshold. + if (thresh_ac < encode_breakout) + thresh_ac = encode_breakout; + + var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride, + xd->plane[0].dst.buf, + xd->plane[0].dst.stride, &sse); + + // Adjust threshold according to partition size. 
+ thresh_ac >>= 8 - (b_width_log2_lookup[bsize] + + b_height_log2_lookup[bsize]); + + // Y skipping condition checking + if (sse < thresh_ac || sse == 0) { + // Skipping threshold for dc + unsigned int thresh_dc; + + thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6); + + // dc skipping checking + if ((sse - var) < thresh_dc || sse == var) { + unsigned int sse_u, sse_v; + unsigned int var_u, var_v; + + var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf, + x->plane[1].src.stride, + xd->plane[1].dst.buf, + xd->plane[1].dst.stride, &sse_u); + + // U skipping condition checking + if ((sse_u * 4 < thresh_ac || sse_u == 0) && + (sse_u - var_u < thresh_dc || sse_u == var_u)) { + var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf, + x->plane[2].src.stride, + xd->plane[2].dst.buf, + xd->plane[2].dst.stride, &sse_v); + + // V skipping condition checking + if ((sse_v * 4 < thresh_ac || sse_v == 0) && + (sse_v - var_v < thresh_dc || sse_v == var_v)) { + x->skip = 1; + + *rate2 = 500; + *rate_uv = 0; + + // Scaling factor for SSE from spatial domain to frequency domain + // is 16. Adjust distortion accordingly. + *distortion_uv = (sse_u + sse_v) << 4; + *distortion = (sse << 4) + *distortion_uv; + + *disable_skip = 1; + this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion); + } + } } } } @@ -3133,15 +3128,13 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *const xd = &x->e_mbd; int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0; int y_skip = 0, uv_skip; - int64_t dist_y = 0, dist_uv = 0, txfm_cache[NB_TXFM_MODES]; - + int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 }; x->skip_encode = 0; - vpx_memset(&txfm_cache, 0, sizeof(txfm_cache)); ctx->skip = 0; xd->mode_info_context->mbmi.ref_frame[0] = INTRA_FRAME; if (bsize >= BLOCK_SIZE_SB8X8) { if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly, - &dist_y, &y_skip, bsize, txfm_cache, + &dist_y, &y_skip, bsize, tx_cache, best_rd) >= best_rd) { *returnrate = INT_MAX; return; @@ -3150,8 +3143,8 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &dist_uv, &uv_skip, bsize); } else { y_skip = 0; - if (rd_pick_intra4x4mby_modes(cpi, x, &rate_y, &rate_y_tokenonly, - &dist_y, best_rd) >= best_rd) { + if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly, + &dist_y, best_rd) >= best_rd) { *returnrate = INT_MAX; return; } @@ -3163,17 +3156,15 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly + vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1); *returndist = dist_y + (dist_uv >> 2); - memset(ctx->txfm_rd_diff, 0, sizeof(ctx->txfm_rd_diff)); + vp9_zero(ctx->tx_rd_diff); } else { int i; *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0); *returndist = dist_y + (dist_uv >> 2); - if (cpi->sf.tx_size_search_method == USE_FULL_RD) { - for (i = 0; i < NB_TXFM_MODES; i++) { - ctx->txfm_rd_diff[i] = txfm_cache[i] - txfm_cache[cm->tx_mode]; - } - } + if (cpi->sf.tx_size_search_method == USE_FULL_RD) + for (i = 0; i < TX_MODES; i++) + ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode]; } ctx->mic = *xd->mode_info_context; @@ -3189,9 +3180,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + const struct segmentation *seg = &xd->seg; const BLOCK_SIZE_TYPE block_size = get_plane_block_size(bsize, &xd->plane[0]); 
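Note: the new encode_breakout path above derives its skip thresholds from the dequantizer values: an AC SSE threshold (dequant[1]^2 / 9, capped at 36000, floored by the user's encode_breakout << 4, then scaled by block area) and a DC threshold (dequant[0]^2 >> 6), with the surviving SSE values shifted left by 4 when stored as distortion because SSE moves from the spatial to the transform domain. The helper below reproduces just that arithmetic with made-up inputs; b_width_log2/b_height_log2 are assumed to count in 4x4 units, matching the lookup tables used above.

#include <stdio.h>

static unsigned skip_thresh_ac(unsigned dequant_ac, unsigned encode_breakout,
                               int b_width_log2, int b_height_log2) {
  unsigned thresh_ac = dequant_ac * dequant_ac / 9;
  if (thresh_ac > 36000)
    thresh_ac = 36000;                   /* cap to limit PSNR loss at low rates */
  if (thresh_ac < (encode_breakout << 4))
    thresh_ac = encode_breakout << 4;    /* honour the user's breakout setting */
  /* Larger partitions get a proportionally larger SSE budget. */
  thresh_ac >>= 8 - (b_width_log2 + b_height_log2);
  return thresh_ac;
}

static unsigned skip_thresh_dc(unsigned dequant_dc) {
  return (dequant_dc * dequant_dc) >> 6;
}

int main(void) {
  /* Example inputs: a 64x64 block (log2 widths 4 + 4 in 4x4 units => shift 0),
   * ac dequant 80, dc dequant 60, encode_breakout 100. */
  printf("ac=%u dc=%u\n", skip_thresh_ac(80, 100, 4, 4), skip_thresh_dc(60));
  return 0;
}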
MB_PREDICTION_MODE this_mode; - MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME ref_frame, second_ref_frame; unsigned char segment_id = xd->mode_info_context->mbmi.segment_id; int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; @@ -3205,8 +3197,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, cpi->alt_fb_idx}; int64_t best_rd = best_rd_so_far; int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise - int64_t best_txfm_rd[NB_TXFM_MODES]; - int64_t best_txfm_diff[NB_TXFM_MODES]; + int64_t best_tx_rd[TX_MODES]; + int64_t best_tx_diff[TX_MODES]; int64_t best_pred_diff[NB_PREDICTION_TYPES]; int64_t best_pred_rd[NB_PREDICTION_TYPES]; int64_t best_filter_rd[VP9_SWITCHABLE_FILTERS + 1]; @@ -3222,10 +3214,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // MB_PREDICTION_MODE best_inter_mode = ZEROMV; MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME; INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; - int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB]; - int64_t dist_uv[TX_SIZE_MAX_SB]; - int skip_uv[TX_SIZE_MAX_SB]; - MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB]; + int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES]; + int64_t dist_uv[TX_SIZES]; + int skip_uv[TX_SIZES]; + MB_PREDICTION_MODE mode_uv[TX_SIZES]; struct scale_factors scale_factor[4]; unsigned int ref_frame_mask = 0; unsigned int mode_mask = 0; @@ -3254,7 +3246,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, ctx->frames_with_high_error = 0; ctx->modes_with_high_error = 0; - xd->mode_info_context->mbmi.segment_id = segment_id; estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp, &comp_mode_p); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); @@ -3262,16 +3253,17 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = INT64_MAX; - for (i = 0; i < NB_TXFM_MODES; i++) - best_txfm_rd[i] = INT64_MAX; + for (i = 0; i < TX_MODES; i++) + best_tx_rd[i] = INT64_MAX; for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) best_filter_rd[i] = INT64_MAX; - for (i = 0; i < TX_SIZE_MAX_SB; i++) + for (i = 0; i < TX_SIZES; i++) rate_uv_intra[i] = INT_MAX; *returnrate = INT_MAX; - // Create a mask set to 1 for each frame used by a smaller resolution. + // Create a mask set to 1 for each reference frame used by a smaller + // resolution. if (cpi->sf.use_avoid_tested_higherror) { switch (block_size) { case BLOCK_64X64: @@ -3321,24 +3313,26 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable; - int64_t txfm_cache[NB_TXFM_MODES]; + int64_t tx_cache[TX_MODES]; int i; int this_skip2 = 0; int64_t total_sse = INT_MAX; int early_term = 0; - for (i = 0; i < NB_TXFM_MODES; ++i) - txfm_cache[i] = INT64_MAX; + for (i = 0; i < TX_MODES; ++i) + tx_cache[i] = INT64_MAX; + x->skip = 0; this_mode = vp9_mode_order[mode_index].mode; ref_frame = vp9_mode_order[mode_index].ref_frame; + second_ref_frame = vp9_mode_order[mode_index].second_ref_frame; - // Slip modes that have been masked off but always consider first mode. - if ( mode_index && (bsize > cpi->sf.unused_mode_skip_lvl) && + // Skip modes that have been masked off but always consider first mode. 
+ if (mode_index && (bsize > cpi->sf.unused_mode_skip_lvl) && (cpi->unused_mode_skip_mask & (1 << mode_index)) ) continue; - // Skip if the current refernce frame has been masked off + // Skip if the current reference frame has been masked off if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask && (cpi->ref_frame_mask & (1 << ref_frame))) continue; @@ -3351,12 +3345,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Do not allow compound prediction if the segment level reference // frame feature is in use as in this case there can only be one reference. - if ((vp9_mode_order[mode_index].second_ref_frame > INTRA_FRAME) && - vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME)) + if ((second_ref_frame > INTRA_FRAME) && + vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue; - x->skip = 0; - // Skip some checking based on small partitions' result. if (x->fast_ms > 1 && !ref_frame) continue; @@ -3370,51 +3362,49 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (!(mode_mask & (1 << this_mode))) { continue; } - if (vp9_mode_order[mode_index].second_ref_frame != NONE - && !(ref_frame_mask - & (1 << vp9_mode_order[mode_index].second_ref_frame))) { + if (second_ref_frame != NONE + && !(ref_frame_mask & (1 << second_ref_frame))) { continue; } } mbmi->ref_frame[0] = ref_frame; - mbmi->ref_frame[1] = vp9_mode_order[mode_index].second_ref_frame; + mbmi->ref_frame[1] = second_ref_frame; if (!(ref_frame == INTRA_FRAME || (cpi->ref_frame_flags & flag_list[ref_frame]))) { continue; } - if (!(mbmi->ref_frame[1] == NONE - || (cpi->ref_frame_flags & flag_list[mbmi->ref_frame[1]]))) { + if (!(second_ref_frame == NONE + || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) { continue; } - comp_pred = mbmi->ref_frame[1] > INTRA_FRAME; + comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) continue; if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) - if (vp9_mode_order[mode_index].ref_frame != best_inter_ref_frame && - vp9_mode_order[mode_index].second_ref_frame != best_inter_ref_frame) + if (ref_frame != best_inter_ref_frame && + second_ref_frame != best_inter_ref_frame) continue; } // TODO(jingning, jkoleszar): scaling reference frame not supported for // SPLITMV. 
- if (mbmi->ref_frame[0] > 0 && - (scale_factor[mbmi->ref_frame[0]].x_scale_fp != VP9_REF_NO_SCALE || - scale_factor[mbmi->ref_frame[0]].y_scale_fp != VP9_REF_NO_SCALE) && + if (ref_frame > 0 && + (scale_factor[ref_frame].x_scale_fp != VP9_REF_NO_SCALE || + scale_factor[ref_frame].y_scale_fp != VP9_REF_NO_SCALE) && this_mode == SPLITMV) continue; - if (mbmi->ref_frame[1] > 0 && - (scale_factor[mbmi->ref_frame[1]].x_scale_fp != VP9_REF_NO_SCALE || - scale_factor[mbmi->ref_frame[1]].y_scale_fp != VP9_REF_NO_SCALE) && + if (second_ref_frame > 0 && + (scale_factor[second_ref_frame].x_scale_fp != VP9_REF_NO_SCALE || + scale_factor[second_ref_frame].y_scale_fp != VP9_REF_NO_SCALE) && this_mode == SPLITMV) continue; - set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], - scale_factor); + set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); mbmi->mode = this_mode; mbmi->uv_mode = DC_PRED; @@ -3431,46 +3421,43 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; if (comp_pred) { - if (!(cpi->ref_frame_flags & flag_list[mbmi->ref_frame[1]])) + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; - set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], - scale_factor); + set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor); mode_excluded = mode_excluded ? mode_excluded : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY; } else { - // mbmi->ref_frame[1] = vp9_mode_order[mode_index].ref_frame[1]; - if (ref_frame != INTRA_FRAME) { - if (mbmi->ref_frame[1] != INTRA_FRAME) - mode_excluded = - mode_excluded ? - mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY; + if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) { + mode_excluded = + mode_excluded ? + mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY; } } - // Select predictors + // Select prediction reference frames. for (i = 0; i < MAX_MB_PLANE; i++) { xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; if (comp_pred) - xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i]; + xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; } // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. - if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_REF_FRAME) && - vp9_get_segdata(&xd->seg, segment_id, SEG_LVL_REF_FRAME) != + if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) && + vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) { continue; // If the segment skip feature is enabled.... // then do nothing if the current mode is not allowed.. - } else if (vp9_segfeature_active(&xd->seg, segment_id, SEG_LVL_SKIP) && + } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) && (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) { continue; // Disable this drop out case if the ref frame // segment level feature is enabled for this segment. This is to // prevent the possibility that we end up unable to pick any mode. - } else if (!vp9_segfeature_active(&xd->seg, segment_id, + } else if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) { // Only consider ZEROMV/ALTREF_FRAME for alt ref frame, // unless ARNR filtering is enabled in which case we want @@ -3506,9 +3493,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; */ + // I4X4_PRED is only considered for block sizes less than 8x8. 
mbmi->txfm_size = TX_4X4; - rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, - &distortion_y, INT64_MAX); + if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y, + &distortion_y, best_rd) >= best_rd) + continue; rate2 += rate; rate2 += intra_cost_penalty; distortion2 += distortion_y; @@ -3524,11 +3513,21 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, distortion2 += dist_uv[TX_4X4]; distortion_uv = dist_uv[TX_4X4]; mbmi->uv_mode = mode_uv[TX_4X4]; - txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); - for (i = 0; i < NB_TXFM_MODES; ++i) - txfm_cache[i] = txfm_cache[ONLY_4X4]; + tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + for (i = 0; i < TX_MODES; ++i) + tx_cache[i] = tx_cache[ONLY_4X4]; } else if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; + // Disable intra modes other than DC_PRED for blocks with low variance + // Threshold for intra skipping based on source variance + // TODO(debargha): Specialize the threshold for super block sizes + static const int skip_intra_var_thresh[BLOCK_SIZE_TYPES] = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, + }; + if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) && + this_mode != DC_PRED && + x->source_variance < skip_intra_var_thresh[mbmi->sb_type]) + continue; // Only search the oblique modes if the best so far is // one of the neighboring directional modes if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) && @@ -3541,7 +3540,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; } super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, - bsize, txfm_cache, best_rd); + bsize, tx_cache, best_rd); if (rate_y == INT_MAX) continue; @@ -3564,7 +3563,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else if (this_mode == SPLITMV) { - const int is_comp_pred = mbmi->ref_frame[1] > 0; + const int is_comp_pred = second_ref_frame > 0; int rate; int64_t distortion; int64_t this_rd_thresh; @@ -3574,7 +3573,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int tmp_best_skippable = 0; int switchable_filter_index; int_mv *second_ref = is_comp_pred ? - &mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL; + &mbmi->ref_mvs[second_ref_frame][0] : NULL; union b_mode_info tmp_best_bmodes[16]; MB_MODE_INFO tmp_best_mbmode; PARTITION_INFO tmp_best_partition; @@ -3586,16 +3585,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) continue; if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) - if (vp9_mode_order[mode_index].ref_frame != best_inter_ref_frame && - vp9_mode_order[mode_index].second_ref_frame != - best_inter_ref_frame) + if (ref_frame != best_inter_ref_frame && + second_ref_frame != best_inter_ref_frame) continue; } - this_rd_thresh = (mbmi->ref_frame[0] == LAST_FRAME) ? + this_rd_thresh = (ref_frame == LAST_FRAME) ? cpi->rd_threshes[bsize][THR_NEWMV] : cpi->rd_threshes[bsize][THR_NEWA]; - this_rd_thresh = (mbmi->ref_frame[0] == GOLDEN_FRAME) ? + this_rd_thresh = (ref_frame == GOLDEN_FRAME) ? 
cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh; xd->mode_info_context->mbmi.txfm_size = TX_4X4; @@ -3610,7 +3608,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common); tmp_rd = rd_pick_best_mbsegmentation(cpi, x, - &mbmi->ref_mvs[mbmi->ref_frame[0]][0], + &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, &rate, &rate_y, &distortion, @@ -3622,7 +3620,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (tmp_rd == INT64_MAX) continue; cpi->rd_filter_cache[switchable_filter_index] = tmp_rd; - rs = get_switchable_rate(cm, x); + rs = get_switchable_rate(x); rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0); cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS] = MIN(cpi->rd_filter_cache[VP9_SWITCHABLE_FILTERS], tmp_rd + rs_rd); @@ -3672,7 +3670,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Handles the special case when a filter that is not in the // switchable list (bilinear, 6-tap) is indicated at the frame level tmp_rd = rd_pick_best_mbsegmentation(cpi, x, - &mbmi->ref_mvs[mbmi->ref_frame[0]][0], + &mbmi->ref_mvs[ref_frame][0], second_ref, best_yrd, &rate, &rate_y, &distortion, @@ -3684,7 +3682,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, continue; } else { if (cpi->common.mcomp_filter_type == SWITCHABLE) { - int rs = get_switchable_rate(cm, x); + int rs = get_switchable_rate(x); tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0); } tmp_rd = tmp_best_rdu; @@ -3703,7 +3701,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, distortion2 += distortion; if (cpi->common.mcomp_filter_type == SWITCHABLE) - rate2 += get_switchable_rate(cm, x); + rate2 += get_switchable_rate(x); if (!mode_excluded) { if (is_comp_pred) @@ -3728,15 +3726,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, skippable = skippable && uv_skippable; total_sse += uv_sse; - txfm_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); - for (i = 0; i < NB_TXFM_MODES; ++i) - txfm_cache[i] = txfm_cache[ONLY_4X4]; + tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); + for (i = 0; i < TX_MODES; ++i) + tx_cache[i] = tx_cache[ONLY_4X4]; } } else { - compmode_cost = vp9_cost_bit(comp_mode_p, - mbmi->ref_frame[1] > INTRA_FRAME); + compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME); this_rd = handle_inter_mode(cpi, x, bsize, - txfm_cache, + tx_cache, &rate2, &distortion2, &skippable, &rate_y, &distortion_y, &rate_uv, &distortion_uv, @@ -3754,10 +3751,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // Estimate the reference frame signaling cost and add it // to the rolling cost variable. - if (mbmi->ref_frame[1] > INTRA_FRAME) { - rate2 += ref_costs_comp[mbmi->ref_frame[0]]; + if (second_ref_frame > INTRA_FRAME) { + rate2 += ref_costs_comp[ref_frame]; } else { - rate2 += ref_costs_single[mbmi->ref_frame[0]]; + rate2 += ref_costs_single[ref_frame]; } if (!disable_skip) { @@ -3766,7 +3763,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // necessary adjustment for rate. Ignore if skip is coded at // segment level as the cost wont have been added in. // Is Mb level skip allowed (i.e. not coded at segment level). 
- const int mb_skip_allowed = !vp9_segfeature_active(&xd->seg, segment_id, + const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP); if (skippable && bsize >= BLOCK_SIZE_SB8X8) { @@ -3787,8 +3784,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, rate2 += prob_skip_cost; } } - } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && - !xd->lossless) { + } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) { if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) < RDCOST(x->rdmult, x->rddiv, 0, total_sse)) { // Add in the cost of the no skip flag. @@ -3835,7 +3831,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // best_inter_mode = xd->mode_info_context->mbmi.mode; } - if (!disable_skip && mbmi->ref_frame[0] == INTRA_FRAME) { + if (!disable_skip && ref_frame == INTRA_FRAME) { for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = MIN(best_pred_rd[i], this_rd); for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) @@ -3848,9 +3844,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, || distortion2 < mode_distortions[this_mode]) { mode_distortions[this_mode] = distortion2; } - if (frame_distortions[mbmi->ref_frame[0]] == -1 - || distortion2 < frame_distortions[mbmi->ref_frame[0]]) { - frame_distortions[mbmi->ref_frame[0]] = distortion2; + if (frame_distortions[ref_frame] == -1 + || distortion2 < frame_distortions[ref_frame]) { + frame_distortions[ref_frame] = distortion2; } } @@ -3858,8 +3854,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (this_rd < best_rd || x->skip) { if (!mode_excluded) { // Note index of best mode so far - const int qstep = xd->plane[0].dequant[1]; - best_mode_index = mode_index; if (ref_frame == INTRA_FRAME) { @@ -3882,9 +3876,19 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // TODO(debargha): enhance this test with a better distortion prediction // based on qp, activity mask and history - if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) - if (ref_frame > INTRA_FRAME && distortion2 * 4 < qstep * qstep) + if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) { + const int qstep = xd->plane[0].dequant[1]; + // TODO(debargha): Enhance this by specializing for each mode_index + int scale = 4; + if (x->source_variance < UINT_MAX) { + const int var_adjust = (x->source_variance < 16); + scale -= var_adjust; + } + if (ref_frame > INTRA_FRAME && + distortion2 * scale < qstep * qstep) { early_term = 1; + } + } } #if 0 // Testing this mode gave rise to an improvement in best error score. 
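For reference, the early-termination check added in the hunk above compares the accumulated distortion against the squared dequantizer step, with the scale tightened for very flat source blocks. A minimal standalone sketch of that predicate (the helper name and parameters are illustrative, not part of the patch):

    #include <limits.h>
    #include <stdint.h>

    /* Sketch of the FLAG_EARLY_TERMINATE test: qstep corresponds to
     * xd->plane[0].dequant[1] and source_variance to x->source_variance
     * in the patch; is_inter_ref is true for non-intra reference frames. */
    static int should_terminate_early(int64_t distortion, int qstep,
                                      unsigned int source_variance,
                                      int is_inter_ref) {
      int scale = 4;
      if (source_variance < UINT_MAX) {
        /* Flat blocks (variance < 16) get a slightly stricter threshold. */
        scale -= (source_variance < 16);
      }
      return is_inter_ref && distortion * scale < (int64_t)qstep * qstep;
    }
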
@@ -3912,7 +3916,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } /* keep record of best compound/single-only prediction */ - if (!disable_skip && mbmi->ref_frame[0] != INTRA_FRAME) { + if (!disable_skip && ref_frame != INTRA_FRAME) { int single_rd, hybrid_rd, single_rate, hybrid_rate; if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) { @@ -3926,10 +3930,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2); hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2); - if (mbmi->ref_frame[1] <= INTRA_FRAME && + if (second_ref_frame <= INTRA_FRAME && single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) { best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd; - } else if (mbmi->ref_frame[1] > INTRA_FRAME && + } else if (second_ref_frame > INTRA_FRAME && single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) { best_pred_rd[COMP_PREDICTION_ONLY] = single_rd; } @@ -3938,7 +3942,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } /* keep record of best filter type */ - if (!mode_excluded && !disable_skip && mbmi->ref_frame[0] != INTRA_FRAME && + if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME && cm->mcomp_filter_type != BILINEAR) { int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ? VP9_SWITCHABLE_FILTERS : @@ -3958,34 +3962,35 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } /* keep record of best txfm size */ - if (bsize < BLOCK_SIZE_SB32X32) { - if (bsize < BLOCK_SIZE_MB16X16) { + if (bsize < BLOCK_32X32) { + if (bsize < BLOCK_16X16) { if (this_mode == SPLITMV || this_mode == I4X4_PRED) - txfm_cache[ALLOW_8X8] = txfm_cache[ONLY_4X4]; - txfm_cache[ALLOW_16X16] = txfm_cache[ALLOW_8X8]; + tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4]; + tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8]; } - txfm_cache[ALLOW_32X32] = txfm_cache[ALLOW_16X16]; + tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16]; } if (!mode_excluded && this_rd != INT64_MAX) { - for (i = 0; i < NB_TXFM_MODES; i++) { + for (i = 0; i < TX_MODES; i++) { int64_t adj_rd = INT64_MAX; if (this_mode != I4X4_PRED) { - adj_rd = this_rd + txfm_cache[i] - txfm_cache[cm->tx_mode]; + adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode]; } else { adj_rd = this_rd; } - if (adj_rd < best_txfm_rd[i]) - best_txfm_rd[i] = adj_rd; + if (adj_rd < best_tx_rd[i]) + best_tx_rd[i] = adj_rd; } } if (early_term) break; - if (x->skip && !mode_excluded) + if (x->skip && !comp_pred) break; } + if (best_rd >= best_rd_so_far) return INT64_MAX; @@ -4044,7 +4049,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, (cm->mcomp_filter_type == best_mbmode.interp_filter) || (best_mbmode.ref_frame[0] == INTRA_FRAME)); - // Updating rd_thresh_freq_fact[] here means that the differnt + // Updating rd_thresh_freq_fact[] here means that the different // partition/block sizes are handled independently based on the best // choice for the current partition. 
It may well be better to keep a scaled // best rd so far value and update rd_thresh_freq_fact based on the mode/size @@ -4126,14 +4131,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (!x->skip) { - for (i = 0; i < NB_TXFM_MODES; i++) { - if (best_txfm_rd[i] == INT64_MAX) - best_txfm_diff[i] = 0; + for (i = 0; i < TX_MODES; i++) { + if (best_tx_rd[i] == INT64_MAX) + best_tx_diff[i] = 0; else - best_txfm_diff[i] = best_rd - best_txfm_rd[i]; + best_tx_diff[i] = best_rd - best_tx_rd[i]; } } else { - vpx_memset(best_txfm_diff, 0, sizeof(best_txfm_diff)); + vpx_memset(best_tx_diff, 0, sizeof(best_tx_diff)); } set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1], @@ -4143,7 +4148,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, &mbmi->ref_mvs[mbmi->ref_frame[0]][0], &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]][0], - best_pred_diff, best_txfm_diff, best_filter_diff); + best_pred_diff, best_tx_diff, best_filter_diff); return best_rd; } diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c index ef84cc5c0..9564edc84 100644 --- a/libvpx/vp9/encoder/vp9_segmentation.c +++ b/libvpx/vp9/encoder/vp9_segmentation.c @@ -57,8 +57,7 @@ void vp9_set_segment_data(VP9_PTR ptr, } // Based on set of segment counts calculate a probability tree -static void calc_segtree_probs(MACROBLOCKD *xd, int *segcounts, - vp9_prob *segment_tree_probs) { +static void calc_segtree_probs(int *segcounts, vp9_prob *segment_tree_probs) { // Work out probabilities of each segment const int c01 = segcounts[0] + segcounts[1]; const int c23 = segcounts[2] + segcounts[3]; @@ -75,7 +74,7 @@ static void calc_segtree_probs(MACROBLOCKD *xd, int *segcounts, } // Based on set of segment counts and probabilities calculate a cost estimate -static int cost_segmap(MACROBLOCKD *xd, int *segcounts, vp9_prob *probs) { +static int cost_segmap(int *segcounts, vp9_prob *probs) { const int c01 = segcounts[0] + segcounts[1]; const int c23 = segcounts[2] + segcounts[3]; const int c45 = segcounts[4] + segcounts[5]; @@ -189,13 +188,13 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO *mi, int n; assert(bwl < bsl && bhl < bsl); - if (bsize == BLOCK_SIZE_SB64X64) { - subsize = BLOCK_SIZE_SB32X32; - } else if (bsize == BLOCK_SIZE_SB32X32) { - subsize = BLOCK_SIZE_MB16X16; + if (bsize == BLOCK_64X64) { + subsize = BLOCK_32X32; + } else if (bsize == BLOCK_32X32) { + subsize = BLOCK_16X16; } else { - assert(bsize == BLOCK_SIZE_MB16X16); - subsize = BLOCK_SIZE_SB8X8; + assert(bsize == BLOCK_16X16); + subsize = BLOCK_8X8; } for (n = 0; n < 4; n++) { @@ -211,7 +210,7 @@ static void count_segs_sb(VP9_COMP *cpi, MODE_INFO *mi, void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->mb.e_mbd; + struct segmentation *seg = &cpi->mb.e_mbd.seg; int no_pred_cost; int t_pred_cost = INT_MAX; @@ -231,8 +230,8 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // Set default state for the segment tree probabilities and the // temporal coding probabilities - vpx_memset(xd->seg.tree_probs, 255, sizeof(xd->seg.tree_probs)); - vpx_memset(xd->seg.pred_probs, 255, sizeof(xd->seg.pred_probs)); + vpx_memset(seg->tree_probs, 255, sizeof(seg->tree_probs)); + vpx_memset(seg->pred_probs, 255, sizeof(seg->pred_probs)); vpx_memset(no_pred_segcounts, 0, sizeof(no_pred_segcounts)); vpx_memset(t_unpred_seg_counts, 0, sizeof(t_unpred_seg_counts)); @@ -249,21 +248,21 @@ void 
vp9_choose_segmap_coding_method(VP9_COMP *cpi) { for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; mi_col += 8, mi += 8) count_segs_sb(cpi, mi, no_pred_segcounts, temporal_predictor_count, - t_unpred_seg_counts, mi_row, mi_col, BLOCK_SIZE_SB64X64); + t_unpred_seg_counts, mi_row, mi_col, BLOCK_64X64); } } // Work out probability tree for coding segments without prediction // and the cost. - calc_segtree_probs(xd, no_pred_segcounts, no_pred_tree); - no_pred_cost = cost_segmap(xd, no_pred_segcounts, no_pred_tree); + calc_segtree_probs(no_pred_segcounts, no_pred_tree); + no_pred_cost = cost_segmap(no_pred_segcounts, no_pred_tree); // Key frames cannot use temporal prediction if (cm->frame_type != KEY_FRAME) { // Work out probability tree for coding those segments not // predicted using the temporal method and the cost. - calc_segtree_probs(xd, t_unpred_seg_counts, t_pred_tree); - t_pred_cost = cost_segmap(xd, t_unpred_seg_counts, t_pred_tree); + calc_segtree_probs(t_unpred_seg_counts, t_pred_tree); + t_pred_cost = cost_segmap(t_unpred_seg_counts, t_pred_tree); // Add in the cost of the signalling for each prediction context for (i = 0; i < PREDICTION_PROBS; i++) { @@ -280,11 +279,11 @@ void vp9_choose_segmap_coding_method(VP9_COMP *cpi) { // Now choose which coding method to use. if (t_pred_cost < no_pred_cost) { - xd->seg.temporal_update = 1; - vpx_memcpy(xd->seg.tree_probs, t_pred_tree, sizeof(t_pred_tree)); - vpx_memcpy(xd->seg.pred_probs, t_nopred_prob, sizeof(t_nopred_prob)); + seg->temporal_update = 1; + vpx_memcpy(seg->tree_probs, t_pred_tree, sizeof(t_pred_tree)); + vpx_memcpy(seg->pred_probs, t_nopred_prob, sizeof(t_nopred_prob)); } else { - xd->seg.temporal_update = 0; - vpx_memcpy(xd->seg.tree_probs, no_pred_tree, sizeof(no_pred_tree)); + seg->temporal_update = 0; + vpx_memcpy(seg->tree_probs, no_pred_tree, sizeof(no_pred_tree)); } } diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c index 821b7c6ca..a692c010e 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -40,10 +40,7 @@ static void temporal_filter_predictors_mb_c(MACROBLOCKD *xd, int mv_col, uint8_t *pred) { const int which_mv = 0; - int_mv mv; - - mv.as_mv.row = mv_row; - mv.as_mv.col = mv_col; + MV mv = { mv_row, mv_col }; vp9_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, diff --git a/libvpx/vp9/encoder/vp9_tokenize.c b/libvpx/vp9/encoder/vp9_tokenize.c index 4b9c6c8b4..caa89b218 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.c +++ b/libvpx/vp9/encoder/vp9_tokenize.c @@ -25,8 +25,8 @@ compressions, then generating vp9_context.c = initial stats. */ #ifdef ENTROPY_STATS -vp9_coeff_accum context_counters[TX_SIZE_MAX_SB][BLOCK_TYPES]; -extern vp9_coeff_stats tree_update_hist[TX_SIZE_MAX_SB][BLOCK_TYPES]; +vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES]; +extern vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES]; #endif /* ENTROPY_STATS */ DECLARE_ALIGNED(16, extern const uint8_t, @@ -40,7 +40,7 @@ const int *vp9_dct_value_cost_ptr; static void fill_value_tokens() { TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE; - vp9_extra_bit *const e = vp9_extra_bits; + const vp9_extra_bit *const e = vp9_extra_bits; int i = -DCT_MAX_VALUE; int sign = 1; @@ -69,7 +69,7 @@ static void fill_value_tokens() { // initialize the cost for extra bits for all possible coefficient value. 
{ int cost = 0; - vp9_extra_bit *p = vp9_extra_bits + t[i].token; + const vp9_extra_bit *p = vp9_extra_bits + t[i].token; if (p->base_val) { const int extra = t[i].extra; @@ -95,18 +95,40 @@ struct tokenize_b_args { MACROBLOCKD *xd; TOKENEXTRA **tp; TX_SIZE tx_size; - int dry_run; }; +static void set_entropy_context_b(int plane, int block, BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, void *arg) { + struct tokenize_b_args* const args = arg; + TX_SIZE tx_size = ss_txfrm_size >> 1; + MACROBLOCKD *xd = args->xd; + const int bwl = b_width_log2(bsize); + const int off = block >> (2 * tx_size); + const int mod = bwl - tx_size - xd->plane[plane].subsampling_x; + const int aoff = (off & ((1 << mod) - 1)) << tx_size; + const int loff = (off >> mod) << tx_size; + ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff; + ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff; + const int eob = xd->plane[plane].eobs[block]; + const int tx_size_in_blocks = 1 << tx_size; + + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, eob, aoff, loff, + A, L); + } else { + vpx_memset(A, eob > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + vpx_memset(L, eob > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + } +} + static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, int ss_txfrm_size, void *arg) { struct tokenize_b_args* const args = arg; VP9_COMP *cpi = args->cpi; MACROBLOCKD *xd = args->xd; TOKENEXTRA **tp = args->tp; - TX_SIZE tx_size = ss_txfrm_size / 2; - int dry_run = args->dry_run; - + const TX_SIZE tx_size = ss_txfrm_size >> 1; + const int tx_size_in_blocks = 1 << tx_size; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; int pt; /* near block/prev token context index */ int c = 0, rc = 0; @@ -114,9 +136,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, const int eob = xd->plane[plane].eobs[block]; const PLANE_TYPE type = xd->plane[plane].plane_type; const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); - const BLOCK_SIZE_TYPE sb_type = (mbmi->sb_type < BLOCK_SIZE_SB8X8) ? 
- BLOCK_SIZE_SB8X8 : mbmi->sb_type; - const int bwl = b_width_log2(sb_type); + const int bwl = b_width_log2(bsize); const int off = block >> (2 * tx_size); const int mod = bwl - tx_size - xd->plane[plane].subsampling_x; const int aoff = (off & ((1 << mod) - 1)) << tx_size; @@ -128,7 +148,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, const int16_t *scan, *nb; vp9_coeff_count *counts; vp9_coeff_probs_model *coef_probs; - const int ref = mbmi->ref_frame[0] != INTRA_FRAME; + const int ref = is_inter_block(mbmi); ENTROPY_CONTEXT above_ec, left_ec; uint8_t token_cache[1024]; const uint8_t *band_translate; @@ -146,22 +166,22 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, band_translate = vp9_coefband_trans_4x4; break; case TX_8X8: - above_ec = (A[0] + A[1]) != 0; - left_ec = (L[0] + L[1]) != 0; + above_ec = !!*(uint16_t *)A; + left_ec = !!*(uint16_t *)L; seg_eob = 64; scan = get_scan_8x8(get_tx_type_8x8(type, xd)); band_translate = vp9_coefband_trans_8x8plus; break; case TX_16X16: - above_ec = (A[0] + A[1] + A[2] + A[3]) != 0; - left_ec = (L[0] + L[1] + L[2] + L[3]) != 0; + above_ec = !!*(uint32_t *)A; + left_ec = !!*(uint32_t *)L; seg_eob = 256; scan = get_scan_16x16(get_tx_type_16x16(type, xd)); band_translate = vp9_coefband_trans_8x8plus; break; case TX_32X32: - above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0; - left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0; + above_ec = !!*(uint64_t *)A; + left_ec = !!*(uint64_t *)L; seg_eob = 1024; scan = vp9_default_scan_32x32; band_translate = vp9_coefband_trans_8x8plus; @@ -198,22 +218,21 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, assert(vp9_coef_encodings[t->token].len - t->skip_eob_node > 0); - if (!dry_run) { - ++counts[type][ref][band][pt][token]; - if (!t->skip_eob_node) - ++cpi->common.counts.eob_branch[tx_size][type][ref][band][pt]; - } - token_cache[scan[c]] = vp9_pt_energy_class[token]; + ++counts[type][ref][band][pt][token]; + if (!t->skip_eob_node) + ++cpi->common.counts.eob_branch[tx_size][type][ref][band][pt]; + + token_cache[rc] = vp9_pt_energy_class[token]; ++t; } while (c < eob && ++c < seg_eob); *tp = t; if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { - set_contexts_on_border(xd, bsize, plane, tx_size, c, aoff, loff, A, L); + set_contexts_on_border(xd, bsize, plane, tx_size_in_blocks, c, aoff, loff, + A, L); } else { - for (pt = 0; pt < (1 << tx_size); pt++) { - A[pt] = L[pt] = c > 0; - } + vpx_memset(A, c > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); + vpx_memset(L, c > 0, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks); } } @@ -257,8 +276,7 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, const int mb_skip_context = vp9_get_pred_context_mbskip(xd); const int skip_inc = !vp9_segfeature_active(&xd->seg, mbmi->segment_id, SEG_LVL_SKIP); - const TX_SIZE txfm_size = mbmi->txfm_size; - struct tokenize_b_args arg = { cpi, xd, t, txfm_size, dry_run }; + struct tokenize_b_args arg = {cpi, xd, t, mbmi->txfm_size}; mbmi->mb_skip_coeff = vp9_sb_is_skippable(xd, bsize); if (mbmi->mb_skip_coeff) { @@ -270,13 +288,13 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, return; } - if (!dry_run) + if (!dry_run) { cm->counts.mbskip[mb_skip_context][0] += skip_inc; - - foreach_transformed_block(xd, bsize, tokenize_b, &arg); - - if (dry_run) + foreach_transformed_block(xd, bsize, tokenize_b, &arg); + } else { + foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg); *t = 
t_backup; + } } #ifdef ENTROPY_STATS diff --git a/libvpx/vp9/encoder/vp9_tokenize.h b/libvpx/vp9/encoder/vp9_tokenize.h index bc7d9352e..968bec75e 100644 --- a/libvpx/vp9/encoder/vp9_tokenize.h +++ b/libvpx/vp9/encoder/vp9_tokenize.h @@ -43,7 +43,7 @@ void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run, void init_context_counters(); void print_context_counters(); -extern vp9_coeff_accum context_counters[TX_SIZE_MAX_SB][BLOCK_TYPES]; +extern vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES]; #endif extern const int *vp9_dct_value_cost_ptr; diff --git a/libvpx/vp9/vp9_common.mk b/libvpx/vp9/vp9_common.mk index 5a0c1c958..b2b2a80a7 100644 --- a/libvpx/vp9/vp9_common.mk +++ b/libvpx/vp9/vp9_common.mk @@ -74,7 +74,6 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_ss VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm @@ -83,6 +82,10 @@ VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm endif +ifeq ($(USE_X86INC),yes) +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm +endif + VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c @@ -91,5 +94,6 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM) +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM) $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.sh)) diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk index dee83c9e4..288c0d829 100644 --- a/libvpx/vp9/vp9cx.mk +++ b/libvpx/vp9/vp9cx.mk @@ -83,11 +83,15 @@ VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm -VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm + +ifeq ($(USE_X86INC),yes) +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm +endif + ifeq ($(ARCH_X86_64),yes) VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.asm endif diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk index 6cad29329..be3afe835 100644 --- a/libvpx/vp9/vp9dx.mk +++ b/libvpx/vp9/vp9dx.mk @@ -28,6 +28,8 @@ VP9_DX_SRCS-yes += decoder/vp9_decodemv.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.h VP9_DX_SRCS-yes += decoder/vp9_onyxd.h VP9_DX_SRCS-yes += decoder/vp9_onyxd_int.h +VP9_DX_SRCS-yes += decoder/vp9_thread.c +VP9_DX_SRCS-yes += decoder/vp9_thread.h VP9_DX_SRCS-yes 
+= decoder/vp9_treereader.h VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c VP9_DX_SRCS-yes += decoder/vp9_idct_blk.c diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c index b18155be6..259204065 100644 --- a/libvpx/vpx_scale/generic/yv12config.c +++ b/libvpx/vpx_scale/generic/yv12config.c @@ -60,7 +60,7 @@ int vp8_yv12_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, /* Only support allocating buffers that have a border that's a multiple * of 32. The border restriction is required to get 16-byte alignment of - * the start of the chroma rows without intoducing an arbitrary gap + * the start of the chroma rows without introducing an arbitrary gap * between planes, which would break the semantics of things like * vpx_img_set_rect(). */ if (border & 0x1f) @@ -158,7 +158,7 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, /* Only support allocating buffers that have a border that's a multiple * of 32. The border restriction is required to get 16-byte alignment of - * the start of the chroma rows without intoducing an arbitrary gap + * the start of the chroma rows without introducing an arbitrary gap * between planes, which would break the semantics of things like * vpx_img_set_rect(). */ if (border & 0x1f) diff --git a/mips-dspr2/libvpx_srcs.txt b/mips-dspr2/libvpx_srcs.txt index d75620883..299d615be 100644 --- a/mips-dspr2/libvpx_srcs.txt +++ b/mips-dspr2/libvpx_srcs.txt @@ -203,6 +203,8 @@ vp9/decoder/vp9_onyxd.h vp9/decoder/vp9_onyxd_if.c vp9/decoder/vp9_onyxd_int.h vp9/decoder/vp9_read_bit_buffer.h +vp9/decoder/vp9_thread.c +vp9/decoder/vp9_thread.h vp9/decoder/vp9_treereader.h vp9/vp9_common.mk vp9/vp9_dx_iface.c diff --git a/mips-dspr2/vp9_rtcd.h b/mips-dspr2/vp9_rtcd.h index 0752f4590..d6dc6bfb6 100644 --- a/mips-dspr2/vp9_rtcd.h +++ b/mips-dspr2/vp9_rtcd.h @@ -14,9 +14,7 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_enums.h" -struct loop_filter_info; struct macroblockd; -struct loop_filter_info; /* Encoder forward decls */ struct macroblock; @@ -260,14 +258,17 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct4x4_add vp9_short_idct4x4_add_c +void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct8x8_1_add vp9_short_idct8x8_1_add_c + void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct8x8_add vp9_short_idct8x8_add_c void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_c -void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output); -#define vp9_short_idct1_8x8 vp9_short_idct1_8x8_c +void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct16x16_1_add vp9_short_idct16x16_1_add_c void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct16x16_add vp9_short_idct16x16_add_c @@ -275,18 +276,12 @@ void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_16x16_add vp9_short_idct10_16x16_add_c -void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output); -#define vp9_short_idct1_16x16 vp9_short_idct1_16x16_c - void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define 
vp9_short_idct32x32_add vp9_short_idct32x32_add_c void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output); #define vp9_short_idct1_32x32 vp9_short_idct1_32x32_c -void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct10_32x32_add vp9_short_idct10_32x32_add_c - void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride, int tx_type); #define vp9_short_iht4x4_add vp9_short_iht4x4_add_c diff --git a/mips-dspr2/vpx_config.h b/mips-dspr2/vpx_config.h index 13a092db0..e85b676d4 100644 --- a/mips-dspr2/vpx_config.h +++ b/mips-dspr2/vpx_config.h @@ -39,6 +39,7 @@ #define CONFIG_INSTALL_BINS 1 #define CONFIG_INSTALL_LIBS 1 #define CONFIG_INSTALL_SRCS 0 +#define CONFIG_USE_X86INC 1 #define CONFIG_DEBUG 0 #define CONFIG_GPROF 0 #define CONFIG_GCOV 0 diff --git a/mips/libvpx_srcs.txt b/mips/libvpx_srcs.txt index 402ac2420..055f5fb5d 100644 --- a/mips/libvpx_srcs.txt +++ b/mips/libvpx_srcs.txt @@ -197,6 +197,8 @@ vp9/decoder/vp9_onyxd.h vp9/decoder/vp9_onyxd_if.c vp9/decoder/vp9_onyxd_int.h vp9/decoder/vp9_read_bit_buffer.h +vp9/decoder/vp9_thread.c +vp9/decoder/vp9_thread.h vp9/decoder/vp9_treereader.h vp9/vp9_common.mk vp9/vp9_dx_iface.c diff --git a/mips/vp9_rtcd.h b/mips/vp9_rtcd.h index 0752f4590..d6dc6bfb6 100644 --- a/mips/vp9_rtcd.h +++ b/mips/vp9_rtcd.h @@ -14,9 +14,7 @@ #include "vpx/vpx_integer.h" #include "vp9/common/vp9_enums.h" -struct loop_filter_info; struct macroblockd; -struct loop_filter_info; /* Encoder forward decls */ struct macroblock; @@ -260,14 +258,17 @@ void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct4x4_add vp9_short_idct4x4_add_c +void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct8x8_1_add vp9_short_idct8x8_1_add_c + void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct8x8_add vp9_short_idct8x8_add_c void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_8x8_add vp9_short_idct10_8x8_add_c -void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output); -#define vp9_short_idct1_8x8 vp9_short_idct1_8x8_c +void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, int dest_stride); +#define vp9_short_idct16x16_1_add vp9_short_idct16x16_1_add_c void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct16x16_add vp9_short_idct16x16_add_c @@ -275,18 +276,12 @@ void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct10_16x16_add vp9_short_idct10_16x16_add_c -void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output); -#define vp9_short_idct1_16x16 vp9_short_idct1_16x16_c - void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride); #define vp9_short_idct32x32_add vp9_short_idct32x32_add_c void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output); #define vp9_short_idct1_32x32 vp9_short_idct1_32x32_c -void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride); -#define vp9_short_idct10_32x32_add vp9_short_idct10_32x32_add_c - void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride, int tx_type); #define vp9_short_iht4x4_add vp9_short_iht4x4_add_c diff --git a/mips/vpx_config.h 
b/mips/vpx_config.h index 51ea388f1..7db47f873 100644 --- a/mips/vpx_config.h +++ b/mips/vpx_config.h @@ -39,6 +39,7 @@ #define CONFIG_INSTALL_BINS 1 #define CONFIG_INSTALL_LIBS 1 #define CONFIG_INSTALL_SRCS 0 +#define CONFIG_USE_X86INC 1 #define CONFIG_DEBUG 0 #define CONFIG_GPROF 0 #define CONFIG_GCOV 0
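As an aside on the tokenizer changes earlier in this patch: the per-coefficient context sums such as (A[0] + A[1]) != 0 are replaced by single wider loads, and the above/left contexts are refilled with memset over 1 << tx_size entries. A small sketch of that pattern, assuming the usual one-byte ENTROPY_CONTEXT type (helper names are illustrative, and the aliasing casts mirror the patch, which assumes suitable alignment):

    #include <stdint.h>
    #include <string.h>

    typedef int8_t ENTROPY_CONTEXT;

    /* "Any nonzero context?" via one load, as in the TX_8X8 / TX_16X16 /
     * TX_32X32 cases of tokenize_b (2, 4 or 8 contexts respectively). */
    static int any_ctx_8x8(const ENTROPY_CONTEXT *a)   { return !!*(const uint16_t *)a; }
    static int any_ctx_16x16(const ENTROPY_CONTEXT *a) { return !!*(const uint32_t *)a; }
    static int any_ctx_32x32(const ENTROPY_CONTEXT *a) { return !!*(const uint64_t *)a; }

    /* Refill the above/left contexts for a transform block away from the
     * frame border, mirroring the vpx_memset calls in the patch. */
    static void fill_contexts(ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
                              int tx_size_in_blocks, int has_coeffs) {
      memset(a, has_coeffs, sizeof(*a) * tx_size_in_blocks);
      memset(l, has_coeffs, sizeof(*l) * tx_size_in_blocks);
    }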