author     johannkoenig@chromium.org <johannkoenig@chromium.org@4ff67af0-8c30-449e-8e8b-ad334ec8d88c>  2013-12-16 19:49:40 +0000
committer  johannkoenig@chromium.org <johannkoenig@chromium.org@4ff67af0-8c30-449e-8e8b-ad334ec8d88c>  2013-12-16 19:49:40 +0000
commit     d851b91d14ef0bd71acdce7b90c9a8f1af1181ad (patch)
tree       ded826e4587a462cf390d127bf6d189dae806ed6 /source
parent     19002df347c5606f660056344d4ff7f7b9b37c5c (diff)
download   libvpx-d851b91d14ef0bd71acdce7b90c9a8f1af1181ad.tar.gz
libvpx: Pull from upstream
Add support for avx2 intrinsics

Pass '-chromium' to ads2gas scripts. This allows us to generate the correct
syntax for Chromium clang as well as Xcode clang.

https://code.google.com/p/webm/issues/detail?id=603

Current HEAD: 4a535efcfd6c2d1ccded864faf042f4b78716206

git log from upstream:
4a535ef Change in data rate test to be less stringent
fbada94 Add frame buffer lru cache.
d0ee1fd Merge "Add support to pass in external frame buffers."
10f8916 Add support to pass in external frame buffers.
c5aaf92 webmdec/tests: fix leak when file isn't read to eof
64cf398 Merge "Using MV struct instead of int_mv union in encoder."
33df4f0 Merge "vp9_convole.c cleanup."
f54b515 Merge "Cleaning up vp9_append_sub8x8_mvs_for_idx()."
25da21b Using MV struct instead of int_mv union in encoder.
d4b500d Merge "Increase disable_filter_search_var_thresh threshold"
ec79db6 Merge "obj_int_extract: fix compile warning"
3aa75bc Merge "vp9 asserts: fix compile warning"
a793cf9 Merge "Rename so -> scan_order in vp9_encodemb.c"
df13e01 Merge "Renaming treed_write() to vp9_write_tree()."
15a23c8 Merge "zz_motion_search() cleanup."
33859ad Merge "Moving mi configuration loop from vp9_read_mode_info to set_offsets."
2f9d118 Merge "Remove some dead code"
930ae71 Rename so -> scan_order in vp9_encodemb.c
da9f55c Increase disable_filter_search_var_thresh threshold
2ec473b Merge "Set rc_2pass_vbr_maxsection_pct to correct value"
8b73296 Merge "Enable adaptive pred filter type for sub8x8"
15cf596 Set rc_2pass_vbr_maxsection_pct to correct value
e121bf4 Moving mi configuration loop from vp9_read_mode_info to set_offsets.
178db94 vp9 asserts: fix compile warning
934f0fb obj_int_extract: fix compile warning
3b5a90b Enable adaptive pred filter type for sub8x8
c263418 Merge "test-data.sha1: add missing sha1sums"
52bf934 zz_motion_search() cleanup.
629fb85 vp9_convole.c cleanup.

TBR=tomfinegan@chromium.org

Review URL: https://codereview.chromium.org/111463005

git-svn-id: http://src.chromium.org/svn/trunk/deps/third_party/libvpx@240981 4ff67af0-8c30-449e-8e8b-ad334ec8d88c
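Most of the churn in the regenerated config/*/vp9_rtcd.h headers below follows libvpx's run-time CPU detection (RTCD) pattern: each kernel has a portable _c version plus optional SIMD versions, and the "cpu-detect" targets resolve a function pointer once in setup_rtcd_internal() based on flags such as HAS_NEON, HAS_SSE2, or the new HAS_AVX2. A minimal, self-contained C sketch of that pattern (the toy_filter kernel and cpu_detect() probe are hypothetical stand-ins, not code from this change):

```c
/* Sketch of the RTCD dispatch pattern used by the vp9_rtcd.h headers below.
 * The kernel name "toy_filter" and cpu_detect() are hypothetical. */
#include <stdint.h>
#include <stdio.h>

#define HAS_NEON 0x01  /* same idea as the HAS_NEON / HAS_SSE2 / HAS_AVX2 bits below */

static void toy_filter_c(uint8_t *s, int pitch)    { (void)pitch; s[0] = 1; }
static void toy_filter_neon(uint8_t *s, int pitch) { (void)pitch; s[0] = 2; }

/* The cpu-detect configs declare this with RTCD_EXTERN; here it is a plain pointer. */
static void (*toy_filter)(uint8_t *s, int pitch);

static int cpu_detect(void) { return 0; /* pretend no NEON on this machine */ }

static void setup_rtcd_internal(void) {
  int flags = cpu_detect();
  toy_filter = toy_filter_c;                           /* always start with the C version */
  if (flags & HAS_NEON) toy_filter = toy_filter_neon;  /* upgrade if the CPU allows it    */
}

int main(void) {
  uint8_t row[4] = {0};
  setup_rtcd_internal();
  toy_filter(row, 4);
  printf("dispatched to %s\n", row[0] == 2 ? "NEON" : "C");
  return 0;
}
```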
Diffstat (limited to 'source')
-rw-r--r-- source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h | 46
-rw-r--r-- source/config/linux/arm-neon-cpu-detect/vpx_config.asm | 2
-rw-r--r-- source/config/linux/arm-neon-cpu-detect/vpx_config.h | 2
-rw-r--r-- source/config/linux/arm-neon/vp9_rtcd.h | 27
-rw-r--r-- source/config/linux/arm-neon/vpx_config.asm | 2
-rw-r--r-- source/config/linux/arm-neon/vpx_config.h | 2
-rw-r--r-- source/config/linux/arm/vp9_rtcd.h | 16
-rw-r--r-- source/config/linux/arm/vpx_config.asm | 2
-rw-r--r-- source/config/linux/arm/vpx_config.h | 2
-rw-r--r-- source/config/linux/ia32/vp9_rtcd.h | 43
-rw-r--r-- source/config/linux/ia32/vpx_config.asm | 2
-rw-r--r-- source/config/linux/ia32/vpx_config.h | 2
-rw-r--r-- source/config/linux/mipsel/vp9_rtcd.h | 16
-rw-r--r-- source/config/linux/mipsel/vpx_config.h | 2
-rw-r--r-- source/config/linux/x64/vp9_rtcd.h | 36
-rw-r--r-- source/config/linux/x64/vpx_config.asm | 2
-rw-r--r-- source/config/linux/x64/vpx_config.h | 2
-rw-r--r-- source/config/mac/ia32/vp9_rtcd.h | 38
-rw-r--r-- source/config/mac/ia32/vpx_config.asm | 2
-rw-r--r-- source/config/mac/ia32/vpx_config.h | 2
-rw-r--r-- source/config/mac/x64/vp9_rtcd.h | 36
-rw-r--r-- source/config/mac/x64/vpx_config.asm | 2
-rw-r--r-- source/config/mac/x64/vpx_config.h | 2
-rw-r--r-- source/config/nacl/vp9_rtcd.h | 16
-rw-r--r-- source/config/nacl/vpx_config.asm | 2
-rw-r--r-- source/config/nacl/vpx_config.h | 2
-rw-r--r-- source/config/vpx_version.h | 6
-rw-r--r-- source/config/win/ia32/vp9_rtcd.h | 43
-rw-r--r-- source/config/win/ia32/vpx_config.asm | 6
-rw-r--r-- source/config/win/ia32/vpx_config.h | 6
-rw-r--r-- source/config/win/x64/vp9_rtcd.h | 36
-rw-r--r-- source/config/win/x64/vpx_config.asm | 6
-rw-r--r-- source/config/win/x64/vpx_config.h | 6
-rw-r--r-- source/libvpx/.mailmap | 10
-rw-r--r-- source/libvpx/AUTHORS | 35
-rw-r--r-- source/libvpx/CHANGELOG | 50
-rw-r--r-- source/libvpx/README | 3
-rwxr-xr-x source/libvpx/build/make/ads2gas_apple.pl | 28
-rwxr-xr-x source/libvpx/build/make/configure.sh | 52
-rwxr-xr-x source/libvpx/build/make/gen_msvs_sln.sh | 7
-rwxr-xr-x source/libvpx/build/make/gen_msvs_vcxproj.sh | 20
-rw-r--r-- source/libvpx/build/make/obj_int_extract.c | 2
-rw-r--r-- source/libvpx/build/make/thumb.pm | 2
-rwxr-xr-x source/libvpx/configure | 11
-rw-r--r-- source/libvpx/examples.mk | 21
-rw-r--r-- source/libvpx/ivfdec.c | 119
-rw-r--r-- source/libvpx/ivfdec.h | 30
-rw-r--r-- source/libvpx/ivfenc.c | 62
-rw-r--r-- source/libvpx/ivfenc.h | 33
-rw-r--r-- source/libvpx/libs.mk | 2
-rw-r--r-- source/libvpx/test/android/Android.mk | 42
-rw-r--r-- source/libvpx/test/android/README | 32
-rw-r--r-- source/libvpx/test/android/get_files.py | 118
-rw-r--r-- source/libvpx/test/borders_test.cc | 2
-rw-r--r-- source/libvpx/test/codec_factory.h | 2
-rw-r--r-- source/libvpx/test/convolve_test.cc | 4
-rw-r--r-- source/libvpx/test/datarate_test.cc | 6
-rw-r--r-- source/libvpx/test/dct16x16_test.cc | 13
-rw-r--r-- source/libvpx/test/dct32x32_test.cc | 14
-rw-r--r-- source/libvpx/test/decode_perf_test.cc | 105
-rw-r--r-- source/libvpx/test/decode_test_driver.cc | 1
-rw-r--r-- source/libvpx/test/decode_test_driver.h | 14
-rw-r--r-- source/libvpx/test/error_resilience_test.cc | 16
-rw-r--r-- source/libvpx/test/external_frame_buffer_test.cc | 309
-rw-r--r-- source/libvpx/test/fdct4x4_test.cc | 381
-rw-r--r-- source/libvpx/test/fdct8x8_test.cc | 13
-rw-r--r-- source/libvpx/test/lru_frame_buffer_test.cc | 207
-rw-r--r-- source/libvpx/test/sixtap_predict_test.cc | 21
-rw-r--r-- source/libvpx/test/svc_test.cc | 337
-rw-r--r-- source/libvpx/test/test-data.sha1 | 31
-rw-r--r-- source/libvpx/test/test.mk | 63
-rw-r--r-- source/libvpx/test/test_vector_test.cc | 169
-rw-r--r-- source/libvpx/test/test_vectors.cc | 167
-rw-r--r-- source/libvpx/test/test_vectors.h | 35
-rw-r--r-- source/libvpx/test/tile_independence_test.cc | 14
-rw-r--r-- source/libvpx/test/util.h | 1
-rw-r--r-- source/libvpx/test/video_source.h | 35
-rw-r--r-- source/libvpx/test/vp8_fdct4x4_test.cc | 19
-rw-r--r-- source/libvpx/test/vp9_lossless_test.cc | 24
-rw-r--r-- source/libvpx/test/vp9_subtract_test.cc | 4
-rw-r--r-- source/libvpx/test/vp9_thread_test.cc | 4
-rw-r--r-- source/libvpx/test/webm_video_source.h | 10
-rw-r--r-- source/libvpx/tools_common.c | 105
-rw-r--r-- source/libvpx/tools_common.h | 117
-rw-r--r-- source/libvpx/vp8/common/postproc.c | 5
-rw-r--r-- source/libvpx/vp8/common/setupintrarecon.h | 4
-rw-r--r-- source/libvpx/vp8/decoder/decodeframe.c (renamed from source/libvpx/vp8/decoder/decodframe.c) | 11
-rw-r--r-- source/libvpx/vp8/encoder/onyx_if.c | 36
-rw-r--r-- source/libvpx/vp8/encoder/ratectrl.c | 24
-rw-r--r-- source/libvpx/vp8/vp8_dx_iface.c | 1
-rw-r--r-- source/libvpx/vp8/vp8dx.mk | 2
-rw-r--r-- source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm | 199
-rw-r--r-- source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c | 53
-rw-r--r-- source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm | 144
-rw-r--r-- source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm | 2
-rw-r--r-- source/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c | 332
-rw-r--r-- source/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c | 232
-rw-r--r-- source/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c | 610
-rw-r--r-- source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c | 69
-rw-r--r-- source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c | 364
-rw-r--r-- source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h | 755
-rw-r--r-- source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h | 470
-rw-r--r-- source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h | 365
-rw-r--r-- source/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c | 652
-rw-r--r-- source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c | 795
-rw-r--r-- source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c | 840
-rw-r--r-- source/libvpx/vp9/common/vp9_alloccommon.c | 88
-rw-r--r-- source/libvpx/vp9/common/vp9_alloccommon.h | 1
-rw-r--r-- source/libvpx/vp9/common/vp9_blockd.h | 224
-rw-r--r-- source/libvpx/vp9/common/vp9_common_data.c | 30
-rw-r--r-- source/libvpx/vp9/common/vp9_common_data.h | 1
-rw-r--r-- source/libvpx/vp9/common/vp9_convolve.c | 263
-rw-r--r-- source/libvpx/vp9/common/vp9_default_coef_probs.h | 699
-rw-r--r-- source/libvpx/vp9/common/vp9_entropy.c | 687
-rw-r--r-- source/libvpx/vp9/common/vp9_entropy.h | 136
-rw-r--r-- source/libvpx/vp9/common/vp9_entropymode.c | 186
-rw-r--r-- source/libvpx/vp9/common/vp9_entropymode.h | 12
-rw-r--r-- source/libvpx/vp9/common/vp9_entropymv.c | 73
-rw-r--r-- source/libvpx/vp9/common/vp9_entropymv.h | 25
-rw-r--r-- source/libvpx/vp9/common/vp9_enums.h | 23
-rw-r--r-- source/libvpx/vp9/common/vp9_filter.c | 22
-rw-r--r-- source/libvpx/vp9/common/vp9_filter.h | 1
-rw-r--r-- source/libvpx/vp9/common/vp9_findnearmv.c | 77
-rw-r--r-- source/libvpx/vp9/common/vp9_findnearmv.h | 34
-rw-r--r-- source/libvpx/vp9/common/vp9_idct.c | 61
-rw-r--r-- source/libvpx/vp9/common/vp9_idct.h | 3
-rw-r--r-- source/libvpx/vp9/common/vp9_loopfilter.c | 585
-rw-r--r-- source/libvpx/vp9/common/vp9_loopfilter_filters.c | 71
-rw-r--r-- source/libvpx/vp9/common/vp9_mv.h | 4
-rw-r--r-- source/libvpx/vp9/common/vp9_onyx.h | 12
-rw-r--r-- source/libvpx/vp9/common/vp9_onyxc_int.h | 149
-rw-r--r-- source/libvpx/vp9/common/vp9_pred_common.c | 209
-rw-r--r-- source/libvpx/vp9/common/vp9_pred_common.h | 108
-rw-r--r-- source/libvpx/vp9/common/vp9_reconinter.c | 309
-rw-r--r-- source/libvpx/vp9/common/vp9_reconinter.h | 5
-rw-r--r-- source/libvpx/vp9/common/vp9_reconintra.c | 2
-rw-r--r-- source/libvpx/vp9/common/vp9_reconintra.h | 8
-rwxr-xr-x source/libvpx/vp9/common/vp9_rtcd_defs.sh | 76
-rw-r--r-- source/libvpx/vp9/common/vp9_scan.c | 164
-rw-r--r-- source/libvpx/vp9/common/vp9_scan.h | 179
-rw-r--r-- source/libvpx/vp9/common/vp9_tile_common.c | 55
-rw-r--r-- source/libvpx/vp9/common/vp9_tile_common.h | 4
-rw-r--r-- source/libvpx/vp9/common/vp9_treecoder.c | 74
-rw-r--r-- source/libvpx/vp9/common/vp9_treecoder.h | 55
-rw-r--r-- source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c | 2043
-rw-r--r-- source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c | 943
-rw-r--r-- source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 854
-rw-r--r-- source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 89
-rw-r--r-- source/libvpx/vp9/decoder/vp9_dboolhuff.c | 32
-rw-r--r-- source/libvpx/vp9/decoder/vp9_dboolhuff.h | 58
-rw-r--r-- source/libvpx/vp9/decoder/vp9_decodeframe.c (renamed from source/libvpx/vp9/decoder/vp9_decodframe.c) | 561
-rw-r--r-- source/libvpx/vp9/decoder/vp9_decodeframe.h (renamed from source/libvpx/vp9/decoder/vp9_decodframe.h) | 6
-rw-r--r-- source/libvpx/vp9/decoder/vp9_decodemv.c | 241
-rw-r--r-- source/libvpx/vp9/decoder/vp9_detokenize.c | 182
-rw-r--r-- source/libvpx/vp9/decoder/vp9_detokenize.h | 2
-rw-r--r-- source/libvpx/vp9/decoder/vp9_onyxd_if.c | 22
-rw-r--r-- source/libvpx/vp9/decoder/vp9_onyxd_int.h | 2
-rw-r--r-- source/libvpx/vp9/decoder/vp9_treereader.h | 32
-rw-r--r-- source/libvpx/vp9/encoder/vp9_bitstream.c | 631
-rw-r--r-- source/libvpx/vp9/encoder/vp9_block.h | 58
-rw-r--r-- source/libvpx/vp9/encoder/vp9_boolhuff.h | 1
-rw-r--r-- source/libvpx/vp9/encoder/vp9_dct.c | 286
-rw-r--r-- source/libvpx/vp9/encoder/vp9_encodeframe.c | 356
-rw-r--r-- source/libvpx/vp9/encoder/vp9_encodeintra.c | 28
-rw-r--r-- source/libvpx/vp9/encoder/vp9_encodeintra.h | 20
-rw-r--r-- source/libvpx/vp9/encoder/vp9_encodemb.c | 351
-rw-r--r-- source/libvpx/vp9/encoder/vp9_encodemb.h | 8
-rw-r--r-- source/libvpx/vp9/encoder/vp9_encodemv.c | 205
-rw-r--r-- source/libvpx/vp9/encoder/vp9_encodemv.h | 2
-rw-r--r-- source/libvpx/vp9/encoder/vp9_extend.c (renamed from source/libvpx/vp9/common/vp9_extend.c) | 4
-rw-r--r-- source/libvpx/vp9/encoder/vp9_extend.h (renamed from source/libvpx/vp9/common/vp9_extend.h) | 0
-rw-r--r-- source/libvpx/vp9/encoder/vp9_firstpass.c | 426
-rw-r--r-- source/libvpx/vp9/encoder/vp9_lookahead.c | 2
-rw-r--r-- source/libvpx/vp9/encoder/vp9_mbgraph.c | 9
-rw-r--r-- source/libvpx/vp9/encoder/vp9_mcomp.c | 465
-rw-r--r-- source/libvpx/vp9/encoder/vp9_mcomp.h | 19
-rw-r--r-- source/libvpx/vp9/encoder/vp9_modecosts.c | 43
-rw-r--r-- source/libvpx/vp9/encoder/vp9_modecosts.h | 17
-rw-r--r-- source/libvpx/vp9/encoder/vp9_onyx_if.c | 1428
-rw-r--r-- source/libvpx/vp9/encoder/vp9_onyx_int.h | 160
-rw-r--r-- source/libvpx/vp9/encoder/vp9_quantize.c | 119
-rw-r--r-- source/libvpx/vp9/encoder/vp9_quantize.h | 2
-rw-r--r-- source/libvpx/vp9/encoder/vp9_ratectrl.c | 571
-rw-r--r-- source/libvpx/vp9/encoder/vp9_ratectrl.h | 51
-rw-r--r-- source/libvpx/vp9/encoder/vp9_rdopt.c | 906
-rw-r--r-- source/libvpx/vp9/encoder/vp9_rdopt.h | 3
-rw-r--r-- source/libvpx/vp9/encoder/vp9_sad_c.c | 6
-rw-r--r-- source/libvpx/vp9/encoder/vp9_sadmxn.h (renamed from source/libvpx/vp9/common/vp9_sadmxn.h) | 0
-rw-r--r-- source/libvpx/vp9/encoder/vp9_segmentation.c | 2
-rw-r--r-- source/libvpx/vp9/encoder/vp9_subexp.c | 3
-rw-r--r-- source/libvpx/vp9/encoder/vp9_temporal_filter.c | 18
-rw-r--r-- source/libvpx/vp9/encoder/vp9_tokenize.c | 286
-rw-r--r-- source/libvpx/vp9/encoder/vp9_tokenize.h | 21
-rw-r--r-- source/libvpx/vp9/encoder/vp9_treewriter.c | 47
-rw-r--r-- source/libvpx/vp9/encoder/vp9_treewriter.h | 74
-rw-r--r-- source/libvpx/vp9/encoder/vp9_vaq.c | 4
-rw-r--r-- source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c | 2710
-rw-r--r-- source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c | 2579
-rw-r--r-- source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c | 69
-rw-r--r-- source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm | 198
-rw-r--r-- source/libvpx/vp9/vp9_common.mk | 19
-rw-r--r-- source/libvpx/vp9/vp9_cx_iface.c | 105
-rw-r--r-- source/libvpx/vp9/vp9_dx_iface.c | 100
-rw-r--r-- source/libvpx/vp9/vp9cx.mk | 10
-rw-r--r-- source/libvpx/vp9/vp9dx.mk | 5
-rw-r--r-- source/libvpx/vp9_spatial_scalable_encoder.c | 623
-rw-r--r-- source/libvpx/vpx/exports_dec | 1
-rw-r--r-- source/libvpx/vpx/exports_enc | 14
-rw-r--r-- source/libvpx/vpx/internal/vpx_codec_internal.h | 33
-rw-r--r-- source/libvpx/vpx/src/svc_encodeframe.c | 962
-rw-r--r-- source/libvpx/vpx/src/vpx_decoder.c | 21
-rw-r--r-- source/libvpx/vpx/svc_context.h | 141
-rw-r--r-- source/libvpx/vpx/vp8cx.h | 34
-rw-r--r-- source/libvpx/vpx/vp8dx.h | 12
-rw-r--r-- source/libvpx/vpx/vpx_codec.mk | 4
-rw-r--r-- source/libvpx/vpx/vpx_decoder.h | 48
-rw-r--r-- source/libvpx/vpx/vpx_external_frame_buffer.h | 53
-rw-r--r-- source/libvpx/vpx_ports/vpx_once.h | 6
-rw-r--r-- source/libvpx/vpx_ports/x86.h | 45
-rw-r--r-- source/libvpx/vpx_ports/x86_cpuid.c | 2
-rw-r--r-- source/libvpx/vpx_scale/generic/yv12config.c | 54
-rw-r--r-- source/libvpx/vpx_scale/generic/yv12extend.c | 15
-rw-r--r-- source/libvpx/vpx_scale/yv12config.h | 14
-rw-r--r-- source/libvpx/vpxdec.c | 619
-rw-r--r-- source/libvpx/vpxenc.c | 1008
-rw-r--r-- source/libvpx/vpxenc.h | 45
-rw-r--r-- source/libvpx/vpxstats.c | 135
-rw-r--r-- source/libvpx/vpxstats.h | 37
-rw-r--r-- source/libvpx/warnings.c | 117
-rw-r--r-- source/libvpx/warnings.h | 25
-rw-r--r-- source/libvpx/webmdec.c | 198
-rw-r--r-- source/libvpx/webmdec.h | 40
-rw-r--r-- source/libvpx/webmenc.c | 331
-rw-r--r-- source/libvpx/webmenc.h | 87
234 files changed, 26171 insertions, 10066 deletions
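In the per-target headers below, kernels whose required extension is guaranteed at build time (for example NEON on linux/arm-neon, or SSE2-only kernels on linux/x64) are bound directly with a #define, while kernels that still have a faster optional variant (NEON on the arm-neon-cpu-detect config, the new AVX2 paths on x86) keep the RTCD_EXTERN pointer form. A short sketch of the fixed-target style, reusing the hypothetical toy_filter names from the sketch above:

```c
/* Fixed-target variant (sketch, hypothetical names): when the build target
 * guarantees NEON, vp9_rtcd.h binds the generic name straight to the NEON
 * kernel with a #define instead of going through a runtime pointer. */
#include <stdint.h>
#include <stdio.h>

static void toy_filter_neon(uint8_t *s, int pitch) { (void)pitch; s[0] = 2; }

#define toy_filter toy_filter_neon  /* compile-time alias, no setup_rtcd_internal() step */

int main(void) {
  uint8_t row[4] = {0};
  toy_filter(row, 4);               /* expands to toy_filter_neon(row, 4) */
  printf("row[0] = %d\n", row[0]);
  return 0;
}
```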
diff --git a/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h b/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
index 89f1cd3..d0cbd25 100644
--- a/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
+++ b/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h
@@ -21,6 +21,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
@@ -184,14 +185,26 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit,
void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_vertical_edge_16_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_loop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_vertical_edge_16_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_loop_filter_vertical_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
@@ -200,10 +213,18 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *b
void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_horizontal_edge_16_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
@@ -282,10 +303,12 @@ void vp9_idct32x32_1024_add_neon(const int16_t *input, uint8_t *dest, int dest_s
RTCD_EXTERN void (*vp9_idct32x32_1024_add)(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_34_add vp9_idct32x32_34_add_c
+void vp9_idct32x32_1024_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_34_add)(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_c
+void vp9_idct32x32_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
+RTCD_EXTERN void (*vp9_idct32x32_1_add)(const int16_t *input, uint8_t *dest, int dest_stride);
void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
void vp9_iht4x4_16_add_neon(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
@@ -370,21 +393,36 @@ static void setup_rtcd_internal(void)
vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_c;
if (flags & HAS_NEON) vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_neon;
+ vp9_mb_lpf_vertical_edge_w_16 = vp9_mb_lpf_vertical_edge_w_16_c;
+ if (flags & HAS_NEON) vp9_mb_lpf_vertical_edge_w_16 = vp9_mb_lpf_vertical_edge_w_16_neon;
+
vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_c;
if (flags & HAS_NEON) vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_neon;
+ vp9_mbloop_filter_vertical_edge_16 = vp9_mbloop_filter_vertical_edge_16_c;
+ if (flags & HAS_NEON) vp9_mbloop_filter_vertical_edge_16 = vp9_mbloop_filter_vertical_edge_16_neon;
+
vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_c;
if (flags & HAS_NEON) vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_neon;
+ vp9_loop_filter_vertical_edge_16 = vp9_loop_filter_vertical_edge_16_c;
+ if (flags & HAS_NEON) vp9_loop_filter_vertical_edge_16 = vp9_loop_filter_vertical_edge_16_neon;
+
vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c;
if (flags & HAS_NEON) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_neon;
vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_c;
if (flags & HAS_NEON) vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_neon;
+ vp9_mbloop_filter_horizontal_edge_16 = vp9_mbloop_filter_horizontal_edge_16_c;
+ if (flags & HAS_NEON) vp9_mbloop_filter_horizontal_edge_16 = vp9_mbloop_filter_horizontal_edge_16_neon;
+
vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_c;
if (flags & HAS_NEON) vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_neon;
+ vp9_loop_filter_horizontal_edge_16 = vp9_loop_filter_horizontal_edge_16_c;
+ if (flags & HAS_NEON) vp9_loop_filter_horizontal_edge_16 = vp9_loop_filter_horizontal_edge_16_neon;
+
@@ -439,7 +477,11 @@ static void setup_rtcd_internal(void)
vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_c;
if (flags & HAS_NEON) vp9_idct32x32_1024_add = vp9_idct32x32_1024_add_neon;
+ vp9_idct32x32_34_add = vp9_idct32x32_34_add_c;
+ if (flags & HAS_NEON) vp9_idct32x32_34_add = vp9_idct32x32_1024_add_neon;
+ vp9_idct32x32_1_add = vp9_idct32x32_1_add_c;
+ if (flags & HAS_NEON) vp9_idct32x32_1_add = vp9_idct32x32_1_add_neon;
vp9_iht4x4_16_add = vp9_iht4x4_16_add_c;
if (flags & HAS_NEON) vp9_iht4x4_16_add = vp9_iht4x4_16_add_neon;
diff --git a/source/config/linux/arm-neon-cpu-detect/vpx_config.asm b/source/config/linux/arm-neon-cpu-detect/vpx_config.asm
index d23257e..8a2b3df 100644
--- a/source/config/linux/arm-neon-cpu-detect/vpx_config.asm
+++ b/source/config/linux/arm-neon-cpu-detect/vpx_config.asm
@@ -74,11 +74,11 @@
.equ CONFIG_POSTPROC_VISUALIZER , 0
.equ CONFIG_OS_SUPPORT , 1
.equ CONFIG_UNIT_TESTS , 0
+.equ CONFIG_DECODE_PERF_TESTS , 0
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_DECRYPT , 0
-.equ CONFIG_ONESHOTQ , 0
.equ CONFIG_MULTIPLE_ARF , 0
.equ CONFIG_NON420 , 0
.equ CONFIG_ALPHA , 0
diff --git a/source/config/linux/arm-neon-cpu-detect/vpx_config.h b/source/config/linux/arm-neon-cpu-detect/vpx_config.h
index 76e006f..2305f8e 100644
--- a/source/config/linux/arm-neon-cpu-detect/vpx_config.h
+++ b/source/config/linux/arm-neon-cpu-detect/vpx_config.h
@@ -83,11 +83,11 @@
#define CONFIG_POSTPROC_VISUALIZER 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_UNIT_TESTS 0
+#define CONFIG_DECODE_PERF_TESTS 0
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_DECRYPT 0
-#define CONFIG_ONESHOTQ 0
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
diff --git a/source/config/linux/arm-neon/vp9_rtcd.h b/source/config/linux/arm-neon/vp9_rtcd.h
index d9e2b15..8b3bd23 100644
--- a/source/config/linux/arm-neon/vp9_rtcd.h
+++ b/source/config/linux/arm-neon/vp9_rtcd.h
@@ -21,6 +21,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
@@ -184,14 +185,26 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit,
void vp9_mb_lpf_vertical_edge_w_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_neon
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_mb_lpf_vertical_edge_w_16 vp9_mb_lpf_vertical_edge_w_16_neon
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_neon
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_vertical_edge_16_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_vertical_edge_16 vp9_mbloop_filter_vertical_edge_16_neon
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_vertical_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_neon
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_vertical_edge_16_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_vertical_edge_16 vp9_loop_filter_vertical_edge_16_neon
+
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mb_lpf_horizontal_edge_w_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_neon
@@ -200,10 +213,18 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *b
void vp9_mbloop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_neon
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_horizontal_edge_16 vp9_mbloop_filter_horizontal_edge_16_neon
+
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_horizontal_edge_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_neon
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_horizontal_edge_16_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_horizontal_edge_16 vp9_loop_filter_horizontal_edge_16_neon
+
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
@@ -282,10 +303,12 @@ void vp9_idct32x32_1024_add_neon(const int16_t *input, uint8_t *dest, int dest_s
#define vp9_idct32x32_1024_add vp9_idct32x32_1024_add_neon
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_34_add vp9_idct32x32_34_add_c
+void vp9_idct32x32_1024_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct32x32_34_add vp9_idct32x32_1024_add_neon
void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride);
-#define vp9_idct32x32_1_add vp9_idct32x32_1_add_c
+void vp9_idct32x32_1_add_neon(const int16_t *input, uint8_t *dest, int dest_stride);
+#define vp9_idct32x32_1_add vp9_idct32x32_1_add_neon
void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
void vp9_iht4x4_16_add_neon(const int16_t *input, uint8_t *dest, int dest_stride, int tx_type);
diff --git a/source/config/linux/arm-neon/vpx_config.asm b/source/config/linux/arm-neon/vpx_config.asm
index edd494e..e36c9ac 100644
--- a/source/config/linux/arm-neon/vpx_config.asm
+++ b/source/config/linux/arm-neon/vpx_config.asm
@@ -74,11 +74,11 @@
.equ CONFIG_POSTPROC_VISUALIZER , 0
.equ CONFIG_OS_SUPPORT , 1
.equ CONFIG_UNIT_TESTS , 0
+.equ CONFIG_DECODE_PERF_TESTS , 0
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_DECRYPT , 0
-.equ CONFIG_ONESHOTQ , 0
.equ CONFIG_MULTIPLE_ARF , 0
.equ CONFIG_NON420 , 0
.equ CONFIG_ALPHA , 0
diff --git a/source/config/linux/arm-neon/vpx_config.h b/source/config/linux/arm-neon/vpx_config.h
index a8dccf5..cdadfa8 100644
--- a/source/config/linux/arm-neon/vpx_config.h
+++ b/source/config/linux/arm-neon/vpx_config.h
@@ -83,11 +83,11 @@
#define CONFIG_POSTPROC_VISUALIZER 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_UNIT_TESTS 0
+#define CONFIG_DECODE_PERF_TESTS 0
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_DECRYPT 0
-#define CONFIG_ONESHOTQ 0
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
diff --git a/source/config/linux/arm/vp9_rtcd.h b/source/config/linux/arm/vp9_rtcd.h
index abcb56a..87d258f 100644
--- a/source/config/linux/arm/vp9_rtcd.h
+++ b/source/config/linux/arm/vp9_rtcd.h
@@ -21,6 +21,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
@@ -183,21 +184,36 @@ void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_
void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_c
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_mb_lpf_vertical_edge_w_16 vp9_mb_lpf_vertical_edge_w_16_c
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_c
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_vertical_edge_16 vp9_mbloop_filter_vertical_edge_16_c
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_vertical_edge_16 vp9_loop_filter_vertical_edge_16_c
+
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_c
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_horizontal_edge_16 vp9_mbloop_filter_horizontal_edge_16_c
+
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_c
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_horizontal_edge_16 vp9_loop_filter_horizontal_edge_16_c
+
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
diff --git a/source/config/linux/arm/vpx_config.asm b/source/config/linux/arm/vpx_config.asm
index 6c56e96..6f88245 100644
--- a/source/config/linux/arm/vpx_config.asm
+++ b/source/config/linux/arm/vpx_config.asm
@@ -74,11 +74,11 @@
.equ CONFIG_POSTPROC_VISUALIZER , 0
.equ CONFIG_OS_SUPPORT , 1
.equ CONFIG_UNIT_TESTS , 0
+.equ CONFIG_DECODE_PERF_TESTS , 0
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_DECRYPT , 0
-.equ CONFIG_ONESHOTQ , 0
.equ CONFIG_MULTIPLE_ARF , 0
.equ CONFIG_NON420 , 0
.equ CONFIG_ALPHA , 0
diff --git a/source/config/linux/arm/vpx_config.h b/source/config/linux/arm/vpx_config.h
index 15a0d7f..b2ee4e4 100644
--- a/source/config/linux/arm/vpx_config.h
+++ b/source/config/linux/arm/vpx_config.h
@@ -83,11 +83,11 @@
#define CONFIG_POSTPROC_VISUALIZER 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_UNIT_TESTS 0
+#define CONFIG_DECODE_PERF_TESTS 0
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_DECRYPT 0
-#define CONFIG_ONESHOTQ 0
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
diff --git a/source/config/linux/ia32/vp9_rtcd.h b/source/config/linux/ia32/vp9_rtcd.h
index 0d474e8..eda5e68 100644
--- a/source/config/linux/ia32/vp9_rtcd.h
+++ b/source/config/linux/ia32/vp9_rtcd.h
@@ -21,6 +21,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
@@ -178,7 +179,8 @@ void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8
RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
@@ -213,26 +215,47 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit,
void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_loop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_loop_filter_vertical_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_mb_lpf_horizontal_edge_w_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
@@ -448,6 +471,8 @@ static void setup_rtcd_internal(void)
vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c;
if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3;
+ vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3;
@@ -465,21 +490,37 @@ static void setup_rtcd_internal(void)
vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_c;
if (flags & HAS_SSE2) vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_sse2;
+ vp9_mb_lpf_vertical_edge_w_16 = vp9_mb_lpf_vertical_edge_w_16_c;
+ if (flags & HAS_SSE2) vp9_mb_lpf_vertical_edge_w_16 = vp9_mb_lpf_vertical_edge_w_16_sse2;
+
vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_c;
if (flags & HAS_SSE2) vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_sse2;
+ vp9_mbloop_filter_vertical_edge_16 = vp9_mbloop_filter_vertical_edge_16_c;
+ if (flags & HAS_SSE2) vp9_mbloop_filter_vertical_edge_16 = vp9_mbloop_filter_vertical_edge_16_sse2;
+
vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_c;
if (flags & HAS_MMX) vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_mmx;
+ vp9_loop_filter_vertical_edge_16 = vp9_loop_filter_vertical_edge_16_c;
+ if (flags & HAS_SSE2) vp9_loop_filter_vertical_edge_16 = vp9_loop_filter_vertical_edge_16_sse2;
+
vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c;
if (flags & HAS_SSE2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2;
+ if (flags & HAS_AVX2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_avx2;
vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_c;
if (flags & HAS_SSE2) vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_sse2;
+ vp9_mbloop_filter_horizontal_edge_16 = vp9_mbloop_filter_horizontal_edge_16_c;
+ if (flags & HAS_SSE2) vp9_mbloop_filter_horizontal_edge_16 = vp9_mbloop_filter_horizontal_edge_16_sse2;
+
vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_c;
if (flags & HAS_MMX) vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_mmx;
+ vp9_loop_filter_horizontal_edge_16 = vp9_loop_filter_horizontal_edge_16_c;
+ if (flags & HAS_SSE2) vp9_loop_filter_horizontal_edge_16 = vp9_loop_filter_horizontal_edge_16_sse2;
+
diff --git a/source/config/linux/ia32/vpx_config.asm b/source/config/linux/ia32/vpx_config.asm
index 8298d42..8006646 100644
--- a/source/config/linux/ia32/vpx_config.asm
+++ b/source/config/linux/ia32/vpx_config.asm
@@ -71,11 +71,11 @@ CONFIG_SMALL equ 0
CONFIG_POSTPROC_VISUALIZER equ 0
CONFIG_OS_SUPPORT equ 1
CONFIG_UNIT_TESTS equ 0
+CONFIG_DECODE_PERF_TESTS equ 0
CONFIG_MULTI_RES_ENCODING equ 1
CONFIG_TEMPORAL_DENOISING equ 1
CONFIG_EXPERIMENTAL equ 0
CONFIG_DECRYPT equ 0
-CONFIG_ONESHOTQ equ 0
CONFIG_MULTIPLE_ARF equ 0
CONFIG_NON420 equ 0
CONFIG_ALPHA equ 0
diff --git a/source/config/linux/ia32/vpx_config.h b/source/config/linux/ia32/vpx_config.h
index 21afb45..a01c031 100644
--- a/source/config/linux/ia32/vpx_config.h
+++ b/source/config/linux/ia32/vpx_config.h
@@ -83,11 +83,11 @@
#define CONFIG_POSTPROC_VISUALIZER 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_UNIT_TESTS 0
+#define CONFIG_DECODE_PERF_TESTS 0
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_DECRYPT 0
-#define CONFIG_ONESHOTQ 0
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
diff --git a/source/config/linux/mipsel/vp9_rtcd.h b/source/config/linux/mipsel/vp9_rtcd.h
index d9e1b0b..39abdba 100644
--- a/source/config/linux/mipsel/vp9_rtcd.h
+++ b/source/config/linux/mipsel/vp9_rtcd.h
@@ -21,6 +21,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
@@ -183,21 +184,36 @@ void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_
void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_c
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_mb_lpf_vertical_edge_w_16 vp9_mb_lpf_vertical_edge_w_16_c
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_c
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_vertical_edge_16 vp9_mbloop_filter_vertical_edge_16_c
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_vertical_edge_16 vp9_loop_filter_vertical_edge_16_c
+
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_c
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_horizontal_edge_16 vp9_mbloop_filter_horizontal_edge_16_c
+
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_c
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_horizontal_edge_16 vp9_loop_filter_horizontal_edge_16_c
+
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
diff --git a/source/config/linux/mipsel/vpx_config.h b/source/config/linux/mipsel/vpx_config.h
index e679a84..b400c6e 100644
--- a/source/config/linux/mipsel/vpx_config.h
+++ b/source/config/linux/mipsel/vpx_config.h
@@ -83,11 +83,11 @@
#define CONFIG_POSTPROC_VISUALIZER 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_UNIT_TESTS 0
+#define CONFIG_DECODE_PERF_TESTS 0
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_DECRYPT 0
-#define CONFIG_ONESHOTQ 0
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
diff --git a/source/config/linux/x64/vp9_rtcd.h b/source/config/linux/x64/vp9_rtcd.h
index 16af84a..29a910f 100644
--- a/source/config/linux/x64/vp9_rtcd.h
+++ b/source/config/linux/x64/vp9_rtcd.h
@@ -21,6 +21,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
@@ -178,7 +179,8 @@ void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8
RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
@@ -214,26 +216,47 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit,
void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_sse2
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_mb_lpf_vertical_edge_w_16 vp9_mb_lpf_vertical_edge_w_16_sse2
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_sse2
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_vertical_edge_16 vp9_mbloop_filter_vertical_edge_16_sse2
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_mmx
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_vertical_edge_16 vp9_loop_filter_vertical_edge_16_sse2
+
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_sse2
+void vp9_mb_lpf_horizontal_edge_w_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_sse2
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_horizontal_edge_16 vp9_mbloop_filter_horizontal_edge_16_sse2
+
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_mmx
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_horizontal_edge_16 vp9_loop_filter_horizontal_edge_16_sse2
+
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
@@ -431,6 +454,13 @@ static void setup_rtcd_internal(void)
vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c;
if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3;
+ vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3;
+
+
+
+
+
@@ -442,6 +472,8 @@ static void setup_rtcd_internal(void)
+ vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2;
+ if (flags & HAS_AVX2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_avx2;
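The new *_16 loop-filter entry points declared above take two full sets of blimit/limit/thresh pointers, which suggests a single call covers two adjacent 8-pixel edge segments instead of two separate single-edge calls; batching the pair is what makes a 16-wide SSE2 pass worthwhile. A minimal sketch of that calling pattern, assuming this dual-segment interpretation (the wrapper below is illustrative only, not the upstream C implementation):

    #include <stdint.h>

    /* Single-edge prototype, as declared in vp9_rtcd.h above. */
    void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch,
                                           const uint8_t *blimit,
                                           const uint8_t *limit,
                                           const uint8_t *thresh, int count);

    /* Hypothetical reference behavior for the dual "_16" variant: apply the
     * single-edge filter to two adjacent 8-pixel segments, each with its own
     * thresholds.  An SSE2 version would do both segments in one 16-wide pass. */
    static void loop_filter_horizontal_edge_16_sketch(
        uint8_t *s, int pitch,
        const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0,
        const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) {
      vp9_loop_filter_horizontal_edge_c(s, pitch, blimit0, limit0, thresh0, 1);
      vp9_loop_filter_horizontal_edge_c(s + 8, pitch, blimit1, limit1, thresh1, 1);
    }
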
diff --git a/source/config/linux/x64/vpx_config.asm b/source/config/linux/x64/vpx_config.asm
index 6c6b584..6b13a30 100644
--- a/source/config/linux/x64/vpx_config.asm
+++ b/source/config/linux/x64/vpx_config.asm
@@ -71,11 +71,11 @@ CONFIG_SMALL equ 0
CONFIG_POSTPROC_VISUALIZER equ 0
CONFIG_OS_SUPPORT equ 1
CONFIG_UNIT_TESTS equ 0
+CONFIG_DECODE_PERF_TESTS equ 0
CONFIG_MULTI_RES_ENCODING equ 1
CONFIG_TEMPORAL_DENOISING equ 1
CONFIG_EXPERIMENTAL equ 0
CONFIG_DECRYPT equ 0
-CONFIG_ONESHOTQ equ 0
CONFIG_MULTIPLE_ARF equ 0
CONFIG_NON420 equ 0
CONFIG_ALPHA equ 0
diff --git a/source/config/linux/x64/vpx_config.h b/source/config/linux/x64/vpx_config.h
index 5b4626c..3fa7ed6 100644
--- a/source/config/linux/x64/vpx_config.h
+++ b/source/config/linux/x64/vpx_config.h
@@ -83,11 +83,11 @@
#define CONFIG_POSTPROC_VISUALIZER 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_UNIT_TESTS 0
+#define CONFIG_DECODE_PERF_TESTS 0
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_DECRYPT 0
-#define CONFIG_ONESHOTQ 0
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
diff --git a/source/config/mac/ia32/vp9_rtcd.h b/source/config/mac/ia32/vp9_rtcd.h
index 0543c77..47d3592 100644
--- a/source/config/mac/ia32/vp9_rtcd.h
+++ b/source/config/mac/ia32/vp9_rtcd.h
@@ -21,6 +21,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
@@ -184,26 +185,47 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit,
void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_loop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_loop_filter_vertical_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_mb_lpf_horizontal_edge_w_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
@@ -376,21 +398,37 @@ static void setup_rtcd_internal(void)
vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_c;
if (flags & HAS_SSE2) vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_sse2;
+ vp9_mb_lpf_vertical_edge_w_16 = vp9_mb_lpf_vertical_edge_w_16_c;
+ if (flags & HAS_SSE2) vp9_mb_lpf_vertical_edge_w_16 = vp9_mb_lpf_vertical_edge_w_16_sse2;
+
vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_c;
if (flags & HAS_SSE2) vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_sse2;
+ vp9_mbloop_filter_vertical_edge_16 = vp9_mbloop_filter_vertical_edge_16_c;
+ if (flags & HAS_SSE2) vp9_mbloop_filter_vertical_edge_16 = vp9_mbloop_filter_vertical_edge_16_sse2;
+
vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_c;
if (flags & HAS_MMX) vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_mmx;
+ vp9_loop_filter_vertical_edge_16 = vp9_loop_filter_vertical_edge_16_c;
+ if (flags & HAS_SSE2) vp9_loop_filter_vertical_edge_16 = vp9_loop_filter_vertical_edge_16_sse2;
+
vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c;
if (flags & HAS_SSE2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2;
+ if (flags & HAS_AVX2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_avx2;
vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_c;
if (flags & HAS_SSE2) vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_sse2;
+ vp9_mbloop_filter_horizontal_edge_16 = vp9_mbloop_filter_horizontal_edge_16_c;
+ if (flags & HAS_SSE2) vp9_mbloop_filter_horizontal_edge_16 = vp9_mbloop_filter_horizontal_edge_16_sse2;
+
vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_c;
if (flags & HAS_MMX) vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_mmx;
+ vp9_loop_filter_horizontal_edge_16 = vp9_loop_filter_horizontal_edge_16_c;
+ if (flags & HAS_SSE2) vp9_loop_filter_horizontal_edge_16 = vp9_loop_filter_horizontal_edge_16_sse2;
+
diff --git a/source/config/mac/ia32/vpx_config.asm b/source/config/mac/ia32/vpx_config.asm
index fc03ea0..7d7e8ff 100644
--- a/source/config/mac/ia32/vpx_config.asm
+++ b/source/config/mac/ia32/vpx_config.asm
@@ -71,11 +71,11 @@ CONFIG_SMALL equ 0
CONFIG_POSTPROC_VISUALIZER equ 0
CONFIG_OS_SUPPORT equ 1
CONFIG_UNIT_TESTS equ 0
+CONFIG_DECODE_PERF_TESTS equ 0
CONFIG_MULTI_RES_ENCODING equ 1
CONFIG_TEMPORAL_DENOISING equ 1
CONFIG_EXPERIMENTAL equ 0
CONFIG_DECRYPT equ 0
-CONFIG_ONESHOTQ equ 0
CONFIG_MULTIPLE_ARF equ 0
CONFIG_NON420 equ 0
CONFIG_ALPHA equ 0
diff --git a/source/config/mac/ia32/vpx_config.h b/source/config/mac/ia32/vpx_config.h
index 161a2bf..efa6898 100644
--- a/source/config/mac/ia32/vpx_config.h
+++ b/source/config/mac/ia32/vpx_config.h
@@ -83,11 +83,11 @@
#define CONFIG_POSTPROC_VISUALIZER 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_UNIT_TESTS 0
+#define CONFIG_DECODE_PERF_TESTS 0
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_DECRYPT 0
-#define CONFIG_ONESHOTQ 0
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
diff --git a/source/config/mac/x64/vp9_rtcd.h b/source/config/mac/x64/vp9_rtcd.h
index 16af84a..29a910f 100644
--- a/source/config/mac/x64/vp9_rtcd.h
+++ b/source/config/mac/x64/vp9_rtcd.h
@@ -21,6 +21,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
@@ -178,7 +179,8 @@ void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8
RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
@@ -214,26 +216,47 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit,
void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_sse2
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_mb_lpf_vertical_edge_w_16 vp9_mb_lpf_vertical_edge_w_16_sse2
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_sse2
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_vertical_edge_16 vp9_mbloop_filter_vertical_edge_16_sse2
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_mmx
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_vertical_edge_16 vp9_loop_filter_vertical_edge_16_sse2
+
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_sse2
+void vp9_mb_lpf_horizontal_edge_w_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_sse2
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_horizontal_edge_16 vp9_mbloop_filter_horizontal_edge_16_sse2
+
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_mmx
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_horizontal_edge_16 vp9_loop_filter_horizontal_edge_16_sse2
+
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
@@ -431,6 +454,13 @@ static void setup_rtcd_internal(void)
vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c;
if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3;
+ vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3;
+
+
+
+
+
@@ -442,6 +472,8 @@ static void setup_rtcd_internal(void)
+ vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2;
+ if (flags & HAS_AVX2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_avx2;
diff --git a/source/config/mac/x64/vpx_config.asm b/source/config/mac/x64/vpx_config.asm
index 6c6b584..6b13a30 100644
--- a/source/config/mac/x64/vpx_config.asm
+++ b/source/config/mac/x64/vpx_config.asm
@@ -71,11 +71,11 @@ CONFIG_SMALL equ 0
CONFIG_POSTPROC_VISUALIZER equ 0
CONFIG_OS_SUPPORT equ 1
CONFIG_UNIT_TESTS equ 0
+CONFIG_DECODE_PERF_TESTS equ 0
CONFIG_MULTI_RES_ENCODING equ 1
CONFIG_TEMPORAL_DENOISING equ 1
CONFIG_EXPERIMENTAL equ 0
CONFIG_DECRYPT equ 0
-CONFIG_ONESHOTQ equ 0
CONFIG_MULTIPLE_ARF equ 0
CONFIG_NON420 equ 0
CONFIG_ALPHA equ 0
diff --git a/source/config/mac/x64/vpx_config.h b/source/config/mac/x64/vpx_config.h
index 5b4626c..3fa7ed6 100644
--- a/source/config/mac/x64/vpx_config.h
+++ b/source/config/mac/x64/vpx_config.h
@@ -83,11 +83,11 @@
#define CONFIG_POSTPROC_VISUALIZER 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_UNIT_TESTS 0
+#define CONFIG_DECODE_PERF_TESTS 0
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_DECRYPT 0
-#define CONFIG_ONESHOTQ 0
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
diff --git a/source/config/nacl/vp9_rtcd.h b/source/config/nacl/vp9_rtcd.h
index d9e1b0b..39abdba 100644
--- a/source/config/nacl/vp9_rtcd.h
+++ b/source/config/nacl/vp9_rtcd.h
@@ -21,6 +21,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
@@ -183,21 +184,36 @@ void vp9_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_
void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_c
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_mb_lpf_vertical_edge_w_16 vp9_mb_lpf_vertical_edge_w_16_c
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_c
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_vertical_edge_16 vp9_mbloop_filter_vertical_edge_16_c
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_c
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_vertical_edge_16 vp9_loop_filter_vertical_edge_16_c
+
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_c
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_c
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_horizontal_edge_16 vp9_mbloop_filter_horizontal_edge_16_c
+
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_c
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_horizontal_edge_16 vp9_loop_filter_horizontal_edge_16_c
+
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
diff --git a/source/config/nacl/vpx_config.asm b/source/config/nacl/vpx_config.asm
index ecbfadf..8aa0947 100644
--- a/source/config/nacl/vpx_config.asm
+++ b/source/config/nacl/vpx_config.asm
@@ -74,11 +74,11 @@
.equ CONFIG_POSTPROC_VISUALIZER , 0
.equ CONFIG_OS_SUPPORT , 1
.equ CONFIG_UNIT_TESTS , 0
+.equ CONFIG_DECODE_PERF_TESTS , 0
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_DECRYPT , 0
-.equ CONFIG_ONESHOTQ , 0
.equ CONFIG_MULTIPLE_ARF , 0
.equ CONFIG_NON420 , 0
.equ CONFIG_ALPHA , 0
diff --git a/source/config/nacl/vpx_config.h b/source/config/nacl/vpx_config.h
index ce3f078..86e5d25 100644
--- a/source/config/nacl/vpx_config.h
+++ b/source/config/nacl/vpx_config.h
@@ -83,11 +83,11 @@
#define CONFIG_POSTPROC_VISUALIZER 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_UNIT_TESTS 0
+#define CONFIG_DECODE_PERF_TESTS 0
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_DECRYPT 0
-#define CONFIG_ONESHOTQ 0
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
diff --git a/source/config/vpx_version.h b/source/config/vpx_version.h
index 512851c..f60dcc7 100644
--- a/source/config/vpx_version.h
+++ b/source/config/vpx_version.h
@@ -1,7 +1,7 @@
#define VERSION_MAJOR 1
-#define VERSION_MINOR 2
+#define VERSION_MINOR 3
#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.2.0"
-#define VERSION_STRING " v1.2.0"
+#define VERSION_STRING_NOSP "v1.3.0"
+#define VERSION_STRING " v1.3.0"
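For reference, VERSION_PACKED folds the three version components into a single integer, so the v1.2.0 to v1.3.0 bump changes the packed value from 0x010200 to 0x010300. A quick standalone check of that arithmetic:

    #include <stdio.h>

    #define VERSION_MAJOR 1
    #define VERSION_MINOR 3
    #define VERSION_PATCH 0
    #define VERSION_PACKED \
      ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))

    int main(void) {
      /* (1 << 16) | (3 << 8) | 0 == 0x010300 for v1.3.0 */
      printf("0x%06x\n", VERSION_PACKED);
      return 0;
    }
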
diff --git a/source/config/win/ia32/vp9_rtcd.h b/source/config/win/ia32/vp9_rtcd.h
index 0d474e8..eda5e68 100644
--- a/source/config/win/ia32/vp9_rtcd.h
+++ b/source/config/win/ia32/vp9_rtcd.h
@@ -21,6 +21,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
@@ -178,7 +179,8 @@ void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8
RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
@@ -213,26 +215,47 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit,
void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+RTCD_EXTERN void (*vp9_mb_lpf_vertical_edge_w_16)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_mbloop_filter_vertical_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_loop_filter_vertical_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_loop_filter_vertical_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_mb_lpf_horizontal_edge_w_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_mbloop_filter_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+RTCD_EXTERN void (*vp9_loop_filter_horizontal_edge_16)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
@@ -448,6 +471,8 @@ static void setup_rtcd_internal(void)
vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c;
if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3;
+ vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3;
@@ -465,21 +490,37 @@ static void setup_rtcd_internal(void)
vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_c;
if (flags & HAS_SSE2) vp9_mb_lpf_vertical_edge_w = vp9_mb_lpf_vertical_edge_w_sse2;
+ vp9_mb_lpf_vertical_edge_w_16 = vp9_mb_lpf_vertical_edge_w_16_c;
+ if (flags & HAS_SSE2) vp9_mb_lpf_vertical_edge_w_16 = vp9_mb_lpf_vertical_edge_w_16_sse2;
+
vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_c;
if (flags & HAS_SSE2) vp9_mbloop_filter_vertical_edge = vp9_mbloop_filter_vertical_edge_sse2;
+ vp9_mbloop_filter_vertical_edge_16 = vp9_mbloop_filter_vertical_edge_16_c;
+ if (flags & HAS_SSE2) vp9_mbloop_filter_vertical_edge_16 = vp9_mbloop_filter_vertical_edge_16_sse2;
+
vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_c;
if (flags & HAS_MMX) vp9_loop_filter_vertical_edge = vp9_loop_filter_vertical_edge_mmx;
+ vp9_loop_filter_vertical_edge_16 = vp9_loop_filter_vertical_edge_16_c;
+ if (flags & HAS_SSE2) vp9_loop_filter_vertical_edge_16 = vp9_loop_filter_vertical_edge_16_sse2;
+
vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_c;
if (flags & HAS_SSE2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2;
+ if (flags & HAS_AVX2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_avx2;
vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_c;
if (flags & HAS_SSE2) vp9_mbloop_filter_horizontal_edge = vp9_mbloop_filter_horizontal_edge_sse2;
+ vp9_mbloop_filter_horizontal_edge_16 = vp9_mbloop_filter_horizontal_edge_16_c;
+ if (flags & HAS_SSE2) vp9_mbloop_filter_horizontal_edge_16 = vp9_mbloop_filter_horizontal_edge_16_sse2;
+
vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_c;
if (flags & HAS_MMX) vp9_loop_filter_horizontal_edge = vp9_loop_filter_horizontal_edge_mmx;
+ vp9_loop_filter_horizontal_edge_16 = vp9_loop_filter_horizontal_edge_16_c;
+ if (flags & HAS_SSE2) vp9_loop_filter_horizontal_edge_16 = vp9_loop_filter_horizontal_edge_16_sse2;
+
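Every setup_rtcd_internal() hunk in this change follows the same run-time dispatch pattern: each RTCD_EXTERN symbol is a function pointer initialized to the plain C version and then upgraded when the detected CPU flags include the matching extension (MMX, SSE2, AVX2). A small self-contained sketch of that pattern, using hypothetical names rather than the generated libvpx symbols:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical CPU-feature bits, mirroring HAS_MMX / HAS_SSE2 / HAS_AVX2. */
    #define HAS_SSE2 (1 << 0)
    #define HAS_AVX2 (1 << 1)

    static void filter_c(uint8_t *s, int pitch)    { (void)s; (void)pitch; puts("C"); }
    static void filter_sse2(uint8_t *s, int pitch) { (void)s; (void)pitch; puts("SSE2"); }
    static void filter_avx2(uint8_t *s, int pitch) { (void)s; (void)pitch; puts("AVX2"); }

    /* The RTCD_EXTERN-style pointer: callers always go through this. */
    static void (*filter)(uint8_t *s, int pitch);

    /* Same shape as setup_rtcd_internal(): start at the C fallback, then
     * overwrite with the best variant the detected flags allow. */
    static void setup_rtcd_sketch(int flags) {
      filter = filter_c;
      if (flags & HAS_SSE2) filter = filter_sse2;
      if (flags & HAS_AVX2) filter = filter_avx2;
    }

    int main(void) {
      uint8_t row[16] = {0};
      setup_rtcd_sketch(HAS_SSE2);   /* pretend cpuid reported SSE2 only */
      filter(row, 16);               /* prints "SSE2" */
      return 0;
    }
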
diff --git a/source/config/win/ia32/vpx_config.asm b/source/config/win/ia32/vpx_config.asm
index 9de98dd..59f65b7 100644
--- a/source/config/win/ia32/vpx_config.asm
+++ b/source/config/win/ia32/vpx_config.asm
@@ -15,8 +15,8 @@ HAVE_SSE2 equ 1
HAVE_SSE3 equ 1
HAVE_SSSE3 equ 1
HAVE_SSE4_1 equ 1
-HAVE_AVX equ 1
-HAVE_AVX2 equ 1
+HAVE_AVX equ 0
+HAVE_AVX2 equ 0
HAVE_ALTIVEC equ 0
HAVE_VPX_PORTS equ 1
HAVE_STDINT_H equ 0
@@ -71,11 +71,11 @@ CONFIG_SMALL equ 0
CONFIG_POSTPROC_VISUALIZER equ 0
CONFIG_OS_SUPPORT equ 1
CONFIG_UNIT_TESTS equ 0
+CONFIG_DECODE_PERF_TESTS equ 0
CONFIG_MULTI_RES_ENCODING equ 1
CONFIG_TEMPORAL_DENOISING equ 1
CONFIG_EXPERIMENTAL equ 0
CONFIG_DECRYPT equ 0
-CONFIG_ONESHOTQ equ 0
CONFIG_MULTIPLE_ARF equ 0
CONFIG_NON420 equ 0
CONFIG_ALPHA equ 0
diff --git a/source/config/win/ia32/vpx_config.h b/source/config/win/ia32/vpx_config.h
index 59c47c8..06db230 100644
--- a/source/config/win/ia32/vpx_config.h
+++ b/source/config/win/ia32/vpx_config.h
@@ -27,8 +27,8 @@
#define HAVE_SSE3 1
#define HAVE_SSSE3 1
#define HAVE_SSE4_1 1
-#define HAVE_AVX 1
-#define HAVE_AVX2 1
+#define HAVE_AVX 0
+#define HAVE_AVX2 0
#define HAVE_ALTIVEC 0
#define HAVE_VPX_PORTS 1
#define HAVE_STDINT_H 0
@@ -83,11 +83,11 @@
#define CONFIG_POSTPROC_VISUALIZER 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_UNIT_TESTS 0
+#define CONFIG_DECODE_PERF_TESTS 0
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_DECRYPT 0
-#define CONFIG_ONESHOTQ 0
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
diff --git a/source/config/win/x64/vp9_rtcd.h b/source/config/win/x64/vp9_rtcd.h
index 16af84a..29a910f 100644
--- a/source/config/win/x64/vp9_rtcd.h
+++ b/source/config/win/x64/vp9_rtcd.h
@@ -21,6 +21,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
@@ -178,7 +179,8 @@ void vp9_d63_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8
RTCD_EXTERN void (*vp9_d63_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
-#define vp9_h_predictor_32x32 vp9_h_predictor_32x32_c
+void vp9_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+RTCD_EXTERN void (*vp9_h_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
void vp9_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vp9_d117_predictor_32x32 vp9_d117_predictor_32x32_c
@@ -214,26 +216,47 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit,
void vp9_mb_lpf_vertical_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
#define vp9_mb_lpf_vertical_edge_w vp9_mb_lpf_vertical_edge_w_sse2
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vp9_mb_lpf_vertical_edge_w_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vp9_mb_lpf_vertical_edge_w_16 vp9_mb_lpf_vertical_edge_w_16_sse2
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_vertical_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_vertical_edge vp9_mbloop_filter_vertical_edge_sse2
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_vertical_edge_16 vp9_mbloop_filter_vertical_edge_16_sse2
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_vertical_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_vertical_edge vp9_loop_filter_vertical_edge_mmx
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_vertical_edge_16 vp9_loop_filter_vertical_edge_16_sse2
+
void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mb_lpf_horizontal_edge_w_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
-#define vp9_mb_lpf_horizontal_edge_w vp9_mb_lpf_horizontal_edge_w_sse2
+void vp9_mb_lpf_horizontal_edge_w_avx2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
+RTCD_EXTERN void (*vp9_mb_lpf_horizontal_edge_w)(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_mbloop_filter_horizontal_edge_sse2(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_mbloop_filter_horizontal_edge vp9_mbloop_filter_horizontal_edge_sse2
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_mbloop_filter_horizontal_edge_16 vp9_mbloop_filter_horizontal_edge_16_sse2
+
void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
void vp9_loop_filter_horizontal_edge_mmx(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count);
#define vp9_loop_filter_horizontal_edge vp9_loop_filter_horizontal_edge_mmx
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void vp9_loop_filter_horizontal_edge_16_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+#define vp9_loop_filter_horizontal_edge_16 vp9_loop_filter_horizontal_edge_16_sse2
+
void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride);
#define vp9_blend_mb_inner vp9_blend_mb_inner_c
@@ -431,6 +454,13 @@ static void setup_rtcd_internal(void)
vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_c;
if (flags & HAS_SSSE3) vp9_d63_predictor_32x32 = vp9_d63_predictor_32x32_ssse3;
+ vp9_h_predictor_32x32 = vp9_h_predictor_32x32_c;
+ if (flags & HAS_SSSE3) vp9_h_predictor_32x32 = vp9_h_predictor_32x32_ssse3;
+
+
+
+
+
@@ -442,6 +472,8 @@ static void setup_rtcd_internal(void)
+ vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_sse2;
+ if (flags & HAS_AVX2) vp9_mb_lpf_horizontal_edge_w = vp9_mb_lpf_horizontal_edge_w_avx2;
diff --git a/source/config/win/x64/vpx_config.asm b/source/config/win/x64/vpx_config.asm
index c70d920..c236d8a 100644
--- a/source/config/win/x64/vpx_config.asm
+++ b/source/config/win/x64/vpx_config.asm
@@ -15,8 +15,8 @@ HAVE_SSE2 equ 1
HAVE_SSE3 equ 1
HAVE_SSSE3 equ 1
HAVE_SSE4_1 equ 1
-HAVE_AVX equ 1
-HAVE_AVX2 equ 1
+HAVE_AVX equ 0
+HAVE_AVX2 equ 0
HAVE_ALTIVEC equ 0
HAVE_VPX_PORTS equ 1
HAVE_STDINT_H equ 0
@@ -71,11 +71,11 @@ CONFIG_SMALL equ 0
CONFIG_POSTPROC_VISUALIZER equ 0
CONFIG_OS_SUPPORT equ 1
CONFIG_UNIT_TESTS equ 0
+CONFIG_DECODE_PERF_TESTS equ 0
CONFIG_MULTI_RES_ENCODING equ 1
CONFIG_TEMPORAL_DENOISING equ 1
CONFIG_EXPERIMENTAL equ 0
CONFIG_DECRYPT equ 0
-CONFIG_ONESHOTQ equ 0
CONFIG_MULTIPLE_ARF equ 0
CONFIG_NON420 equ 0
CONFIG_ALPHA equ 0
diff --git a/source/config/win/x64/vpx_config.h b/source/config/win/x64/vpx_config.h
index 4dc8c5f..67914ae 100644
--- a/source/config/win/x64/vpx_config.h
+++ b/source/config/win/x64/vpx_config.h
@@ -27,8 +27,8 @@
#define HAVE_SSE3 1
#define HAVE_SSSE3 1
#define HAVE_SSE4_1 1
-#define HAVE_AVX 1
-#define HAVE_AVX2 1
+#define HAVE_AVX 0
+#define HAVE_AVX2 0
#define HAVE_ALTIVEC 0
#define HAVE_VPX_PORTS 1
#define HAVE_STDINT_H 0
@@ -83,11 +83,11 @@
#define CONFIG_POSTPROC_VISUALIZER 0
#define CONFIG_OS_SUPPORT 1
#define CONFIG_UNIT_TESTS 0
+#define CONFIG_DECODE_PERF_TESTS 0
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_DECRYPT 0
-#define CONFIG_ONESHOTQ 0
#define CONFIG_MULTIPLE_ARF 0
#define CONFIG_NON420 0
#define CONFIG_ALPHA 0
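Setting HAVE_AVX and HAVE_AVX2 to 0 for these Windows targets matches the configure.sh hunk later in the diff, which disables AVX for Visual Studio versions whose tools cannot handle it; code that needs AVX2 is then expected to compile away behind the config macro. A hedged sketch of that guard style (the function name is illustrative, not an actual libvpx source):

    #include "vpx_config.h"   /* defines HAVE_AVX2 as 0 or 1 for the target */

    #if HAVE_AVX2
    #include <immintrin.h>

    /* Compiled only when configure left AVX2 enabled for this toolchain. */
    int vpx_example_has_avx2_path(void) { return 1; }
    #else

    /* With older Visual Studio, HAVE_AVX2 is forced to 0 above, so this
     * fallback is what ends up in the build. */
    int vpx_example_has_avx2_path(void) { return 0; }
    #endif
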
diff --git a/source/libvpx/.mailmap b/source/libvpx/.mailmap
index ba1279b..fb82a24 100644
--- a/source/libvpx/.mailmap
+++ b/source/libvpx/.mailmap
@@ -1,8 +1,18 @@
Adrian Grange <agrange@google.com>
+Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
+Hangyu Kuang <hkuang@google.com>
+Jim Bankoski <jimbankoski@google.com>
+John Koleszar <jkoleszar@google.com>
Johann Koenig <johannkoenig@google.com>
+Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+Pascal Massimino <pascal.massimino@gmail.com>
+Sami Pietilä <samipietila@google.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
+Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com>
Ralph Giles <giles@xiph.org> <giles@entropywave.com>
Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Alpha Lam <hclam@google.com> <hclam@chromium.org>
Deb Mukherjee <debargha@google.com>
+Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
diff --git a/source/libvpx/AUTHORS b/source/libvpx/AUTHORS
index 0937d5d..a9aa481 100644
--- a/source/libvpx/AUTHORS
+++ b/source/libvpx/AUTHORS
@@ -2,62 +2,97 @@
# by tools/gen_authors.sh.
Aaron Watry <awatry@gmail.com>
+Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
Adrian Grange <agrange@google.com>
+Ahmad Sharif <asharif@google.com>
+Alexander Voronov <avoronov@graphics.cs.msu.ru>
Alex Converse <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com>
+A.Mahfoodh <ab.mahfoodh@gmail.com>
+Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
+changjun.yang <changjun.yang@intel.com>
+chm <chm@rock-chips.com>
+Christian Duvivier <cduvivier@google.com>
+Daniel Kang <ddkang@google.com>
Deb Mukherjee <debargha@google.com>
+Dmitry Kovalev <dkovalev@google.com>
+Dragan Mrdjan <dmrdjan@mips.com>
+Erik Niemeyer <erik.a.niemeyer@gmail.com>
Fabio Pedretti <fabio.ped@libero.it>
Frank Galligan <fgalligan@google.com>
Fredrik Söderquist <fs@opera.com>
Fritz Koenig <frkoenig@google.com>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
Giuseppe Scrivano <gscrivano@gnu.org>
+Guillaume Martres <gmartres@google.com>
Guillermo Ballester Valor <gbvalor@gmail.com>
+Hangyu Kuang <hkuang@google.com>
Henrik Lundin <hlundin@google.com>
+Hui Su <huisu@google.com>
+Ivan Maltz <ivanmaltz@google.com>
James Berry <jamesberry@google.com>
James Zern <jzern@google.com>
Jan Kratochvil <jan.kratochvil@redhat.com>
+Janne Salonen <jsalonen@google.com>
Jeff Faust <jfaust@google.com>
Jeff Muizelaar <jmuizelaar@mozilla.com>
+Jeff Petkau <jpet@chromium.org>
Jim Bankoski <jimbankoski@google.com>
+Jingning Han <jingning@google.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
Joshua Bleecher Snyder <josh@treelinelabs.com>
+Joshua Litt <joshualitt@google.com>
Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
KO Myung-Hun <komh@chollian.net>
Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
+Mans Rullgard <mans@mansr.com>
Marco Paniconi <marpan@google.com>
+Mark Mentovai <mark@chromium.org>
Martin Ettl <ettl.martin78@googlemail.com>
+Martin Storsjo <martin@martin.st>
+Matthew Heaney <matthewjheaney@chromium.org>
Michael Kohler <michaelkohler@live.com>
+Mike Frysinger <vapier@chromium.org>
Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
+Morton Jonuschat <yabawock@gmail.com>
+Parag Salasakar <img.mips1@gmail.com>
Pascal Massimino <pascal.massimino@gmail.com>
Patrik Westin <patrik.westin@gmail.com>
Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
+Paweł Hajdan <phajdan@google.com>
Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
Rafaël Carré <funman@videolan.org>
Ralph Giles <giles@xiph.org>
+Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rbultje@google.com>
+Sami Pietilä <samipietila@google.com>
+Scott Graham <scottmg@chromium.org>
Scott LaVarnway <slavarnway@google.com>
+Shimon Doodkin <helpmepro1@gmail.com>
Stefan Holmer <holmer@google.com>
+Suman Sunkara <sunkaras@google.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
+Tamar Levy <tamar.levy@intel.com>
Tero Rintaluoma <teror@google.com>
Thijs Vermeir <thijsvermeir@gmail.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
+Vignesh Venkatasubramanian <vigneshv@google.com>
Yaowu Xu <yaowu@google.com>
Yunqing Wang <yunqingwang@google.com>
Google Inc.
diff --git a/source/libvpx/CHANGELOG b/source/libvpx/CHANGELOG
index ef64a96..97c9a7b 100644
--- a/source/libvpx/CHANGELOG
+++ b/source/libvpx/CHANGELOG
@@ -1,3 +1,53 @@
+2013-11-15 v1.3.0 "Forest"
+ This release introduces the VP9 codec in a backward-compatible way.
+ All existing users of VP8 can continue to use the library without
+ modification. However, some VP8 options do not map to VP9 in the same manner.
+
+ The VP9 encoder in this release is not feature complete. Users interested in
+ the encoder are advised to use the git master branch and discuss issues on
+ libvpx mailing lists.
+
+ - Upgrading:
+ This release is ABI and API compatible with Duclair (v1.0.0). Users
+ of older releases should refer to the Upgrading notes in this document
+ for that release.
+
+ - Enhancements:
+ Get rid of bashisms in the main build scripts
+ Added usage info on command line options
+ Add lossless compression mode
+ Dll build of libvpx
+ Add additional Mac OS X targets: 10.7, 10.8 and 10.9 (darwin11-13)
+ Add option to disable documentation
+ configure: add --enable-external-build support
+ make: support V=1 as short form of verbose=yes
+ configure: support mingw-w64
+ configure: support hardfloat armv7 CHOSTS
+ configure: add support for android x86
+ Add estimated completion time to vpxenc
+ Don't exit on decode errors in vpxenc
+ vpxenc: support scaling prior to encoding
+ vpxdec: support scaling output
+ vpxenc: improve progress indicators with --skip
+ msvs: Don't link to winmm.lib
+ Add a new script for producing vcxproj files
+ Produce Visual Studio 10 and 11 project files
+ Produce Windows Phone project files
+ msvs-build: use msbuild for vs >= 2005
+ configure: default configure log to config.log
+ Add encoding option --static-thresh
+
+ - Speed:
+ Miscellaneous speed optimizations for VP8 and VP9.
+
+ - Quality:
+ In general, quality is consistent with the Eider release.
+
+ - Bug Fixes:
+ This release represents approximately a year of engineering effort,
+ and contains multiple bug fixes. Please refer to git history for details.
+
+
2012-12-21 v1.2.0
This release acts as a checkpoint for a large amount of internal refactoring
and testing. It also contains a number of small bugfixes, so all users are
diff --git a/source/libvpx/README b/source/libvpx/README
index d7cb11a..ce9c1c6 100644
--- a/source/libvpx/README
+++ b/source/libvpx/README
@@ -64,6 +64,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv7-linux-gcc
armv7-none-rvct
armv7-win32-vs11
+ armv7-win32-vs12
mips32-linux-gcc
ppc32-darwin8-gcc
ppc32-darwin9-gcc
@@ -91,6 +92,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
x86-win32-vs9
x86-win32-vs10
x86-win32-vs11
+ x86-win32-vs12
x86_64-darwin9-gcc
x86_64-darwin10-gcc
x86_64-darwin11-gcc
@@ -104,6 +106,7 @@ COMPILING THE APPLICATIONS/LIBRARIES:
x86_64-win64-vs9
x86_64-win64-vs10
x86_64-win64-vs11
+ x86_64-win64-vs12
universal-darwin8-gcc
universal-darwin9-gcc
universal-darwin10-gcc
diff --git a/source/libvpx/build/make/ads2gas_apple.pl b/source/libvpx/build/make/ads2gas_apple.pl
index 2563e1f..befb3db 100755
--- a/source/libvpx/build/make/ads2gas_apple.pl
+++ b/source/libvpx/build/make/ads2gas_apple.pl
@@ -9,6 +9,7 @@
## be found in the AUTHORS file in the root of the source tree.
##
+
# ads2gas_apple.pl
# Author: Eric Fung (efung (at) acm.org)
#
@@ -16,6 +17,13 @@
#
# Usage: cat inputfile | perl ads2gas_apple.pl > outputfile
#
+
+my $chromium = 0;
+
+foreach my $arg (@ARGV) {
+ $chromium = 1 if ($arg eq "-chromium");
+}
+
print "@ This file was created from a .asm file\n";
print "@ using the ads2gas_apple.pl script.\n\n";
print "\t.set WIDE_REFERENCE, 0\n";
@@ -187,7 +195,7 @@ while (<STDIN>)
$trimmed =~ s/,//g;
# string to array
- @incoming_array = split(/ /, $trimmed);
+ @incoming_array = split(/\s+/, $trimmed);
print ".macro @incoming_array[0]\n";
@@ -210,12 +218,18 @@ while (<STDIN>)
s/MEND/.endm/; # No need to tell it where to stop assembling
next if /^\s*END\s*$/;
- s/qsubaddx/qsax/i;
- s/qaddsubx/qasx/i;
- s/ldrneb/ldrbne/i;
- s/ldrneh/ldrhne/i;
- s/(vqshrun\.s16 .*, \#)0$/${1}8/i;
- s/\.include/#include/;
+ # Clang used by Chromium differs slightly from clang in XCode in what it
+ # will accept in the assembly.
+ if ($chromium) {
+ s/qsubaddx/qsax/i;
+ s/qaddsubx/qasx/i;
+ s/ldrneb/ldrbne/i;
+ s/ldrneh/ldrhne/i;
+ s/(vqshrun\.s16 .*, \#)0$/${1}8/i;
+
+ # http://llvm.org/bugs/show_bug.cgi?id=16022
+ s/\.include/#include/;
+ }
print;
}
diff --git a/source/libvpx/build/make/configure.sh b/source/libvpx/build/make/configure.sh
index 83f480a..8dcb9bb 100755
--- a/source/libvpx/build/make/configure.sh
+++ b/source/libvpx/build/make/configure.sh
@@ -925,41 +925,26 @@ EOF
;;
darwin*)
- if [ -z "${sdk_path}" ]; then
- SDK_PATH=`xcode-select -print-path 2> /dev/null`
- SDK_PATH=${SDK_PATH}/Platforms/iPhoneOS.platform/Developer
- else
- SDK_PATH=${sdk_path}
- fi
- TOOLCHAIN_PATH=${SDK_PATH}/usr/bin
- CXX=${TOOLCHAIN_PATH}/g++
- CC=${TOOLCHAIN_PATH}/gcc
- AR=${TOOLCHAIN_PATH}/ar
- LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-llvm-gcc-4.2
- AS=${TOOLCHAIN_PATH}/as
- STRIP=${TOOLCHAIN_PATH}/strip
- NM=${TOOLCHAIN_PATH}/nm
+
+ XCRUN_FIND="xcrun --sdk iphoneos -find"
+ CXX="$(${XCRUN_FIND} clang++)"
+ CC="$(${XCRUN_FIND} clang)"
+ AR="$(${XCRUN_FIND} ar)"
+ LD="$(${XCRUN_FIND} ld)"
+ AS="$(${XCRUN_FIND} as)"
+ STRIP="$(${XCRUN_FIND} strip)"
+ NM="$(${XCRUN_FIND} nm)"
+ RANLIB="$(${XCRUN_FIND} ranlib)"
AS_SFX=.s
# ASFLAGS is written here instead of using check_add_asflags
# because we need to overwrite all of ASFLAGS and purge the
# options that were put in above
- ASFLAGS="-version -arch ${tgt_isa} -g"
-
- add_cflags -arch ${tgt_isa}
- add_ldflags -arch_only ${tgt_isa}
-
- if [ -z "${alt_libc}" ]; then
- alt_libc=${SDK_PATH}/SDKs/iPhoneOS6.0.sdk
- fi
-
- add_cflags "-isysroot ${alt_libc}"
+ ASFLAGS="-arch ${tgt_isa} -g"
- # Add the paths for the alternate libc
- for d in usr/include; do
- try_dir="${alt_libc}/${d}"
- [ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
- done
+ alt_libc="$(xcrun --sdk iphoneos --show-sdk-path)"
+ add_cflags -arch ${tgt_isa} -isysroot ${alt_libc}
+ add_ldflags -arch ${tgt_isa} -ios_version_min 7.0
for d in lib usr/lib usr/lib/system; do
try_dir="${alt_libc}/${d}"
@@ -1091,6 +1076,15 @@ EOF
# Skip the check by setting AS arbitrarily
AS=msvs
msvs_arch_dir=x86-msvs
+ vc_version=${tgt_cc##vs}
+ case $vc_version in
+ 7|8|9|10)
+ echo "${tgt_cc} does not support avx/avx2, disabling....."
+ RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 "
+ soft_disable avx
+ soft_disable avx2
+ ;;
+ esac
;;
esac
diff --git a/source/libvpx/build/make/gen_msvs_sln.sh b/source/libvpx/build/make/gen_msvs_sln.sh
index 0c269b1..ffa3706 100755
--- a/source/libvpx/build/make/gen_msvs_sln.sh
+++ b/source/libvpx/build/make/gen_msvs_sln.sh
@@ -255,7 +255,7 @@ for opt in "$@"; do
;;
--ver=*) vs_ver="$optval"
case $optval in
- [789]|10|11)
+ [789]|10|11|12)
;;
*) die Unrecognized Visual Studio Version in $opt
;;
@@ -297,12 +297,15 @@ case "${vs_ver:-8}" in
11) sln_vers="12.00"
sln_vers_str="Visual Studio 2012"
;;
+ 12) sln_vers="12.00"
+ sln_vers_str="Visual Studio 2013"
+ ;;
esac
case "${vs_ver:-8}" in
[789])
sfx=vcproj
;;
- 10|11)
+ 10|11|12)
sfx=vcxproj
;;
esac
diff --git a/source/libvpx/build/make/gen_msvs_vcxproj.sh b/source/libvpx/build/make/gen_msvs_vcxproj.sh
index 4875915..359157c 100755
--- a/source/libvpx/build/make/gen_msvs_vcxproj.sh
+++ b/source/libvpx/build/make/gen_msvs_vcxproj.sh
@@ -33,7 +33,7 @@ Options:
--name=project_name Name of the project (required)
--proj-guid=GUID GUID to use for the project
--module-def=filename File containing export definitions (for DLLs)
- --ver=version Version (10,11) of visual studio to generate for
+ --ver=version Version (10,11,12) of visual studio to generate for
--src-path-bare=dir Path to root of source tree
-Ipath/to/include Additional include directories
-DFLAG[=value] Preprocessor macros to define
@@ -228,7 +228,7 @@ for opt in "$@"; do
--ver=*)
vs_ver="$optval"
case "$optval" in
- 10|11)
+ 10|11|12)
;;
*) die Unrecognized Visual Studio Version in $opt
;;
@@ -269,7 +269,7 @@ guid=${guid:-`generate_uuid`}
asm_use_custom_step=false
uses_asm=${uses_asm:-false}
case "${vs_ver:-11}" in
- 10|11)
+ 10|11|12)
asm_use_custom_step=$uses_asm
;;
esac
@@ -383,6 +383,20 @@ generate_vcxproj() {
tag_content PlatformToolset v110
fi
fi
+ if [ "$vs_ver" = "12" ]; then
+ if [ "$plat" = "ARM" ]; then
+ # Setting the wp80 toolchain automatically sets the
+ # WINAPI_FAMILY define, which is required for building
+ # code for arm with the windows headers. Alternatively,
+ # one could add AppContainerApplication=true in the Globals
+ # section and add PrecompiledHeader=NotUsing and
+ # CompileAsWinRT=false in ClCompile and SubSystem=Console
+ # in Link.
+ tag_content PlatformToolset v120_wp80
+ else
+ tag_content PlatformToolset v120
+ fi
+ fi
tag_content CharacterSet Unicode
if [ "$config" = "Release" ]; then
tag_content WholeProgramOptimization true
diff --git a/source/libvpx/build/make/obj_int_extract.c b/source/libvpx/build/make/obj_int_extract.c
index feed9d9..495e9d7 100644
--- a/source/libvpx/build/make/obj_int_extract.c
+++ b/source/libvpx/build/make/obj_int_extract.c
@@ -321,7 +321,7 @@ bail:
return 1;
}
-char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) {
+const char *parse_elf_string_table(elf_obj_t *elf, int s_idx, int idx) {
if (elf->bits == 32) {
Elf32_Shdr shdr;
diff --git a/source/libvpx/build/make/thumb.pm b/source/libvpx/build/make/thumb.pm
index e1f34c1..d8d04aa 100644
--- a/source/libvpx/build/make/thumb.pm
+++ b/source/libvpx/build/make/thumb.pm
@@ -24,7 +24,7 @@ sub FixThumbInstructions($$)
# with left shift, addition and a right shift (to restore the
# register to the original value). Currently the right shift
# isn't necessary in the code base since the values in these
- # registers aren't used, but doing the shift for consitency.
+ # registers aren't used, but doing the shift for consistency.
# This converts instructions such as "add r12, r12, r5, lsl r4"
# into the sequence "lsl r5, r4", "add r12, r12, r5", "lsr r5, r4".
s/^(\s*)(add)(\s+)(r\d+),\s*(r\d+),\s*(r\d+),\s*lsl (r\d+)/$1lsl$3$6, $7\n$1$2$3$4, $5, $6\n$1lsr$3$6, $7/g;
diff --git a/source/libvpx/configure b/source/libvpx/configure
index 621161c..f9454ba 100755
--- a/source/libvpx/configure
+++ b/source/libvpx/configure
@@ -24,9 +24,10 @@ Advanced options:
${toggle_examples} examples
${toggle_docs} documentation
${toggle_unit_tests} unit tests
+ ${toggle_decode_perf_tests} build decoder perf tests with unit tests
--libc=PATH path to alternate libc
--as={yasm|nasm|auto} use specified assembler [auto, yasm preferred]
- --sdk-path=PATH path to root of sdk (iOS, android builds only)
+ --sdk-path=PATH path to root of sdk (android builds only)
${toggle_fast_unaligned} don't use unaligned accesses, even when
supported by hardware [auto]
${toggle_codec_srcs} in/exclude codec library source code
@@ -100,6 +101,7 @@ all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8
all_platforms="${all_platforms} armv7-linux-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8
all_platforms="${all_platforms} armv7-win32-vs11"
+all_platforms="${all_platforms} armv7-win32-vs12"
all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} ppc32-darwin8-gcc"
all_platforms="${all_platforms} ppc32-darwin9-gcc"
@@ -127,6 +129,7 @@ all_platforms="${all_platforms} x86-win32-vs8"
all_platforms="${all_platforms} x86-win32-vs9"
all_platforms="${all_platforms} x86-win32-vs10"
all_platforms="${all_platforms} x86-win32-vs11"
+all_platforms="${all_platforms} x86-win32-vs12"
all_platforms="${all_platforms} x86_64-darwin9-gcc"
all_platforms="${all_platforms} x86_64-darwin10-gcc"
all_platforms="${all_platforms} x86_64-darwin11-gcc"
@@ -140,6 +143,7 @@ all_platforms="${all_platforms} x86_64-win64-vs8"
all_platforms="${all_platforms} x86_64-win64-vs9"
all_platforms="${all_platforms} x86_64-win64-vs10"
all_platforms="${all_platforms} x86_64-win64-vs11"
+all_platforms="${all_platforms} x86_64-win64-vs12"
all_platforms="${all_platforms} universal-darwin8-gcc"
all_platforms="${all_platforms} universal-darwin9-gcc"
all_platforms="${all_platforms} universal-darwin10-gcc"
@@ -249,7 +253,6 @@ HAVE_LIST="
unistd_h
"
EXPERIMENT_LIST="
- oneshotq
multiple_arf
non420
alpha
@@ -300,6 +303,7 @@ CONFIG_LIST="
postproc_visualizer
os_support
unit_tests
+ decode_perf_tests
multi_res_encoding
temporal_denoising
experimental
@@ -353,6 +357,7 @@ CMDLINE_SELECT="
small
postproc_visualizer
unit_tests
+ decode_perf_tests
multi_res_encoding
temporal_denoising
experimental
@@ -671,7 +676,7 @@ process_toolchain() {
VCPROJ_SFX=vcproj
gen_vcproj_cmd=${source_path}/build/make/gen_msvs_proj.sh
;;
- 10|11)
+ 10|11|12)
VCPROJ_SFX=vcxproj
gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
;;
diff --git a/source/libvpx/examples.mk b/source/libvpx/examples.mk
index 88327fe..fe36c4f 100644
--- a/source/libvpx/examples.mk
+++ b/source/libvpx/examples.mk
@@ -23,7 +23,9 @@ vpxdec.SRCS += md5_utils.c md5_utils.h
vpxdec.SRCS += vpx_ports/vpx_timer.h
vpxdec.SRCS += vpx/vpx_integer.h
vpxdec.SRCS += args.c args.h
+vpxdec.SRCS += ivfdec.c ivfdec.h
vpxdec.SRCS += tools_common.c tools_common.h
+vpxdec.SRCS += webmdec.c webmdec.h
vpxdec.SRCS += nestegg/halloc/halloc.h
vpxdec.SRCS += nestegg/halloc/src/align.h
vpxdec.SRCS += nestegg/halloc/src/halloc.c
@@ -35,11 +37,16 @@ vpxdec.SRCS += $(LIBYUV_SRCS)
vpxdec.GUID = BA5FE66F-38DD-E034-F542-B1578C5FB950
vpxdec.DESCRIPTION = Full featured decoder
UTILS-$(CONFIG_ENCODERS) += vpxenc.c
-vpxenc.SRCS += args.c args.h y4minput.c y4minput.h
+vpxenc.SRCS += args.c args.h y4minput.c y4minput.h vpxenc.h
+vpxenc.SRCS += ivfdec.c ivfdec.h
+vpxenc.SRCS += ivfenc.c ivfenc.h
vpxenc.SRCS += tools_common.c tools_common.h
+vpxenc.SRCS += warnings.c warnings.h
+vpxenc.SRCS += webmenc.c webmenc.h
vpxenc.SRCS += vpx_ports/mem_ops.h
vpxenc.SRCS += vpx_ports/mem_ops_aligned.h
vpxenc.SRCS += vpx_ports/vpx_timer.h
+vpxenc.SRCS += vpxstats.c vpxstats.h
vpxenc.SRCS += third_party/libmkv/EbmlIDs.h
vpxenc.SRCS += third_party/libmkv/EbmlWriter.c
vpxenc.SRCS += third_party/libmkv/EbmlWriter.h
@@ -50,18 +57,12 @@ UTILS-$(CONFIG_VP8_ENCODER) += vp8_scalable_patterns.c
vp8_scalable_patterns.GUID = 0D6A210B-F482-4D6F-8570-4A9C01ACC88C
vp8_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder
UTILS-$(CONFIG_VP9_ENCODER) += vp9_spatial_scalable_encoder.c
+vp9_spatial_scalable_encoder.SRCS += args.c args.h
+vp9_spatial_scalable_encoder.SRCS += ivfenc.c ivfenc.h
+vp9_spatial_scalable_encoder.SRCS += tools_common.c tools_common.h
vp9_spatial_scalable_encoder.GUID = 4A38598D-627D-4505-9C7B-D4020C84100D
vp9_spatial_scalable_encoder.DESCRIPTION = Spatial Scalable Encoder
-# Clean up old ivfenc, ivfdec binaries.
-ifeq ($(CONFIG_MSVS),yes)
-CLEAN-OBJS += $(foreach p,$(VS_PLATFORMS),$(p)/Release/ivfenc.exe)
-CLEAN-OBJS += $(foreach p,$(VS_PLATFORMS),$(p)/Release/ivfdec.exe)
-else
-CLEAN-OBJS += ivfenc{.c.o,.c.d,.dox,.exe,}
-CLEAN-OBJS += ivfdec{.c.o,.c.d,.dox,.exe,}
-endif
-
# XMA example disabled for now, not used in VP8
#UTILS-$(CONFIG_DECODERS) += example_xma.c
#example_xma.GUID = A955FC4A-73F1-44F7-135E-30D84D32F022
diff --git a/source/libvpx/ivfdec.c b/source/libvpx/ivfdec.c
new file mode 100644
index 0000000..4a0816f
--- /dev/null
+++ b/source/libvpx/ivfdec.c
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./ivfdec.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+int file_is_ivf(struct VpxInputContext *input_ctx) {
+ char raw_hdr[32];
+ int is_ivf = 0;
+
+ // TODO(tomfinegan): This can eventually go away, but for now it's required
+ // because the means by which file types are detected differ in vpxdec and
+ // vpxenc.
+ rewind(input_ctx->file);
+
+ if (fread(raw_hdr, 1, 32, input_ctx->file) == 32) {
+ if (raw_hdr[0] == 'D' && raw_hdr[1] == 'K' &&
+ raw_hdr[2] == 'I' && raw_hdr[3] == 'F') {
+ is_ivf = 1;
+
+ if (mem_get_le16(raw_hdr + 4) != 0) {
+ fprintf(stderr, "Error: Unrecognized IVF version! This file may not"
+ " decode properly.");
+ }
+
+ input_ctx->fourcc = mem_get_le32(raw_hdr + 8);
+ input_ctx->width = mem_get_le16(raw_hdr + 12);
+ input_ctx->height = mem_get_le16(raw_hdr + 14);
+ input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16);
+ input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20);
+
+ /* Some versions of vpxenc used 1/(2*fps) for the timebase, so
+ * we can guess the framerate using only the timebase in this
+ * case. Other files would require reading ahead to guess the
+ * timebase, like we do for webm.
+ */
+ if (input_ctx->framerate.numerator < 1000) {
+ /* Correct for the factor of 2 applied to the timebase in the
+ * encoder.
+ */
+ if (input_ctx->framerate.numerator & 1)
+ input_ctx->framerate.denominator <<= 1;
+ else
+ input_ctx->framerate.numerator >>= 1;
+ } else {
+ /* Don't know FPS for sure, and don't have readahead code
+ * (yet?), so just default to 30fps.
+ */
+ input_ctx->framerate.numerator = 30;
+ input_ctx->framerate.denominator = 1;
+ }
+ }
+ }
+
+ if (!is_ivf) {
+ rewind(input_ctx->file);
+ input_ctx->detect.buf_read = 0;
+ } else {
+ input_ctx->detect.position = 4;
+ }
+ return is_ivf;
+}
+
+int ivf_read_frame(struct VpxInputContext *input_ctx,
+ uint8_t **buffer,
+ size_t *bytes_read,
+ size_t *buffer_size) {
+ char raw_header[IVF_FRAME_HDR_SZ] = {0};
+ size_t frame_size = 0;
+ FILE *infile = input_ctx->file;
+
+ if (input_ctx->file_type != FILE_TYPE_IVF)
+ return 0;
+
+ if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
+ if (!feof(infile))
+ warn("Failed to read frame size\n");
+ } else {
+ frame_size = mem_get_le32(raw_header);
+
+ if (frame_size > 256 * 1024 * 1024) {
+ warn("Read invalid frame size (%u)\n", (unsigned int)frame_size);
+ frame_size = 0;
+ }
+
+ if (frame_size > *buffer_size) {
+ uint8_t *new_buffer = realloc(*buffer, 2 * frame_size);
+
+ if (new_buffer) {
+ *buffer = new_buffer;
+ *buffer_size = 2 * frame_size;
+ } else {
+ warn("Failed to allocate compressed data buffer\n");
+ frame_size = 0;
+ }
+ }
+ }
+
+ if (!feof(infile)) {
+ if (fread(*buffer, 1, frame_size, infile) != frame_size) {
+ warn("Failed to read full frame\n");
+ return 1;
+ }
+
+ *bytes_read = frame_size;
+ return 0;
+ }
+
+ return 1;
+}
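
A note on the header parsed by file_is_ivf() above: it is the fixed 32-byte "DKIF" layout that ivf_write_file_header() (added later in this change, in ivfenc.c) emits. For the timebase heuristic in that function, a file written by an old vpxenc at 30 fps carries rate=60, scale=1; 60 is below 1000 and even, so the reader halves it back to 30/1. The sketch below is illustrative only: the read_le16/read_le32 helpers stand in for mem_get_le16/mem_get_le32 from vpx_ports/mem_ops.h, and the struct name is not a libvpx type.

#include <stdint.h>

// Hypothetical container for the parsed fields; not a libvpx type.
struct IvfFileHeader {
  uint32_t fourcc;
  uint32_t width, height;
  uint32_t timebase_den, timebase_num;  // "rate" / "scale" in the writer
  uint32_t frame_count;
};

static uint32_t read_le16(const uint8_t *p) { return p[0] | (p[1] << 8); }
static uint32_t read_le32(const uint8_t *p) {
  return p[0] | (p[1] << 8) | (p[2] << 16) | ((uint32_t)p[3] << 24);
}

// Layout written by ivf_write_file_header():
//   0-3  "DKIF"      4-5   version (0)   6-7   header size (32)
//   8-11 fourcc      12-13 width         14-15 height
//   16-19 rate       20-23 scale         24-27 frame count    28-31 unused
static bool parse_ivf_file_header(const uint8_t hdr[32], IvfFileHeader *out) {
  if (hdr[0] != 'D' || hdr[1] != 'K' || hdr[2] != 'I' || hdr[3] != 'F')
    return false;
  // file_is_ivf() only warns on a non-zero version; this sketch ignores it.
  out->fourcc = read_le32(hdr + 8);
  out->width = read_le16(hdr + 12);
  out->height = read_le16(hdr + 14);
  out->timebase_den = read_le32(hdr + 16);
  out->timebase_num = read_le32(hdr + 20);
  out->frame_count = read_le32(hdr + 24);
  return true;
}
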
diff --git a/source/libvpx/ivfdec.h b/source/libvpx/ivfdec.h
new file mode 100644
index 0000000..b1468a9
--- /dev/null
+++ b/source/libvpx/ivfdec.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef IVFDEC_H_
+#define IVFDEC_H_
+
+#include "./tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int file_is_ivf(struct VpxInputContext *input);
+
+int ivf_read_frame(struct VpxInputContext *input,
+ uint8_t **buffer,
+ size_t *bytes_read,
+ size_t *buffer_size);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* IVFDEC_H_ */
diff --git a/source/libvpx/ivfenc.c b/source/libvpx/ivfenc.c
new file mode 100644
index 0000000..fa92566
--- /dev/null
+++ b/source/libvpx/ivfenc.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./ivfenc.h"
+
+#include "./tools_common.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_ports/mem_ops.h"
+
+void ivf_write_file_header(FILE *outfile,
+ const struct vpx_codec_enc_cfg *cfg,
+ unsigned int fourcc,
+ int frame_cnt) {
+ char header[32];
+
+ if (cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
+ return;
+
+ header[0] = 'D';
+ header[1] = 'K';
+ header[2] = 'I';
+ header[3] = 'F';
+ mem_put_le16(header + 4, 0); /* version */
+ mem_put_le16(header + 6, 32); /* headersize */
+ mem_put_le32(header + 8, fourcc); /* four CC */
+ mem_put_le16(header + 12, cfg->g_w); /* width */
+ mem_put_le16(header + 14, cfg->g_h); /* height */
+ mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
+ mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
+ mem_put_le32(header + 24, frame_cnt); /* length */
+ mem_put_le32(header + 28, 0); /* unused */
+
+ (void) fwrite(header, 1, 32, outfile);
+}
+
+void ivf_write_frame_header(FILE *outfile, const struct vpx_codec_cx_pkt *pkt) {
+ char header[12];
+ vpx_codec_pts_t pts;
+
+ if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
+ return;
+
+ pts = pkt->data.frame.pts;
+ mem_put_le32(header, (int)pkt->data.frame.sz);
+ mem_put_le32(header + 4, pts & 0xFFFFFFFF);
+ mem_put_le32(header + 8, pts >> 32);
+
+ (void) fwrite(header, 1, 12, outfile);
+}
+
+void ivf_write_frame_size(FILE *outfile, size_t size) {
+ char header[4];
+ mem_put_le32(header, (int)size);
+ (void) fwrite(header, 1, 4, outfile);
+}
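
ivf_write_frame_header() above stores each frame's 64-bit pts as two little-endian 32-bit words following the 32-bit frame size. ivf_read_frame() in this change only consumes the size field, so the reassembly below is purely illustrative (self-contained helper, names not from libvpx).

#include <stddef.h>
#include <stdint.h>

static uint32_t frame_hdr_le32(const uint8_t *p) {
  return p[0] | (p[1] << 8) | (p[2] << 16) | ((uint32_t)p[3] << 24);
}

// 12-byte IVF frame header: bytes 0-3 frame size, 4-7 pts low word,
// 8-11 pts high word.
static void parse_ivf_frame_header(const uint8_t hdr[12],
                                   size_t *frame_size, uint64_t *pts) {
  *frame_size = frame_hdr_le32(hdr);
  *pts = frame_hdr_le32(hdr + 4) | ((uint64_t)frame_hdr_le32(hdr + 8) << 32);
}
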
diff --git a/source/libvpx/ivfenc.h b/source/libvpx/ivfenc.h
new file mode 100644
index 0000000..a332c7d
--- /dev/null
+++ b/source/libvpx/ivfenc.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef IVFENC_H_
+#define IVFENC_H_
+
+#include "./tools_common.h"
+
+struct vpx_codec_enc_cfg;
+struct vpx_codec_cx_pkt;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ivf_write_file_header(FILE *outfile,
+ const struct vpx_codec_enc_cfg *cfg,
+ uint32_t fourcc,
+ int frame_cnt);
+void ivf_write_frame_header(FILE *outfile, const struct vpx_codec_cx_pkt *pkt);
+void ivf_write_frame_size(FILE *outfile, size_t size);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* IVFENC_H_ */
diff --git a/source/libvpx/libs.mk b/source/libvpx/libs.mk
index 4691a12..8340eee 100644
--- a/source/libvpx/libs.mk
+++ b/source/libvpx/libs.mk
@@ -122,6 +122,7 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
CODEC_EXPORTS-yes += $(addprefix $(VP9_PREFIX),$(VP9_CX_EXPORTS))
CODEC_SRCS-yes += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
INSTALL-LIBS-yes += include/vpx/vp8.h include/vpx/vp8cx.h
+ INSTALL-LIBS-yes += include/vpx/svc_context.h
INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h
CODEC_DOC_SECTIONS += vp9 vp9_encoder
@@ -182,6 +183,7 @@ CODEC_EXPORTS-$(CONFIG_DECODERS) += vpx/exports_dec
INSTALL-LIBS-yes += include/vpx/vpx_codec.h
INSTALL-LIBS-yes += include/vpx/vpx_image.h
+INSTALL-LIBS-yes += include/vpx/vpx_external_frame_buffer.h
INSTALL-LIBS-yes += include/vpx/vpx_integer.h
INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h
INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h
diff --git a/source/libvpx/test/android/Android.mk b/source/libvpx/test/android/Android.mk
new file mode 100644
index 0000000..13af601
--- /dev/null
+++ b/source/libvpx/test/android/Android.mk
@@ -0,0 +1,42 @@
+# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+#
+# This make file builds vpx_test app for android.
+# The test app itself runs on the command line through adb shell
+# The paths are really messed up as the libvpx make file
+# expects to be made from a parent directory.
+CUR_WD := $(call my-dir)
+BINDINGS_DIR := $(CUR_WD)/../../..
+LOCAL_PATH := $(CUR_WD)/../../..
+
+#libvpx
+include $(CLEAR_VARS)
+include $(BINDINGS_DIR)/libvpx/build/make/Android.mk
+LOCAL_PATH := $(CUR_WD)/../..
+
+#libgtest
+include $(CLEAR_VARS)
+LOCAL_ARM_MODE := arm
+LOCAL_CPP_EXTENSION := .cc
+LOCAL_MODULE := gtest
+LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/
+LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/
+LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc
+include $(BUILD_STATIC_LIBRARY)
+
+#libvpx_test
+include $(CLEAR_VARS)
+LOCAL_ARM_MODE := arm
+LOCAL_MODULE := libvpx_test
+LOCAL_STATIC_LIBRARIES := gtest
+LOCAL_SHARED_LIBRARIES := vpx
+include $(LOCAL_PATH)/test/test.mk
+LOCAL_C_INCLUDES := $(BINDINGS_DIR)
+FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))
+LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
+include $(BUILD_EXECUTABLE)
diff --git a/source/libvpx/test/android/README b/source/libvpx/test/android/README
new file mode 100644
index 0000000..6840d91
--- /dev/null
+++ b/source/libvpx/test/android/README
@@ -0,0 +1,32 @@
+Android.mk will build vpx unittests on android.
+1) Configure libvpx from the parent directory:
+./libvpx/configure --target=armv7-android-gcc --enable-external-build \
+ --enable-postproc --disable-install-srcs --enable-multi-res-encoding \
+ --enable-temporal-denoising --disable-unit-tests --disable-install-docs \
+ --disable-examples --disable-runtime-cpu-detect --sdk=$NDK
+
+2) From the parent directory, invoke ndk-build:
+NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk \
+ APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release \
+ APP_STL=gnustl_static
+
+Note: Both adb and ndk-build are available prebuilt at:
+ https://chromium.googlesource.com/android_tools
+
+3) Run get_files.py to download the test files:
+python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files \
+ -u http://downloads.webmproject.org/test_data/libvpx
+
+4) Transfer files to device using adb. Ensure you have proper permissions for
+the target
+
+adb push /path/to/test_files /data/local/tmp
+adb push /path/to/built_libs /data/local/tmp
+
+NOTE: Built_libs defaults to parent_dir/libs/armeabi-v7a
+
+5) Run tests:
+adb shell
+(on device)
+cd /data/local/tmp
+LD_LIBRARY_PATH=. ./vpx_test
diff --git a/source/libvpx/test/android/get_files.py b/source/libvpx/test/android/get_files.py
new file mode 100644
index 0000000..1c69740
--- /dev/null
+++ b/source/libvpx/test/android/get_files.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+#
+# Use of this source code is governed by a BSD-style license
+# that can be found in the LICENSE file in the root of the source
+# tree. An additional intellectual property rights grant can be found
+# in the file PATENTS. All contributing project authors may
+# be found in the AUTHORS file in the root of the source tree.
+#
+# This simple script pulls test files from the webm homepage
+# It is intelligent enough to only pull files if
+# 1) File / test_data folder does not exist
+# 2) SHA mismatch
+
+import pycurl
+import csv
+import hashlib
+import re
+import os.path
+import time
+import itertools
+import sys
+import getopt
+
+#globals
+url = ''
+file_list_path = ''
+local_resource_path = ''
+
+# Helper functions:
+# A simple function which returns the sha hash of a file in hex
+def get_file_sha(filename):
+ try:
+ sha_hash = hashlib.sha1()
+ with open(filename, 'rb') as file:
+ buf = file.read(HASH_CHUNK)
+ while len(buf) > 0:
+ sha_hash.update(buf)
+ buf = file.read(HASH_CHUNK)
+ return sha_hash.hexdigest()
+ except IOError:
+ print "Error reading " + filename
+
+# Downloads a file from a url, and then checks the sha against the passed
+# in sha
+def download_and_check_sha(url, filename, sha):
+ path = os.path.join(local_resource_path, filename)
+ fp = open(path, "wb")
+ curl = pycurl.Curl()
+ curl.setopt(pycurl.URL, url + "/" + filename)
+ curl.setopt(pycurl.WRITEDATA, fp)
+ curl.perform()
+ curl.close()
+ fp.close()
+ return get_file_sha(path) == sha
+
+#constants
+ftp_retries = 3
+
+SHA_COL = 0
+NAME_COL = 1
+EXPECTED_COL = 2
+HASH_CHUNK = 65536
+
+# Main script
+try:
+ opts, args = \
+ getopt.getopt(sys.argv[1:], \
+ "u:i:o:", ["url=", "input_csv=", "output_dir="])
+except:
+ print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
+ sys.exit(2)
+
+for opt, arg in opts:
+ if opt == '-u':
+ url = arg
+ elif opt in ("-i", "--input_csv"):
+ file_list_path = os.path.join(arg)
+ elif opt in ("-o", "--output_dir"):
+ local_resource_path = os.path.join(arg)
+
+if len(sys.argv) != 7:
+ print "Expects two paths and a url!"
+ exit(1)
+
+if not os.path.isdir(local_resource_path):
+ os.makedirs(local_resource_path)
+
+file_list_csv = open(file_list_path, "rb")
+
+# Our 'csv' file uses multiple spaces as a delimiter, python's
+# csv class only uses single character delimiters, so we convert them below
+file_list_reader = csv.reader((re.sub(' +', ' ', line) \
+ for line in file_list_csv), delimiter = ' ')
+
+file_shas = []
+file_names = []
+
+for row in file_list_reader:
+ if len(row) != EXPECTED_COL:
+ continue
+ file_shas.append(row[SHA_COL])
+ file_names.append(row[NAME_COL])
+
+file_list_csv.close()
+
+# Download files, only if they don't already exist and have correct shas
+for filename, sha in itertools.izip(file_names, file_shas):
+ path = os.path.join(local_resource_path, filename)
+ if os.path.isfile(path) \
+ and get_file_sha(path) == sha:
+ print path + ' exists, skipping'
+ continue
+ for retry in range(0, ftp_retries):
+ print "Downloading " + path
+ if not download_and_check_sha(url, filename, sha):
+ print "Sha does not match, retrying..."
+ else:
+ break
diff --git a/source/libvpx/test/borders_test.cc b/source/libvpx/test/borders_test.cc
index dcdedcf..5071541 100644
--- a/source/libvpx/test/borders_test.cc
+++ b/source/libvpx/test/borders_test.cc
@@ -67,7 +67,7 @@ TEST_P(BordersTest, TestLowBitrate) {
cfg_.g_lag_in_frames = 25;
cfg_.rc_2pass_vbr_minsection_pct = 5;
- cfg_.rc_2pass_vbr_minsection_pct = 2000;
+ cfg_.rc_2pass_vbr_maxsection_pct = 2000;
cfg_.rc_target_bitrate = 200;
cfg_.rc_min_quantizer = 40;
diff --git a/source/libvpx/test/codec_factory.h b/source/libvpx/test/codec_factory.h
index cc7b53f..2ca6ff0 100644
--- a/source/libvpx/test/codec_factory.h
+++ b/source/libvpx/test/codec_factory.h
@@ -26,6 +26,8 @@ extern "C" {
#include "test/encode_test_driver.h"
namespace libvpx_test {
+const int kCodecFactoryParam = 0;
+
class CodecFactory {
public:
CodecFactory() {}
diff --git a/source/libvpx/test/convolve_test.cc b/source/libvpx/test/convolve_test.cc
index abeb4bd..9ab60b1 100644
--- a/source/libvpx/test/convolve_test.cc
+++ b/source/libvpx/test/convolve_test.cc
@@ -44,6 +44,8 @@ struct ConvolveFunctions {
convolve_fn_t hv8_avg_;
};
+typedef std::tr1::tuple<int, int, const ConvolveFunctions*> convolve_param_t;
+
// Reference 8-tap subpixel filter, slightly modified to fit into this test.
#define VP9_FILTER_WEIGHT 128
#define VP9_FILTER_SHIFT 7
@@ -169,7 +171,7 @@ void filter_average_block2d_8_c(const uint8_t *src_ptr,
output_width, output_height);
}
-class ConvolveTest : public PARAMS(int, int, const ConvolveFunctions*) {
+class ConvolveTest : public ::testing::TestWithParam<convolve_param_t> {
public:
static void SetUpTestCase() {
// Force input_ to be unaligned, output to be 16 byte aligned.
diff --git a/source/libvpx/test/datarate_test.cc b/source/libvpx/test/datarate_test.cc
index 85f4bb6..2d46522 100644
--- a/source/libvpx/test/datarate_test.cc
+++ b/source/libvpx/test/datarate_test.cc
@@ -248,9 +248,11 @@ TEST_P(DatarateTestVP9, BasicRateTargeting) {
cfg_.rc_target_bitrate = i;
ResetModel();
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.8)
+ ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 0.85)
<< " The datarate for the file exceeds the target by too much!";
- ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_ * 1.3)
+ ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate),
+ effective_datarate_ * 1.15)
<< " The datarate for the file missed the target!";
}
}
diff --git a/source/libvpx/test/dct16x16_test.cc b/source/libvpx/test/dct16x16_test.cc
index b61df8d..5496d0b 100644
--- a/source/libvpx/test/dct16x16_test.cc
+++ b/source/libvpx/test/dct16x16_test.cc
@@ -264,6 +264,9 @@ typedef void (*fht_t) (const int16_t *in, int16_t *out, int stride,
typedef void (*iht_t) (const int16_t *in, uint8_t *out, int stride,
int tx_type);
+typedef std::tr1::tuple<fdct_t, idct_t, int> dct_16x16_param_t;
+typedef std::tr1::tuple<fht_t, iht_t, int> ht_16x16_param_t;
+
void fdct16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_fdct16x16_c(in, out, stride);
}
@@ -412,8 +415,9 @@ class Trans16x16TestBase {
fht_t fwd_txfm_ref;
};
-class Trans16x16DCT : public Trans16x16TestBase,
- public PARAMS(fdct_t, idct_t, int) {
+class Trans16x16DCT
+ : public Trans16x16TestBase,
+ public ::testing::TestWithParam<dct_16x16_param_t> {
public:
virtual ~Trans16x16DCT() {}
@@ -454,8 +458,9 @@ TEST_P(Trans16x16DCT, InvAccuracyCheck) {
RunInvAccuracyCheck();
}
-class Trans16x16HT : public Trans16x16TestBase,
- public PARAMS(fht_t, iht_t, int) {
+class Trans16x16HT
+ : public Trans16x16TestBase,
+ public ::testing::TestWithParam<ht_16x16_param_t> {
public:
virtual ~Trans16x16HT() {}
diff --git a/source/libvpx/test/dct32x32_test.cc b/source/libvpx/test/dct32x32_test.cc
index 1e792da..a1e472a 100644
--- a/source/libvpx/test/dct32x32_test.cc
+++ b/source/libvpx/test/dct32x32_test.cc
@@ -77,7 +77,9 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
typedef void (*fwd_txfm_t)(const int16_t *in, int16_t *out, int stride);
typedef void (*inv_txfm_t)(const int16_t *in, uint8_t *out, int stride);
-class Trans32x32Test : public PARAMS(fwd_txfm_t, inv_txfm_t, int) {
+typedef std::tr1::tuple<fwd_txfm_t, inv_txfm_t, int> trans_32x32_param_t;
+
+class Trans32x32Test : public ::testing::TestWithParam<trans_32x32_param_t> {
public:
virtual ~Trans32x32Test() {}
virtual void SetUp() {
@@ -258,4 +260,14 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fdct32x32_rd_sse2,
&vp9_idct32x32_1024_add_sse2, 1)));
#endif
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(
+ AVX2, Trans32x32Test,
+ ::testing::Values(
+ make_tuple(&vp9_fdct32x32_avx2,
+ &vp9_idct32x32_1024_add_sse2, 0),
+ make_tuple(&vp9_fdct32x32_rd_avx2,
+ &vp9_idct32x32_1024_add_sse2, 1)));
+#endif
} // namespace
diff --git a/source/libvpx/test/decode_perf_test.cc b/source/libvpx/test/decode_perf_test.cc
new file mode 100644
index 0000000..95600db
--- /dev/null
+++ b/source/libvpx/test/decode_perf_test.cc
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+#include "vpx_ports/vpx_timer.h"
+#include "./vpx_version.h"
+
+using std::tr1::make_tuple;
+
+namespace {
+
+#define VIDEO_NAME 0
+#define THREADS 1
+
+const double kUsecsInSec = 1000000.0;
+
+/*
+ DecodePerfTest takes a tuple of filename + number of threads to decode with
+ */
+typedef std::tr1::tuple<const char *const, unsigned> decode_perf_param_t;
+
+const decode_perf_param_t kVP9DecodePerfVectors[] = {
+ make_tuple("vp90-2-bbb_426x240_tile_1x1_180kbps.webm", 1),
+ make_tuple("vp90-2-bbb_640x360_tile_1x2_337kbps.webm", 2),
+ make_tuple("vp90-2-bbb_854x480_tile_1x2_651kbps.webm", 2),
+ make_tuple("vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm", 4),
+ make_tuple("vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm", 1),
+ make_tuple("vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm", 4),
+ make_tuple("vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm", 4),
+ make_tuple("vp90-2-sintel_426x182_tile_1x1_171kbps.webm", 1),
+ make_tuple("vp90-2-sintel_640x272_tile_1x2_318kbps.webm", 2),
+ make_tuple("vp90-2-sintel_854x364_tile_1x2_621kbps.webm", 2),
+ make_tuple("vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm", 4),
+ make_tuple("vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm", 4),
+ make_tuple("vp90-2-tos_426x178_tile_1x1_181kbps.webm", 1),
+ make_tuple("vp90-2-tos_640x266_tile_1x2_336kbps.webm", 2),
+ make_tuple("vp90-2-tos_854x356_tile_1x2_656kbps.webm", 2),
+ make_tuple("vp90-2-tos_1280x534_tile_1x4_1306kbps.webm", 4),
+ make_tuple("vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm", 4),
+};
+
+/*
+ In order to reflect real world performance as much as possible, Perf tests
+ *DO NOT* do any correctness checks. Please run them alongside correctness
+ tests to ensure proper codec integrity. Furthermore, in this test we
+ deliberately limit the amount of system calls we make to avoid OS
+ preemption.
+
+ TODO(joshualitt) create a more detailed perf measurement test to collect
+ power/temp/min max frame decode times/etc
+ */
+
+class DecodePerfTest : public ::testing::TestWithParam<decode_perf_param_t> {
+};
+
+TEST_P(DecodePerfTest, PerfTest) {
+ const char *const video_name = GET_PARAM(VIDEO_NAME);
+ const unsigned threads = GET_PARAM(THREADS);
+
+ libvpx_test::WebMVideoSource video(video_name);
+ video.Init();
+
+ vpx_codec_dec_cfg_t cfg = {0};
+ cfg.threads = threads;
+ libvpx_test::VP9Decoder decoder(cfg, 0);
+
+ vpx_usec_timer t;
+ vpx_usec_timer_start(&t);
+
+ for (video.Begin(); video.cxdata() != NULL; video.Next()) {
+ decoder.DecodeFrame(video.cxdata(), video.frame_size());
+ }
+
+ vpx_usec_timer_mark(&t);
+ const double elapsed_secs = double(vpx_usec_timer_elapsed(&t))
+ / kUsecsInSec;
+ const unsigned frames = video.frame_number();
+ const double fps = double(frames) / elapsed_secs;
+
+ printf("{\n");
+ printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+ printf("\t\"videoName\" : \"%s\",\n", video_name);
+ printf("\t\"threadCount\" : %u,\n", threads);
+ printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
+ printf("\t\"totalFrames\" : %u,\n", frames);
+ printf("\t\"framesPerSecond\" : %f\n", fps);
+ printf("}\n");
+}
+
+INSTANTIATE_TEST_CASE_P(VP9, DecodePerfTest,
+ ::testing::ValuesIn(kVP9DecodePerfVectors));
+
+} // namespace
diff --git a/source/libvpx/test/decode_test_driver.cc b/source/libvpx/test/decode_test_driver.cc
index 1f6d540..7a93e50 100644
--- a/source/libvpx/test/decode_test_driver.cc
+++ b/source/libvpx/test/decode_test_driver.cc
@@ -30,6 +30,7 @@ void DecoderTest::RunLoop(CompressedVideoSource *video) {
// Decode frames.
for (video->Begin(); video->cxdata(); video->Next()) {
+ PreDecodeFrameHook(*video, decoder);
vpx_codec_err_t res_dec = decoder->DecodeFrame(video->cxdata(),
video->frame_size());
ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder->DecodeError();
diff --git a/source/libvpx/test/decode_test_driver.h b/source/libvpx/test/decode_test_driver.h
index 055c45e..79db6e1 100644
--- a/source/libvpx/test/decode_test_driver.h
+++ b/source/libvpx/test/decode_test_driver.h
@@ -76,6 +76,16 @@ class Decoder {
return detail ? detail : vpx_codec_error(&decoder_);
}
+ // Passes the external frame buffer information to libvpx.
+ vpx_codec_err_t SetExternalFrameBuffers(
+ vpx_codec_frame_buffer_t *fb_list, int fb_count,
+ vpx_realloc_frame_buffer_cb_fn_t cb, void *user_priv) {
+ InitOnce();
+ return vpx_codec_set_frame_buffers(&decoder_,
+ fb_list, fb_count,
+ cb, user_priv);
+ }
+
protected:
virtual const vpx_codec_iface_t* CodecInterface() const = 0;
@@ -101,6 +111,10 @@ class DecoderTest {
// Main decoding loop
virtual void RunLoop(CompressedVideoSource *video);
+ // Hook to be called before decompressing every frame.
+ virtual void PreDecodeFrameHook(const CompressedVideoSource& video,
+ Decoder *decoder) {}
+
// Hook to be called on every decompressed frame.
virtual void DecompressedFrameHook(const vpx_image_t& img,
const unsigned int frame_number) {}
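
The SetExternalFrameBuffers() wrapper added above forwards to vpx_codec_set_frame_buffers(), the entry point this snapshot introduces for application-owned frame buffers. The sketch below mirrors the realloc callbacks used by the tests that follow (fill fb->data and fb->size, return 0 on success, -1 on failure) but uses realloc() rather than new[]; the function name is illustrative, and the vpx/vpx_external_frame_buffer.h header is assumed from the install rule added to libs.mk in this change.

#include <stdint.h>
#include <stdlib.h>

#include "vpx/vpx_external_frame_buffer.h"  // vpx_codec_frame_buffer_t (assumed header)

// Matches the callback shape used in this change:
//   int cb(void *user_priv, size_t new_size, vpx_codec_frame_buffer_t *fb)
// Grows (or first allocates) one externally owned buffer to |new_size| bytes.
static int app_realloc_frame_buffer(void *user_priv, size_t new_size,
                                    vpx_codec_frame_buffer_t *fb) {
  (void)user_priv;
  if (fb == NULL)
    return -1;
  uint8_t *const data = (uint8_t *)realloc(fb->data, new_size);
  if (data == NULL)
    return -1;
  fb->data = data;
  fb->size = new_size;
  return 0;
}

An application would register a zero-initialized vpx_codec_frame_buffer_t array together with this callback via vpx_codec_set_frame_buffers(&ctx, fb_list, fb_count, app_realloc_frame_buffer, NULL) before decoding the first frame, as the Decoder::SetExternalFrameBuffers() wrapper above does.
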
diff --git a/source/libvpx/test/error_resilience_test.cc b/source/libvpx/test/error_resilience_test.cc
index 16d250c..30c20e9 100644
--- a/source/libvpx/test/error_resilience_test.cc
+++ b/source/libvpx/test/error_resilience_test.cc
@@ -1,12 +1,12 @@
/*
- Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-
- Use of this source code is governed by a BSD-style license
- that can be found in the LICENSE file in the root of the source
- tree. An additional intellectual property rights grant can be found
- in the file PATENTS. All contributing project authors may
- be found in the AUTHORS file in the root of the source tree.
-*/
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/codec_factory.h"
diff --git a/source/libvpx/test/external_frame_buffer_test.cc b/source/libvpx/test/external_frame_buffer_test.cc
new file mode 100644
index 0000000..874d199
--- /dev/null
+++ b/source/libvpx/test/external_frame_buffer_test.cc
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+
+namespace {
+
+const int kVideoNameParam = 1;
+const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm";
+
+// Callback used by libvpx to request the application to allocate a frame
+// buffer of at least |new_size| in bytes.
+int realloc_vp9_frame_buffer(void *user_priv, size_t new_size,
+ vpx_codec_frame_buffer_t *fb) {
+ (void)user_priv;
+ if (fb == NULL)
+ return -1;
+
+ delete [] fb->data;
+ fb->data = new uint8_t[new_size];
+ fb->size = new_size;
+ return 0;
+}
+
+// Callback will not allocate data for frame buffer.
+int zero_realloc_vp9_frame_buffer(void *user_priv, size_t new_size,
+ vpx_codec_frame_buffer_t *fb) {
+ (void)user_priv;
+ if (fb == NULL)
+ return -1;
+
+ delete [] fb->data;
+ fb->data = NULL;
+ fb->size = new_size;
+ return 0;
+}
+
+// Callback will allocate one less byte.
+int one_less_byte_realloc_vp9_frame_buffer(void *user_priv, size_t new_size,
+ vpx_codec_frame_buffer_t *fb) {
+ (void)user_priv;
+ if (fb == NULL)
+ return -1;
+
+ delete [] fb->data;
+
+ const size_t error_size = new_size - 1;
+ fb->data = new uint8_t[error_size];
+ fb->size = error_size;
+ return 0;
+}
+
+// Class for testing passing in external frame buffers to libvpx.
+class ExternalFrameBufferMD5Test
+ : public ::libvpx_test::DecoderTest,
+ public ::libvpx_test::CodecTestWithParam<const char*> {
+ protected:
+ ExternalFrameBufferMD5Test()
+ : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)),
+ md5_file_(NULL),
+ num_buffers_(0),
+ frame_buffers_(NULL) {}
+
+ virtual ~ExternalFrameBufferMD5Test() {
+ for (int i = 0; i < num_buffers_; ++i) {
+ delete [] frame_buffers_[i].data;
+ }
+ delete [] frame_buffers_;
+
+ if (md5_file_ != NULL)
+ fclose(md5_file_);
+ }
+
+ virtual void PreDecodeFrameHook(
+ const libvpx_test::CompressedVideoSource &video,
+ libvpx_test::Decoder *decoder) {
+ if (num_buffers_ > 0 && video.frame_number() == 0) {
+ // Have libvpx use frame buffers we create.
+ frame_buffers_ = new vpx_codec_frame_buffer_t[num_buffers_];
+ memset(frame_buffers_, 0, sizeof(frame_buffers_[0]) * num_buffers_);
+
+ ASSERT_EQ(VPX_CODEC_OK,
+ decoder->SetExternalFrameBuffers(
+ frame_buffers_, num_buffers_,
+ realloc_vp9_frame_buffer, NULL));
+ }
+ }
+
+ void OpenMD5File(const std::string &md5_file_name_) {
+ md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_);
+ ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: "
+ << md5_file_name_;
+ }
+
+ virtual void DecompressedFrameHook(const vpx_image_t &img,
+ const unsigned int frame_number) {
+ ASSERT_TRUE(md5_file_ != NULL);
+ char expected_md5[33];
+ char junk[128];
+
+ // Read correct md5 checksums.
+ const int res = fscanf(md5_file_, "%s %s", expected_md5, junk);
+ ASSERT_NE(EOF, res) << "Read md5 data failed";
+ expected_md5[32] = '\0';
+
+ ::libvpx_test::MD5 md5_res;
+ md5_res.Add(&img);
+ const char *const actual_md5 = md5_res.Get();
+
+ // Check md5 match.
+ ASSERT_STREQ(expected_md5, actual_md5)
+ << "Md5 checksums don't match: frame number = " << frame_number;
+ }
+
+ void set_num_buffers(int num_buffers) { num_buffers_ = num_buffers; }
+ int num_buffers() const { return num_buffers_; }
+
+ private:
+ FILE *md5_file_;
+ int num_buffers_;
+ vpx_codec_frame_buffer_t *frame_buffers_;
+};
+
+class ExternalFrameBufferTest : public ::testing::Test {
+ protected:
+ ExternalFrameBufferTest()
+ : video_(NULL),
+ decoder_(NULL),
+ num_buffers_(0),
+ frame_buffers_(NULL) {}
+
+ virtual void SetUp() {
+ video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
+ video_->Init();
+ video_->Begin();
+
+ vpx_codec_dec_cfg_t cfg = {0};
+ decoder_ = new libvpx_test::VP9Decoder(cfg, 0);
+ }
+
+ virtual void TearDown() {
+ for (int i = 0; i < num_buffers_; ++i) {
+ delete [] frame_buffers_[i].data;
+ }
+ delete [] frame_buffers_;
+ delete decoder_;
+ delete video_;
+ }
+
+ // Passes the external frame buffer information to libvpx.
+ vpx_codec_err_t SetExternalFrameBuffers(
+ int num_buffers,
+ vpx_realloc_frame_buffer_cb_fn_t cb) {
+ if (num_buffers > 0) {
+ num_buffers_ = num_buffers;
+
+ // Have libvpx use frame buffers we create.
+ frame_buffers_ = new vpx_codec_frame_buffer_t[num_buffers_];
+ memset(frame_buffers_, 0, sizeof(frame_buffers_[0]) * num_buffers_);
+ }
+
+ return decoder_->SetExternalFrameBuffers(frame_buffers_, num_buffers_,
+ cb, NULL);
+ }
+
+ // Pass Null frame buffer list to libvpx.
+ vpx_codec_err_t SetNullFrameBuffers(
+ int num_buffers,
+ vpx_realloc_frame_buffer_cb_fn_t cb) {
+ return decoder_->SetExternalFrameBuffers(NULL, num_buffers,
+ cb, NULL);
+ }
+
+ vpx_codec_err_t DecodeOneFrame() {
+ const vpx_codec_err_t res =
+ decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+ if (res == VPX_CODEC_OK)
+ video_->Next();
+ return res;
+ }
+
+ vpx_codec_err_t DecodeRemainingFrames() {
+ for (; video_->cxdata(); video_->Next()) {
+ const vpx_codec_err_t res =
+ decoder_->DecodeFrame(video_->cxdata(), video_->frame_size());
+ if (res != VPX_CODEC_OK)
+ return res;
+
+ libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData();
+ const vpx_image_t *img = NULL;
+
+ // Get decompressed data
+ while ((img = dec_iter.Next())) {
+ }
+ }
+ return VPX_CODEC_OK;
+ }
+
+ libvpx_test::WebMVideoSource *video_;
+ libvpx_test::VP9Decoder *decoder_;
+ int num_buffers_;
+ vpx_codec_frame_buffer_t *frame_buffers_;
+};
+
+
+// This test runs through the set of test vectors, and decodes them.
+// Libvpx will call into the application to allocate a frame buffer when
+// needed. The md5 checksums are computed for each frame in the video file.
+// If md5 checksums match the correct md5 data, then the test is passed.
+// Otherwise, the test failed.
+TEST_P(ExternalFrameBufferMD5Test, ExtFBMD5Match) {
+ const std::string filename = GET_PARAM(kVideoNameParam);
+ libvpx_test::CompressedVideoSource *video = NULL;
+
+ // Number of buffers equals number of possible reference buffers(8), plus
+ // one working buffer, plus four jitter buffers.
+ const int num_buffers = 13;
+ set_num_buffers(num_buffers);
+
+ // Tell compiler we are not using kVP8TestVectors.
+ (void)libvpx_test::kVP8TestVectors;
+
+ // Open compressed video file.
+ if (filename.substr(filename.length() - 3, 3) == "ivf") {
+ video = new libvpx_test::IVFVideoSource(filename);
+ } else if (filename.substr(filename.length() - 4, 4) == "webm") {
+ video = new libvpx_test::WebMVideoSource(filename);
+ }
+ video->Init();
+
+ // Construct md5 file name.
+ const std::string md5_filename = filename + ".md5";
+ OpenMD5File(md5_filename);
+
+ // Decode frame, and check the md5 matching.
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+ delete video;
+}
+
+TEST_F(ExternalFrameBufferTest, EightFrameBuffers) {
+ // Minimum number of reference buffers for VP9 is 8.
+ const int num_buffers = 8;
+ ASSERT_EQ(VPX_CODEC_OK,
+ SetExternalFrameBuffers(num_buffers, realloc_vp9_frame_buffer));
+ ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, EightJitterBuffers) {
+ // Number of buffers equals number of possible reference buffers(8), plus
+ // one working buffer, plus eight jitter buffers.
+ const int num_buffers = 17;
+ ASSERT_EQ(VPX_CODEC_OK,
+ SetExternalFrameBuffers(num_buffers, realloc_vp9_frame_buffer));
+ ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames());
+}
+
+TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) {
+ // Minimum number of reference buffers for VP9 is 8.
+ const int num_buffers = 7;
+ ASSERT_EQ(VPX_CODEC_INVALID_PARAM,
+ SetExternalFrameBuffers(num_buffers, realloc_vp9_frame_buffer));
+}
+
+TEST_F(ExternalFrameBufferTest, NullFrameBufferList) {
+ // Number of buffers equals number of possible reference buffers(8), plus
+ // one working buffer, plus four jitter buffers.
+ const int num_buffers = 13;
+ ASSERT_EQ(VPX_CODEC_INVALID_PARAM,
+ SetNullFrameBuffers(num_buffers, realloc_vp9_frame_buffer));
+}
+
+TEST_F(ExternalFrameBufferTest, NullRealloc) {
+ // Number of buffers equals number of possible reference buffers(8), plus
+ // one working buffer, plus four jitter buffers.
+ const int num_buffers = 13;
+ ASSERT_EQ(VPX_CODEC_OK,
+ SetExternalFrameBuffers(num_buffers,
+ zero_realloc_vp9_frame_buffer));
+ ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+TEST_F(ExternalFrameBufferTest, ReallocOneLessByte) {
+ // Number of buffers equals number of possible reference buffers(8), plus
+ // one working buffer, plus four jitter buffers.
+ const int num_buffers = 13;
+ ASSERT_EQ(VPX_CODEC_OK,
+ SetExternalFrameBuffers(num_buffers,
+ one_less_byte_realloc_vp9_frame_buffer));
+ ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeOneFrame());
+}
+
+VP9_INSTANTIATE_TEST_CASE(ExternalFrameBufferMD5Test,
+ ::testing::ValuesIn(libvpx_test::kVP9TestVectors));
+} // namespace
diff --git a/source/libvpx/test/fdct4x4_test.cc b/source/libvpx/test/fdct4x4_test.cc
index 796a2e9..67426eb 100644
--- a/source/libvpx/test/fdct4x4_test.cc
+++ b/source/libvpx/test/fdct4x4_test.cc
@@ -13,178 +13,291 @@
#include <string.h>
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
extern "C" {
+#include "vp9/common/vp9_entropy.h"
#include "./vp9_rtcd.h"
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *output, int pitch);
}
-
-#include "test/acm_random.h"
#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
using libvpx_test::ACMRandom;
namespace {
-void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
- int stride, int /*tx_type*/) {
+const int kNumCoeffs = 16;
+typedef void (*fdct_t)(const int16_t *in, int16_t *out, int stride);
+typedef void (*idct_t)(const int16_t *in, uint8_t *out, int stride);
+typedef void (*fht_t) (const int16_t *in, int16_t *out, int stride,
+ int tx_type);
+typedef void (*iht_t) (const int16_t *in, uint8_t *out, int stride,
+ int tx_type);
+
+typedef std::tr1::tuple<fdct_t, idct_t, int> dct_4x4_param_t;
+typedef std::tr1::tuple<fht_t, iht_t, int> ht_4x4_param_t;
+
+void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_fdct4x4_c(in, out, stride);
}
-void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
- int stride, int /*tx_type*/) {
- vp9_idct4x4_16_add_c(out, dst, stride);
-}
-void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
- int stride, int tx_type) {
+
+void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_short_fht4x4_c(in, out, stride, tx_type);
}
-void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
- int stride, int tx_type) {
- vp9_iht4x4_16_add_c(out, dst, stride, tx_type);
-}
-class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
+class Trans4x4TestBase {
public:
- virtual ~FwdTrans4x4Test() {}
- virtual void SetUp() {
- tx_type_ = GetParam();
- if (tx_type_ == 0) {
- fwd_txfm_ = fdct4x4;
- inv_txfm_ = idct4x4_add;
- } else {
- fwd_txfm_ = fht4x4;
- inv_txfm_ = iht4x4_add;
- }
- }
+ virtual ~Trans4x4TestBase() {}
protected:
- void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
- int stride, int tx_type) {
- (*fwd_txfm_)(in, out, dst, stride, tx_type);
- }
+ virtual void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) = 0;
+
+ virtual void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) = 0;
+
+ void RunAccuracyCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ uint32_t max_error = 0;
+ int64_t total_error = 0;
+ const int count_test_block = 10000;
+ for (int i = 0; i < count_test_block; ++i) {
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
+ }
+
+ REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
+ test_temp_block, pitch_));
+ REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ const uint32_t diff = dst[j] - src[j];
+ const uint32_t error = diff * diff;
+ if (max_error < error)
+ max_error = error;
+ total_error += error;
+ }
+ }
+
+ EXPECT_GE(1u, max_error)
+ << "Error: 4x4 FHT/IHT has an individual round trip error > 1";
- void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
- int stride, int tx_type) {
- (*inv_txfm_)(in, out, dst, stride, tx_type);
+ EXPECT_GE(count_test_block , total_error)
+ << "Error: 4x4 FHT/IHT has average round trip error > 1 per block";
}
- int tx_type_;
- void (*fwd_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
- int stride, int tx_type);
- void (*inv_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
- int stride, int tx_type);
-};
+ void RunCoeffCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < kNumCoeffs; ++j)
+ input_block[j] = rnd.Rand8() - rnd.Rand8();
-TEST_P(FwdTrans4x4Test, SignBiasCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16);
- const int pitch = 4;
- int count_sign_block[16][2];
- const int count_test_block = 1000000;
-
- memset(count_sign_block, 0, sizeof(count_sign_block));
- for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-255, 255].
- for (int j = 0; j < 16; ++j)
- test_input_block[j] = rnd.Rand8() - rnd.Rand8();
-
- RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
-
- for (int j = 0; j < 16; ++j) {
- if (test_output_block[j] < 0)
- ++count_sign_block[j][0];
- else if (test_output_block[j] > 0)
- ++count_sign_block[j][1];
+ fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
+ REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
+
+ // The minimum quant value is 4.
+ for (int j = 0; j < kNumCoeffs; ++j)
+ EXPECT_EQ(output_block[j], output_ref_block[j]);
}
}
- for (int j = 0; j < 16; ++j) {
- const bool bias_acceptable = (abs(count_sign_block[j][0] -
- count_sign_block[j][1]) < 10000);
- EXPECT_TRUE(bias_acceptable)
- << "Error: 4x4 FDCT/FHT has a sign bias > 1%"
- << " for input range [-255, 255] at index " << j
- << " tx_type " << tx_type_;
+ void RunMemCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 5000;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ input_block[j] = rnd.Rand8() - rnd.Rand8();
+ input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+ }
+ if (i == 0)
+ for (int j = 0; j < kNumCoeffs; ++j)
+ input_extreme_block[j] = 255;
+ if (i == 1)
+ for (int j = 0; j < kNumCoeffs; ++j)
+ input_extreme_block[j] = -255;
+
+ fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+ REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
+ output_block, pitch_));
+
+ // The minimum quant value is 4.
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ EXPECT_EQ(output_block[j], output_ref_block[j]);
+ EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+ << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+ }
+ }
}
- memset(count_sign_block, 0, sizeof(count_sign_block));
- for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-15, 15].
- for (int j = 0; j < 16; ++j)
- test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
+ void RunInvAccuracyCheck() {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int count_test_block = 1000;
+ DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+ for (int i = 0; i < count_test_block; ++i) {
+ // Initialize a test block with input range [-255, 255].
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ in[j] = src[j] - dst[j];
+ }
- RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
+ fwd_txfm_ref(in, coeff, pitch_, tx_type_);
- for (int j = 0; j < 16; ++j) {
- if (test_output_block[j] < 0)
- ++count_sign_block[j][0];
- else if (test_output_block[j] > 0)
- ++count_sign_block[j][1];
+ REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+
+ for (int j = 0; j < kNumCoeffs; ++j) {
+ const uint32_t diff = dst[j] - src[j];
+ const uint32_t error = diff * diff;
+ EXPECT_GE(1u, error)
+ << "Error: 16x16 IDCT has error " << error
+ << " at index " << j;
+ }
}
}
- for (int j = 0; j < 16; ++j) {
- const bool bias_acceptable = (abs(count_sign_block[j][0] -
- count_sign_block[j][1]) < 100000);
- EXPECT_TRUE(bias_acceptable)
- << "Error: 4x4 FDCT/FHT has a sign bias > 10%"
- << " for input range [-15, 15] at index " << j;
+ int pitch_;
+ int tx_type_;
+ fht_t fwd_txfm_ref;
+};
+
+class Trans4x4DCT
+ : public Trans4x4TestBase,
+ public ::testing::TestWithParam<dct_4x4_param_t> {
+ public:
+ virtual ~Trans4x4DCT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 4;
+ fwd_txfm_ref = fdct4x4_ref;
+ }
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+ void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+ fwd_txfm_(in, out, stride);
+ }
+ void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride);
}
+
+ fdct_t fwd_txfm_;
+ idct_t inv_txfm_;
+};
+
+TEST_P(Trans4x4DCT, AccuracyCheck) {
+ RunAccuracyCheck();
}
-TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
-
- int max_error = 0;
- int total_error = 0;
- const int count_test_block = 1000000;
- for (int i = 0; i < count_test_block; ++i) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
- DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 16);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 16);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 16);
-
- for (int j = 0; j < 16; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- }
- // Initialize a test block with input range [-255, 255].
- for (int j = 0; j < 16; ++j)
- test_input_block[j] = src[j] - dst[j];
-
- const int pitch = 4;
- RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-
- for (int j = 0; j < 16; ++j) {
- if (test_temp_block[j] > 0) {
- test_temp_block[j] += 2;
- test_temp_block[j] /= 4;
- test_temp_block[j] *= 4;
- } else {
- test_temp_block[j] -= 2;
- test_temp_block[j] /= 4;
- test_temp_block[j] *= 4;
- }
- }
+TEST_P(Trans4x4DCT, CoeffCheck) {
+ RunCoeffCheck();
+}
- // inverse transform and reconstruct the pixel block
- RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
+TEST_P(Trans4x4DCT, MemCheck) {
+ RunMemCheck();
+}
- for (int j = 0; j < 16; ++j) {
- const int diff = dst[j] - src[j];
- const int error = diff * diff;
- if (max_error < error)
- max_error = error;
- total_error += error;
- }
+TEST_P(Trans4x4DCT, InvAccuracyCheck) {
+ RunInvAccuracyCheck();
+}
+
+class Trans4x4HT
+ : public Trans4x4TestBase,
+ public ::testing::TestWithParam<ht_4x4_param_t> {
+ public:
+ virtual ~Trans4x4HT() {}
+
+ virtual void SetUp() {
+ fwd_txfm_ = GET_PARAM(0);
+ inv_txfm_ = GET_PARAM(1);
+ tx_type_ = GET_PARAM(2);
+ pitch_ = 4;
+ fwd_txfm_ref = fht4x4_ref;
}
- EXPECT_GE(1, max_error)
- << "Error: FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1";
+ virtual void TearDown() { libvpx_test::ClearSystemState(); }
- EXPECT_GE(count_test_block, total_error)
- << "Error: FDCT/IDCT or FHT/IHT has average "
- << "roundtrip error > 1 per block";
+ protected:
+ void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+ fwd_txfm_(in, out, stride, tx_type_);
+ }
+
+ void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+ inv_txfm_(out, dst, stride, tx_type_);
+ }
+
+ fht_t fwd_txfm_;
+ iht_t inv_txfm_;
+};
+
+TEST_P(Trans4x4HT, AccuracyCheck) {
+ RunAccuracyCheck();
}
-INSTANTIATE_TEST_CASE_P(VP9, FwdTrans4x4Test, ::testing::Range(0, 4));
+TEST_P(Trans4x4HT, CoeffCheck) {
+ RunCoeffCheck();
+}
+
+TEST_P(Trans4x4HT, MemCheck) {
+ RunMemCheck();
+}
+
+TEST_P(Trans4x4HT, InvAccuracyCheck) {
+ RunInvAccuracyCheck();
+}
+
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+ C, Trans4x4DCT,
+ ::testing::Values(
+ make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+ C, Trans4x4HT,
+ ::testing::Values(
+ make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 0),
+ make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
+ make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
+ make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+ SSE2, Trans4x4DCT,
+ ::testing::Values(
+ make_tuple(&vp9_fdct4x4_sse2,
+ &vp9_idct4x4_16_add_sse2, 0)));
+INSTANTIATE_TEST_CASE_P(
+ SSE2, Trans4x4HT,
+ ::testing::Values(
+ make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),
+ make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),
+ make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),
+ make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
+#endif
+
} // namespace
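
The rewrite above drops the old PARAMS() macro in favour of explicit std::tr1::tuple parameter typedefs plus ::testing::TestWithParam<>, the same pattern applied to the 8x8, 16x16 and 32x32 transform tests in this change. A minimal, self-contained illustration of the pattern (all names hypothetical, not from libvpx) is:

#include "third_party/googletest/src/include/gtest/gtest.h"

typedef int (*unary_fn_t)(int);
typedef std::tr1::tuple<unary_fn_t, int> example_param_t;

static int times_two(int x) { return 2 * x; }

class ExampleParamTest : public ::testing::TestWithParam<example_param_t> {};

TEST_P(ExampleParamTest, AppliesFunction) {
  const unary_fn_t fn = std::tr1::get<0>(GetParam());
  const int input = std::tr1::get<1>(GetParam());
  EXPECT_EQ(2 * input, fn(input));
}

INSTANTIATE_TEST_CASE_P(
    C, ExampleParamTest,
    ::testing::Values(std::tr1::make_tuple(&times_two, 1),
                      std::tr1::make_tuple(&times_two, 5)));

Each optimized implementation (SSE2, AVX2, ...) then only needs another INSTANTIATE_TEST_CASE_P block listing its function pointers, which is how the SSE2 variants are registered above.
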
diff --git a/source/libvpx/test/fdct8x8_test.cc b/source/libvpx/test/fdct8x8_test.cc
index 3777b11..19ffe26 100644
--- a/source/libvpx/test/fdct8x8_test.cc
+++ b/source/libvpx/test/fdct8x8_test.cc
@@ -35,6 +35,9 @@ typedef void (*fht_t) (const int16_t *in, int16_t *out, int stride,
typedef void (*iht_t) (const int16_t *in, uint8_t *out, int stride,
int tx_type);
+typedef std::tr1::tuple<fdct_t, idct_t, int> dct_8x8_param_t;
+typedef std::tr1::tuple<fht_t, iht_t, int> ht_8x8_param_t;
+
void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
vp9_fdct8x8_c(in, out, stride);
}
@@ -215,8 +218,9 @@ class FwdTrans8x8TestBase {
fht_t fwd_txfm_ref;
};
-class FwdTrans8x8DCT : public FwdTrans8x8TestBase,
- public PARAMS(fdct_t, idct_t, int) {
+class FwdTrans8x8DCT
+ : public FwdTrans8x8TestBase,
+ public ::testing::TestWithParam<dct_8x8_param_t> {
public:
virtual ~FwdTrans8x8DCT() {}
@@ -254,8 +258,9 @@ TEST_P(FwdTrans8x8DCT, ExtremalCheck) {
RunExtremalCheck();
}
-class FwdTrans8x8HT : public FwdTrans8x8TestBase,
- public PARAMS(fht_t, iht_t, int) {
+class FwdTrans8x8HT
+ : public FwdTrans8x8TestBase,
+ public ::testing::TestWithParam<ht_8x8_param_t> {
public:
virtual ~FwdTrans8x8HT() {}
diff --git a/source/libvpx/test/lru_frame_buffer_test.cc b/source/libvpx/test/lru_frame_buffer_test.cc
new file mode 100644
index 0000000..cd6b432
--- /dev/null
+++ b/source/libvpx/test/lru_frame_buffer_test.cc
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <queue>
+#include <string>
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+
+namespace {
+
+const int kVideoNameParam = 1;
+
+const char *kLRUTestVectors[] = {
+ "vp90-2-02-size-lf-1920x1080.webm",
+ "vp90-2-05-resize.ivf",
+};
+
+// Callback used by libvpx to request that the application allocate a frame
+// buffer of at least |new_size| bytes.
+int realloc_vp9_frame_buffer(void *user_priv, size_t new_size,
+ vpx_codec_frame_buffer_t *fb) {
+ (void)user_priv;
+ if (fb == NULL)
+ return -1;
+
+ delete [] fb->data;
+ fb->data = new uint8_t[new_size];
+ fb->size = new_size;
+
+ return 0;
+}
+
+// Class for testing that libvpx uses the least recently used frame buffer
+// when a new buffer is requested.
+class LRUFrameBufferTest
+ : public ::libvpx_test::DecoderTest,
+ public ::libvpx_test::CodecTestWithParam<const char*> {
+ protected:
+ struct FrameBufferMD5Sum {
+ int frame_buffer_index;
+ vpx_image_t img;
+ std::string md5;
+ };
+
+ LRUFrameBufferTest()
+ : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)),
+ num_buffers_(0),
+ num_jitter_buffers_(0),
+ frame_buffers_(NULL) {}
+
+ virtual ~LRUFrameBufferTest() {
+ for (int i = 0; i < num_buffers_; ++i) {
+ delete [] frame_buffers_[i].data;
+ }
+ delete [] frame_buffers_;
+ }
+
+ virtual void PreDecodeFrameHook(
+ const libvpx_test::CompressedVideoSource &video,
+ libvpx_test::Decoder *decoder) {
+ // Use external buffers for testing jitter buffers.
+ if (num_jitter_buffers_ > 0 && video.frame_number() == 0) {
+ const int max_reference_buffers = 8;
+
+ // Add 1 for a work buffer.
+ num_buffers_ = max_reference_buffers + 1 + num_jitter_buffers_;
+
+ // Have libvpx use frame buffers we create.
+ frame_buffers_ = new vpx_codec_frame_buffer_t[num_buffers_];
+ memset(frame_buffers_, 0, sizeof(frame_buffers_[0]) * num_buffers_);
+
+ decoder->SetExternalFrameBuffers(frame_buffers_, num_buffers_,
+ realloc_vp9_frame_buffer, NULL);
+ }
+
+ // Turn on frame buffer LRU cache.
+ decoder->Control(VP9D_SET_FRAME_BUFFER_LRU_CACHE, 1);
+ }
+
+ virtual void DecompressedFrameHook(const vpx_image_t &img,
+ const unsigned int frame_number) {
+ const uint32_t ximg_y_plane = 0;
+ const uint8_t *const y_buffer = img.planes[ximg_y_plane];
+
+ // Find which external buffer contains the y_buffer.
+ int i = 0;
+ for (i = 0; i < num_buffers_; ++i) {
+ if (y_buffer >= frame_buffers_[i].data &&
+ y_buffer < (frame_buffers_[i].data + frame_buffers_[i].size)) {
+ break;
+ }
+ }
+
+ FrameBufferMD5Sum fb_md5;
+ fb_md5.frame_buffer_index = i;
+ fb_md5.img = img;
+
+ libvpx_test::MD5 md5;
+ md5.Add(&img);
+ fb_md5.md5 = md5.Get();
+ jitter_buffer_md5_sums_.push(fb_md5);
+
+    // Check whether any of the reconstructed images have changed.
+ if (jitter_buffer_md5_sums_.size() >
+ static_cast<size_t>(num_jitter_buffers_)) {
+ fb_md5 = jitter_buffer_md5_sums_.front();
+
+ libvpx_test::MD5 md5;
+ md5.Add(&fb_md5.img);
+ const std::string check_str = md5.Get();
+
+ ASSERT_EQ(fb_md5.md5, check_str);
+ jitter_buffer_md5_sums_.pop();
+ }
+ }
+
+ libvpx_test::CompressedVideoSource *OpenCompressedFile(
+ const std::string &filename) {
+ if (filename.substr(filename.length() - 3, 3) == "ivf") {
+ return new libvpx_test::IVFVideoSource(filename);
+ } else if (filename.substr(filename.length() - 4, 4) == "webm") {
+ return new libvpx_test::WebMVideoSource(filename);
+ }
+ return NULL;
+ }
+
+ void set_num_jitter_buffers(int num_buffers) {
+ num_jitter_buffers_ = num_buffers;
+ }
+
+ private:
+ // Total number of external frame buffers.
+ int num_buffers_;
+ int num_jitter_buffers_;
+
+ // External frame buffers used by libvpx.
+ vpx_codec_frame_buffer_t *frame_buffers_;
+
+ // Save the md5 checksums for later comparison.
+ std::queue<FrameBufferMD5Sum> jitter_buffer_md5_sums_;
+};
+
+// This test runs through a set of test vectors and decodes them.
+// libvpx calls back into the application to allocate a frame buffer
+// whenever one is needed. An MD5 checksum is computed for each frame as
+// it is decoded and stored for later comparison. After a jitter frame
+// buffer has expired, the MD5 checksum of the expired jitter-buffer
+// frame is computed again and compared against the checksum taken when
+// the frame was decoded. The test passes if the checksums match and
+// fails otherwise.
+TEST_P(LRUFrameBufferTest, CheckLRUOneJitterBuffer) {
+ const std::string filename = GET_PARAM(kVideoNameParam);
+
+ set_num_jitter_buffers(1);
+
+ libvpx_test::CompressedVideoSource *const video =
+ OpenCompressedFile(filename);
+ video->Init();
+
+  // Decode the frames and verify that the MD5 checksums match.
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+ delete video;
+}
+
+TEST_P(LRUFrameBufferTest, CheckLRUFourJitterBuffers) {
+ const std::string filename = GET_PARAM(kVideoNameParam);
+
+ set_num_jitter_buffers(4);
+
+ libvpx_test::CompressedVideoSource *const video =
+ OpenCompressedFile(filename);
+ video->Init();
+
+  // Decode the frames and verify that the MD5 checksums match.
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+ delete video;
+}
+
+TEST_P(LRUFrameBufferTest, CheckLRUEightJitterBuffers) {
+ const std::string filename = GET_PARAM(kVideoNameParam);
+
+ set_num_jitter_buffers(8);
+
+ libvpx_test::CompressedVideoSource *const video =
+ OpenCompressedFile(filename);
+ video->Init();
+
+  // Decode the frames and verify that the MD5 checksums match.
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+ delete video;
+}
+
+VP9_INSTANTIATE_TEST_CASE(LRUFrameBufferTest,
+ ::testing::ValuesIn(kLRUTestVectors));
+} // namespace
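
DecompressedFrameHook() above identifies which externally supplied buffer a decoded frame landed in by scanning the [data, data + size) range of each vpx_codec_frame_buffer_t. Reduced to a self-contained sketch, with a hypothetical ExtBuffer struct standing in for the libvpx type, the lookup is simply:

#include <cstddef>
#include <stdint.h>

struct ExtBuffer {   // hypothetical stand-in for vpx_codec_frame_buffer_t
  uint8_t *data;
  size_t size;
};

// Returns the index of the buffer whose range contains |ptr|, or |count| if
// none does -- the same fall-through-the-loop convention the test relies on.
static int FindOwningBuffer(const ExtBuffer *buffers, int count,
                            const uint8_t *ptr) {
  for (int i = 0; i < count; ++i) {
    if (ptr >= buffers[i].data && ptr < buffers[i].data + buffers[i].size)
      return i;
  }
  return count;
}
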
diff --git a/source/libvpx/test/sixtap_predict_test.cc b/source/libvpx/test/sixtap_predict_test.cc
index ee4faac..0f5c0a5 100644
--- a/source/libvpx/test/sixtap_predict_test.cc
+++ b/source/libvpx/test/sixtap_predict_test.cc
@@ -1,12 +1,12 @@
/*
-* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-*
-* Use of this source code is governed by a BSD-style license
-* that can be found in the LICENSE file in the root of the source
-* tree. An additional intellectual property rights grant can be found
-* in the file PATENTS. All contributing project authors may
-* be found in the AUTHORS file in the root of the source tree.
-*/
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
#include <math.h>
#include <stdlib.h>
@@ -32,7 +32,10 @@ typedef void (*sixtap_predict_fn_t)(uint8_t *src_ptr,
uint8_t *dst_ptr,
int dst_pitch);
-class SixtapPredictTest : public PARAMS(int, int, sixtap_predict_fn_t) {
+typedef std::tr1::tuple<int, int, sixtap_predict_fn_t> sixtap_predict_param_t;
+
+class SixtapPredictTest
+ : public ::testing::TestWithParam<sixtap_predict_param_t> {
public:
static void SetUpTestCase() {
src_ = reinterpret_cast<uint8_t*>(vpx_memalign(kDataAlignment, kSrcSize));
diff --git a/source/libvpx/test/svc_test.cc b/source/libvpx/test/svc_test.cc
new file mode 100644
index 0000000..3ddd9c1
--- /dev/null
+++ b/source/libvpx/test/svc_test.cc
@@ -0,0 +1,337 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "vpx/svc_context.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+namespace {
+
+using libvpx_test::CodecFactory;
+using libvpx_test::Decoder;
+using libvpx_test::VP9CodecFactory;
+
+class SvcTest : public ::testing::Test {
+ protected:
+ static const uint32_t kWidth = 352;
+ static const uint32_t kHeight = 288;
+
+ SvcTest()
+ : codec_iface_(0),
+ test_file_name_("hantro_collage_w352h288.yuv"),
+ codec_initialized_(false),
+ decoder_(0) {
+ memset(&svc_, 0, sizeof(svc_));
+ memset(&codec_, 0, sizeof(codec_));
+ memset(&codec_enc_, 0, sizeof(codec_enc_));
+ }
+
+ virtual ~SvcTest() {}
+
+ virtual void SetUp() {
+ svc_.encoding_mode = INTER_LAYER_PREDICTION_IP;
+ svc_.log_level = SVC_LOG_DEBUG;
+ svc_.log_print = 0;
+
+ codec_iface_ = vpx_codec_vp9_cx();
+ const vpx_codec_err_t res =
+ vpx_codec_enc_config_default(codec_iface_, &codec_enc_, 0);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+
+ codec_enc_.g_w = kWidth;
+ codec_enc_.g_h = kHeight;
+ codec_enc_.g_timebase.num = 1;
+ codec_enc_.g_timebase.den = 60;
+ codec_enc_.kf_min_dist = 100;
+ codec_enc_.kf_max_dist = 100;
+
+ vpx_codec_dec_cfg_t dec_cfg = {0};
+ VP9CodecFactory codec_factory;
+ decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
+ }
+
+ virtual void TearDown() {
+ vpx_svc_release(&svc_);
+ delete(decoder_);
+ if (codec_initialized_) vpx_codec_destroy(&codec_);
+ }
+
+ SvcContext svc_;
+ vpx_codec_ctx_t codec_;
+ struct vpx_codec_enc_cfg codec_enc_;
+ vpx_codec_iface_t *codec_iface_;
+ std::string test_file_name_;
+ bool codec_initialized_;
+ Decoder *decoder_;
+};
+
+TEST_F(SvcTest, SvcInit) {
+ // test missing parameters
+ vpx_codec_err_t res = vpx_svc_init(NULL, &codec_, codec_iface_, &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+ res = vpx_svc_init(&svc_, NULL, codec_iface_, &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+ res = vpx_svc_init(&svc_, &codec_, NULL, &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ res = vpx_svc_init(&svc_, &codec_, codec_iface_, NULL);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ svc_.spatial_layers = 6; // too many layers
+ res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ svc_.spatial_layers = 0; // use default layers
+ res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+ EXPECT_EQ(VPX_SS_DEFAULT_LAYERS, svc_.spatial_layers);
+}
+
+TEST_F(SvcTest, InitTwoLayers) {
+ svc_.spatial_layers = 2;
+ vpx_svc_set_scale_factors(&svc_, "4/16,16*16"); // invalid scale values
+ vpx_codec_err_t res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ vpx_svc_set_scale_factors(&svc_, "4/16,16/16"); // valid scale values
+ res = vpx_svc_init(&svc_, &codec_, codec_iface_, &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+}
+
+TEST_F(SvcTest, InvalidOptions) {
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, NULL);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ res = vpx_svc_set_options(&svc_, "not-an-option=1");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+}
+
+TEST_F(SvcTest, SetLayersOption) {
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=3");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+ EXPECT_EQ(3, svc_.spatial_layers);
+}
+
+TEST_F(SvcTest, SetEncodingMode) {
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, "encoding-mode=alt-ip");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+ EXPECT_EQ(ALT_INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
+}
+
+TEST_F(SvcTest, SetMultipleOptions) {
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, "layers=2 encoding-mode=ip");
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+ EXPECT_EQ(2, svc_.spatial_layers);
+ EXPECT_EQ(INTER_LAYER_PREDICTION_IP, svc_.encoding_mode);
+}
+
+TEST_F(SvcTest, SetScaleFactorsOption) {
+ svc_.spatial_layers = 2;
+ vpx_codec_err_t res =
+ vpx_svc_set_options(&svc_, "scale-factors=not-scale-factors");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ res = vpx_svc_set_options(&svc_, "scale-factors=1/3,2/3");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+}
+
+TEST_F(SvcTest, SetQuantizersOption) {
+ svc_.spatial_layers = 2;
+ vpx_codec_err_t res = vpx_svc_set_options(&svc_, "quantizers=not-quantizers");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ vpx_svc_set_options(&svc_, "quantizers=40,45");
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+}
+
+TEST_F(SvcTest, SetQuantizers) {
+ vpx_codec_err_t res = vpx_svc_set_quantizers(NULL, "40,30");
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ res = vpx_svc_set_quantizers(&svc_, NULL);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ svc_.spatial_layers = 2;
+ res = vpx_svc_set_quantizers(&svc_, "40");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ res = vpx_svc_set_quantizers(&svc_, "40,30");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+}
+
+TEST_F(SvcTest, SetScaleFactors) {
+ vpx_codec_err_t res = vpx_svc_set_scale_factors(NULL, "4/16,16/16");
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ res = vpx_svc_set_scale_factors(&svc_, NULL);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ svc_.spatial_layers = 2;
+ res = vpx_svc_set_scale_factors(&svc_, "4/16");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ res = vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ res = vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+}
+
+// Test that decoder can handle an SVC frame as the first frame in a sequence.
+TEST_F(SvcTest, FirstFrameHasLayers) {
+ svc_.spatial_layers = 2;
+ vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
+ vpx_svc_set_quantizers(&svc_, "40,30");
+
+ vpx_codec_err_t res =
+ vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+
+ libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+ codec_enc_.g_timebase.den,
+ codec_enc_.g_timebase.num, 0, 30);
+ video.Begin();
+
+ res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+ video.duration(), VPX_DL_REALTIME);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+
+ const vpx_codec_err_t res_dec = decoder_->DecodeFrame(
+ static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
+ vpx_svc_get_frame_size(&svc_));
+
+ // this test fails with a decoder error
+ ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+}
+
+TEST_F(SvcTest, EncodeThreeFrames) {
+ svc_.spatial_layers = 2;
+ vpx_svc_set_scale_factors(&svc_, "4/16,16/16");
+ vpx_svc_set_quantizers(&svc_, "40,30");
+
+ vpx_codec_err_t res =
+ vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ ASSERT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+
+ libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+ codec_enc_.g_timebase.den,
+ codec_enc_.g_timebase.num, 0, 30);
+ // FRAME 0
+ video.Begin();
+ // This frame is a keyframe.
+ res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+ video.duration(), VPX_DL_REALTIME);
+ ASSERT_EQ(VPX_CODEC_OK, res);
+ EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));
+
+ vpx_codec_err_t res_dec = decoder_->DecodeFrame(
+ static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
+ vpx_svc_get_frame_size(&svc_));
+ ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+
+ // FRAME 1
+ video.Next();
+ // This is a P-frame.
+ res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+ video.duration(), VPX_DL_REALTIME);
+ ASSERT_EQ(VPX_CODEC_OK, res);
+ EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
+
+ res_dec = decoder_->DecodeFrame(
+ static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
+ vpx_svc_get_frame_size(&svc_));
+ ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+
+ // FRAME 2
+ video.Next();
+ // This is a P-frame.
+ res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
+ video.duration(), VPX_DL_REALTIME);
+ ASSERT_EQ(VPX_CODEC_OK, res);
+ EXPECT_EQ(0, vpx_svc_is_keyframe(&svc_));
+
+ res_dec = decoder_->DecodeFrame(
+ static_cast<const uint8_t *>(vpx_svc_get_buffer(&svc_)),
+ vpx_svc_get_frame_size(&svc_));
+ ASSERT_EQ(VPX_CODEC_OK, res_dec) << decoder_->DecodeError();
+}
+
+TEST_F(SvcTest, GetLayerResolution) {
+ svc_.spatial_layers = 2;
+ vpx_svc_set_scale_factors(&svc_, "4/16,8/16");
+ vpx_svc_set_quantizers(&svc_, "40,30");
+
+ vpx_codec_err_t res =
+ vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ codec_initialized_ = true;
+
+  // Ensure that a request for an out-of-range layer is rejected.
+ uint32_t layer_width, layer_height;
+ res = vpx_svc_get_layer_resolution(&svc_, svc_.spatial_layers,
+ &layer_width, &layer_height);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ res = vpx_svc_get_layer_resolution(NULL, 0, &layer_width, &layer_height);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ res = vpx_svc_get_layer_resolution(&svc_, 0, NULL, &layer_height);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ res = vpx_svc_get_layer_resolution(&svc_, 0, &layer_width, NULL);
+ EXPECT_EQ(VPX_CODEC_INVALID_PARAM, res);
+
+ res = vpx_svc_get_layer_resolution(&svc_, 0, &layer_width, &layer_height);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ EXPECT_EQ(kWidth * 4 / 16, layer_width);
+ EXPECT_EQ(kHeight * 4 / 16, layer_height);
+
+ res = vpx_svc_get_layer_resolution(&svc_, 1, &layer_width, &layer_height);
+ EXPECT_EQ(VPX_CODEC_OK, res);
+ EXPECT_EQ(kWidth * 8 / 16, layer_width);
+ EXPECT_EQ(kHeight * 8 / 16, layer_height);
+}
+
+} // namespace
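
The SvcTest cases above exercise the vpx/svc_context.h helpers in a fixed order: configure the SvcContext, initialize it against the VP9 encoder interface, then push frames through vpx_svc_encode() and fetch the output via vpx_svc_get_buffer()/vpx_svc_get_frame_size(). A condensed sketch of that sequence, folded into a single hypothetical helper (error handling reduced to early returns; every libvpx call is taken from the tests above, the helper's own signature is an assumption):

#include <cstring>

#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

// Hypothetical helper: encode one raw frame with a 2-layer SVC configuration
// and return the compressed frame size, or -1 on failure.
static int EncodeOneSvcFrame(vpx_image_t *img, vpx_codec_pts_t pts,
                             int64_t duration) {
  SvcContext svc;
  vpx_codec_ctx_t codec;
  vpx_codec_enc_cfg_t cfg;
  memset(&svc, 0, sizeof(svc));
  memset(&codec, 0, sizeof(codec));

  if (vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0) != VPX_CODEC_OK)
    return -1;

  svc.spatial_layers = 2;
  vpx_svc_set_scale_factors(&svc, "4/16,16/16");
  vpx_svc_set_quantizers(&svc, "40,30");
  if (vpx_svc_init(&svc, &codec, vpx_codec_vp9_cx(), &cfg) != VPX_CODEC_OK)
    return -1;

  // Encode one frame; the compressed output is then available through
  // vpx_svc_get_buffer() / vpx_svc_get_frame_size().
  int frame_size = -1;
  if (vpx_svc_encode(&svc, &codec, img, pts, duration,
                     VPX_DL_REALTIME) == VPX_CODEC_OK) {
    frame_size = static_cast<int>(vpx_svc_get_frame_size(&svc));
  }

  vpx_svc_release(&svc);
  vpx_codec_destroy(&codec);
  return frame_size;
}
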
diff --git a/source/libvpx/test/test-data.sha1 b/source/libvpx/test/test-data.sha1
index 44220d5..442bfd2 100644
--- a/source/libvpx/test/test-data.sha1
+++ b/source/libvpx/test/test-data.sha1
@@ -61,6 +61,7 @@ c87599cbecd72d4cd4f7ace3313b7a6bc6eb8163 vp80-05-sharpness-1438.ivf
aff51d865c2621b60510459244ea83e958e4baed vp80-05-sharpness-1439.ivf
da386e72b19b5485a6af199c5eb60ef25e510dd1 vp80-05-sharpness-1440.ivf
6759a095203d96ccd267ce09b1b050b8cc4c2f1f vp80-05-sharpness-1443.ivf
+b95d3cc1d0df991e63e150a801710a72f20d9ba0 vp80-06-smallsize.ivf
db55ec7fd02c864ba996ff060b25b1e08611330b vp80-00-comprehensive-001.ivf.md5
29db0ad011cba1e45f856d5623cd38dac3e3bf19 vp80-00-comprehensive-002.ivf.md5
e84f258f69e173e7d68f8f8c037a0a3766902182 vp80-00-comprehensive-003.ivf.md5
@@ -122,6 +123,7 @@ f95eb6214571434f1f73ab7833b9ccdf47588020 vp80-03-segmentation-1437.ivf.md5
086c56378df81b6cee264d7540a7b8f2b405c7a4 vp80-05-sharpness-1439.ivf.md5
d32dc2c4165eb266ea4c23c14a45459b363def32 vp80-05-sharpness-1440.ivf.md5
8c69dc3d8e563f56ffab5ad1e400d9e689dd23df vp80-05-sharpness-1443.ivf.md5
+d6f246df012c241b5fa6c1345019a3703d85c419 vp80-06-smallsize.ivf.md5
ce881e567fe1d0fbcb2d3e9e6281a1a8d74d82e0 vp90-2-00-quantizer-00.webm
ac5eda33407d0521c7afca43a63fd305c0cd9d13 vp90-2-00-quantizer-00.webm.md5
2ca0463f2cfb93d25d7dded174db70b7cb87cb48 vp90-2-00-quantizer-01.webm
@@ -538,3 +540,32 @@ cf8ea970c776797aae71dac8317ea926d9431cab vp90-2-08-tile_1x4_frame_parallel.webm
a481fbea465010b57af5a19ebf6d4a5cfe5b9278 vp90-2-08-tile_1x4_frame_parallel.webm.md5
0203ec456277a01aec401e7fb6c72c9a7e5e3f9d vp90-2-08-tile_1x4.webm
c9b237dfcc01c1b414fbcaa481d014a906ef7998 vp90-2-08-tile_1x4.webm.md5
+20c75157e91ab41f82f70ffa73d5d01df8469287 vp90-2-08-tile-4x4.webm
+ae7451810247fd13975cc257aa0301ff17102255 vp90-2-08-tile-4x4.webm.md5
+2ec6e15422ac7a61af072dc5f27fcaf1942ce116 vp90-2-08-tile-4x1.webm
+0094f5ee5e46345017c30e0aa4835b550212d853 vp90-2-08-tile-4x1.webm.md5
+edea45dac4a3c2e5372339f8851d24c9bef803d6 vp90-2-09-subpixel-00.ivf
+5428efc4bf92191faedf4a727fcd1d94966a7abc vp90-2-09-subpixel-00.ivf.md5
+8cdd435d89029987ee196896e21520e5f879f04d vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm
+091b373aa2ecb59aa5c647affd5bcafcc7547364 vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm
+87ee28032b0963a44b73a850fcc816a6dc83efbb vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm
+c6ce25c4bfd4bdfc2932b70428e3dfe11210ec4f vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
+2064bdb22aa71c2691e0469fb62e8087a43f08f8 vp90-2-bbb_426x240_tile_1x1_180kbps.webm
+8080eda22694910162f0996e8a962612f381a57f vp90-2-bbb_640x360_tile_1x2_337kbps.webm
+a484b335c27ea189c0f0d77babea4a510ce12d50 vp90-2-bbb_854x480_tile_1x2_651kbps.webm
+3eacf1f006250be4cc5c92a7ef146e385ee62653 vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm
+217f089a16447490823127b36ce0d945522accfd vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
+eedb3c641e60dacbe082491a16df529a5c9187df vp90-2-sintel_426x182_tile_1x1_171kbps.webm
+cb7e4955af183dff33bcba0c837f0922ab066400 vp90-2-sintel_640x272_tile_1x2_318kbps.webm
+48613f9380e2580002f8a09d6e412ea4e89a52b9 vp90-2-sintel_854x364_tile_1x2_621kbps.webm
+990a91f24dd284562d21d714ae773dff5452cad8 vp90-2-tos_1280x534_tile_1x4_1306kbps.webm
+b6dd558c90bca466b4bcbd03b3371648186465a7 vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
+1a9c2914ba932a38f0a143efc1ad0e318e78888b vp90-2-tos_426x178_tile_1x1_181kbps.webm
+a3d2b09f24debad4747a1b3066f572be4273bced vp90-2-tos_640x266_tile_1x2_336kbps.webm
+c64b03b5c090e6888cb39685c31f00a6b79fa45c vp90-2-tos_854x356_tile_1x2_656kbps.webm
+0e7cd4135b231c9cea8d76c19f9e84b6fd77acec vp90-2-08-tile_1x8_frame_parallel.webm
+c9b6850af28579b031791066457f4cb40df6e1c7 vp90-2-08-tile_1x8_frame_parallel.webm.md5
+e448b6e83490bca0f8d58b4f4b1126a17baf4b0c vp90-2-08-tile_1x8.webm
+5e524165f0397e6141d914f4f0a66267d7658376 vp90-2-08-tile_1x8.webm.md5
+a34e14923d6d17b1144254d8187d7f85b700a63c vp90-2-02-size-lf-1920x1080.webm
+e3b28ddcfaeb37fb4d132b93f92642a9ad17c22d vp90-2-02-size-lf-1920x1080.webm.md5
diff --git a/source/libvpx/test/test.mk b/source/libvpx/test/test.mk
index e07dc77..361a34f 100644
--- a/source/libvpx/test/test.mk
+++ b/source/libvpx/test/test.mk
@@ -7,6 +7,8 @@ LIBVPX_TEST_SRCS-yes += codec_factory.h
LIBVPX_TEST_SRCS-yes += test_libvpx.cc
LIBVPX_TEST_SRCS-yes += util.h
LIBVPX_TEST_SRCS-yes += video_source.h
+LIBVPX_TEST_SRCS-yes += test_vectors.h
+LIBVPX_TEST_SRCS-yes += test_vectors.cc
##
## BLACK BOX TESTS
@@ -32,6 +34,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../md5_utils.h ../md5_utils.c
LIBVPX_TEST_SRCS-yes += decode_test_driver.cc
LIBVPX_TEST_SRCS-yes += decode_test_driver.h
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += lru_frame_buffer_test.cc
## WebM Parsing
NESTEGG_SRCS += ../nestegg/halloc/halloc.h
@@ -44,6 +48,10 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += $(NESTEGG_SRCS)
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += webm_video_source.h
LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += test_vector_test.cc
+# Currently we only support decoder perf tests for vp9
+ifeq ($(CONFIG_DECODE_PERF_TESTS)$(CONFIG_VP9_DECODER), yesyes)
+LIBVPX_TEST_SRCS-yes += decode_perf_test.cc
+endif
##
## WHITE BOX TESTS
@@ -96,6 +104,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc
endif # VP9
@@ -170,6 +179,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-003.ivf.md5
@@ -231,6 +241,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1438.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1439.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1440.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-05-sharpness-1443.ivf.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-06-smallsize.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-00.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-00-quantizer-01.webm
@@ -501,6 +512,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x64.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-66x66.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-02-size-lf-1920x1080.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x196.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-03-size-196x198.webm
@@ -645,5 +658,55 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4_frame_parallel.webm.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8_frame_parallel.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile_1x8.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x4.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-09-subpixel-00.ivf.md5
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
+
+ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
+# BBB VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-bbb_426x240_tile_1x1_180kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-bbb_640x360_tile_1x2_337kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-bbb_854x480_tile_1x2_651kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
+# Sintel VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-sintel_426x182_tile_1x1_171kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-sintel_640x272_tile_1x2_318kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-sintel_854x364_tile_1x2_621kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
+# TOS VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-tos_426x178_tile_1x1_181kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-tos_640x266_tile_1x2_336kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-tos_854x356_tile_1x2_656kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-tos_1280x534_tile_1x4_1306kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+ vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
+endif # CONFIG_DECODE_PERF_TESTS
diff --git a/source/libvpx/test/test_vector_test.cc b/source/libvpx/test/test_vector_test.cc
index c6ad1c5..6d93bb8 100644
--- a/source/libvpx/test/test_vector_test.cc
+++ b/source/libvpx/test/test_vector_test.cc
@@ -1,11 +1,11 @@
/*
- Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-
- Use of this source code is governed by a BSD-style license
- that can be found in the LICENSE file in the root of the source
- tree. An additional intellectual property rights grant can be found
- in the file PATENTS. All contributing project authors may
- be found in the AUTHORS file in the root of the source tree.
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
#include <cstdio>
@@ -15,160 +15,15 @@
#include "test/codec_factory.h"
#include "test/decode_test_driver.h"
#include "test/ivf_video_source.h"
-#include "test/webm_video_source.h"
-#include "test/util.h"
#include "test/md5_helper.h"
+#include "test/test_vectors.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
extern "C" {
#include "vpx_mem/vpx_mem.h"
}
namespace {
-#if CONFIG_VP8_DECODER
-const char *kVP8TestVectors[] = {
- "vp80-00-comprehensive-001.ivf",
- "vp80-00-comprehensive-002.ivf", "vp80-00-comprehensive-003.ivf",
- "vp80-00-comprehensive-004.ivf", "vp80-00-comprehensive-005.ivf",
- "vp80-00-comprehensive-006.ivf", "vp80-00-comprehensive-007.ivf",
- "vp80-00-comprehensive-008.ivf", "vp80-00-comprehensive-009.ivf",
- "vp80-00-comprehensive-010.ivf", "vp80-00-comprehensive-011.ivf",
- "vp80-00-comprehensive-012.ivf", "vp80-00-comprehensive-013.ivf",
- "vp80-00-comprehensive-014.ivf", "vp80-00-comprehensive-015.ivf",
- "vp80-00-comprehensive-016.ivf", "vp80-00-comprehensive-017.ivf",
- "vp80-00-comprehensive-018.ivf", "vp80-01-intra-1400.ivf",
- "vp80-01-intra-1411.ivf", "vp80-01-intra-1416.ivf",
- "vp80-01-intra-1417.ivf", "vp80-02-inter-1402.ivf",
- "vp80-02-inter-1412.ivf", "vp80-02-inter-1418.ivf",
- "vp80-02-inter-1424.ivf", "vp80-03-segmentation-01.ivf",
- "vp80-03-segmentation-02.ivf", "vp80-03-segmentation-03.ivf",
- "vp80-03-segmentation-04.ivf", "vp80-03-segmentation-1401.ivf",
- "vp80-03-segmentation-1403.ivf", "vp80-03-segmentation-1407.ivf",
- "vp80-03-segmentation-1408.ivf", "vp80-03-segmentation-1409.ivf",
- "vp80-03-segmentation-1410.ivf", "vp80-03-segmentation-1413.ivf",
- "vp80-03-segmentation-1414.ivf", "vp80-03-segmentation-1415.ivf",
- "vp80-03-segmentation-1425.ivf", "vp80-03-segmentation-1426.ivf",
- "vp80-03-segmentation-1427.ivf", "vp80-03-segmentation-1432.ivf",
- "vp80-03-segmentation-1435.ivf", "vp80-03-segmentation-1436.ivf",
- "vp80-03-segmentation-1437.ivf", "vp80-03-segmentation-1441.ivf",
- "vp80-03-segmentation-1442.ivf", "vp80-04-partitions-1404.ivf",
- "vp80-04-partitions-1405.ivf", "vp80-04-partitions-1406.ivf",
- "vp80-05-sharpness-1428.ivf", "vp80-05-sharpness-1429.ivf",
- "vp80-05-sharpness-1430.ivf", "vp80-05-sharpness-1431.ivf",
- "vp80-05-sharpness-1433.ivf", "vp80-05-sharpness-1434.ivf",
- "vp80-05-sharpness-1438.ivf", "vp80-05-sharpness-1439.ivf",
- "vp80-05-sharpness-1440.ivf", "vp80-05-sharpness-1443.ivf"
-};
-#endif
-#if CONFIG_VP9_DECODER
-const char *kVP9TestVectors[] = {
- "vp90-2-00-quantizer-00.webm", "vp90-2-00-quantizer-01.webm",
- "vp90-2-00-quantizer-02.webm", "vp90-2-00-quantizer-03.webm",
- "vp90-2-00-quantizer-04.webm", "vp90-2-00-quantizer-05.webm",
- "vp90-2-00-quantizer-06.webm", "vp90-2-00-quantizer-07.webm",
- "vp90-2-00-quantizer-08.webm", "vp90-2-00-quantizer-09.webm",
- "vp90-2-00-quantizer-10.webm", "vp90-2-00-quantizer-11.webm",
- "vp90-2-00-quantizer-12.webm", "vp90-2-00-quantizer-13.webm",
- "vp90-2-00-quantizer-14.webm", "vp90-2-00-quantizer-15.webm",
- "vp90-2-00-quantizer-16.webm", "vp90-2-00-quantizer-17.webm",
- "vp90-2-00-quantizer-18.webm", "vp90-2-00-quantizer-19.webm",
- "vp90-2-00-quantizer-20.webm", "vp90-2-00-quantizer-21.webm",
- "vp90-2-00-quantizer-22.webm", "vp90-2-00-quantizer-23.webm",
- "vp90-2-00-quantizer-24.webm", "vp90-2-00-quantizer-25.webm",
- "vp90-2-00-quantizer-26.webm", "vp90-2-00-quantizer-27.webm",
- "vp90-2-00-quantizer-28.webm", "vp90-2-00-quantizer-29.webm",
- "vp90-2-00-quantizer-30.webm", "vp90-2-00-quantizer-31.webm",
- "vp90-2-00-quantizer-32.webm", "vp90-2-00-quantizer-33.webm",
- "vp90-2-00-quantizer-34.webm", "vp90-2-00-quantizer-35.webm",
- "vp90-2-00-quantizer-36.webm", "vp90-2-00-quantizer-37.webm",
- "vp90-2-00-quantizer-38.webm", "vp90-2-00-quantizer-39.webm",
- "vp90-2-00-quantizer-40.webm", "vp90-2-00-quantizer-41.webm",
- "vp90-2-00-quantizer-42.webm", "vp90-2-00-quantizer-43.webm",
- "vp90-2-00-quantizer-44.webm", "vp90-2-00-quantizer-45.webm",
- "vp90-2-00-quantizer-46.webm", "vp90-2-00-quantizer-47.webm",
- "vp90-2-00-quantizer-48.webm", "vp90-2-00-quantizer-49.webm",
- "vp90-2-00-quantizer-50.webm", "vp90-2-00-quantizer-51.webm",
- "vp90-2-00-quantizer-52.webm", "vp90-2-00-quantizer-53.webm",
- "vp90-2-00-quantizer-54.webm", "vp90-2-00-quantizer-55.webm",
- "vp90-2-00-quantizer-56.webm", "vp90-2-00-quantizer-57.webm",
- "vp90-2-00-quantizer-58.webm", "vp90-2-00-quantizer-59.webm",
- "vp90-2-00-quantizer-60.webm", "vp90-2-00-quantizer-61.webm",
- "vp90-2-00-quantizer-62.webm", "vp90-2-00-quantizer-63.webm",
- "vp90-2-01-sharpness-1.webm", "vp90-2-01-sharpness-2.webm",
- "vp90-2-01-sharpness-3.webm", "vp90-2-01-sharpness-4.webm",
- "vp90-2-01-sharpness-5.webm", "vp90-2-01-sharpness-6.webm",
- "vp90-2-01-sharpness-7.webm", "vp90-2-02-size-08x08.webm",
- "vp90-2-02-size-08x10.webm", "vp90-2-02-size-08x16.webm",
- "vp90-2-02-size-08x18.webm", "vp90-2-02-size-08x32.webm",
- "vp90-2-02-size-08x34.webm", "vp90-2-02-size-08x64.webm",
- "vp90-2-02-size-08x66.webm", "vp90-2-02-size-10x08.webm",
- "vp90-2-02-size-10x10.webm", "vp90-2-02-size-10x16.webm",
- "vp90-2-02-size-10x18.webm", "vp90-2-02-size-10x32.webm",
- "vp90-2-02-size-10x34.webm", "vp90-2-02-size-10x64.webm",
- "vp90-2-02-size-10x66.webm", "vp90-2-02-size-16x08.webm",
- "vp90-2-02-size-16x10.webm", "vp90-2-02-size-16x16.webm",
- "vp90-2-02-size-16x18.webm", "vp90-2-02-size-16x32.webm",
- "vp90-2-02-size-16x34.webm", "vp90-2-02-size-16x64.webm",
- "vp90-2-02-size-16x66.webm", "vp90-2-02-size-18x08.webm",
- "vp90-2-02-size-18x10.webm", "vp90-2-02-size-18x16.webm",
- "vp90-2-02-size-18x18.webm", "vp90-2-02-size-18x32.webm",
- "vp90-2-02-size-18x34.webm", "vp90-2-02-size-18x64.webm",
- "vp90-2-02-size-18x66.webm", "vp90-2-02-size-32x08.webm",
- "vp90-2-02-size-32x10.webm", "vp90-2-02-size-32x16.webm",
- "vp90-2-02-size-32x18.webm", "vp90-2-02-size-32x32.webm",
- "vp90-2-02-size-32x34.webm", "vp90-2-02-size-32x64.webm",
- "vp90-2-02-size-32x66.webm", "vp90-2-02-size-34x08.webm",
- "vp90-2-02-size-34x10.webm", "vp90-2-02-size-34x16.webm",
- "vp90-2-02-size-34x18.webm", "vp90-2-02-size-34x32.webm",
- "vp90-2-02-size-34x34.webm", "vp90-2-02-size-34x64.webm",
- "vp90-2-02-size-34x66.webm", "vp90-2-02-size-64x08.webm",
- "vp90-2-02-size-64x10.webm", "vp90-2-02-size-64x16.webm",
- "vp90-2-02-size-64x18.webm", "vp90-2-02-size-64x32.webm",
- "vp90-2-02-size-64x34.webm", "vp90-2-02-size-64x64.webm",
- "vp90-2-02-size-64x66.webm", "vp90-2-02-size-66x08.webm",
- "vp90-2-02-size-66x10.webm", "vp90-2-02-size-66x16.webm",
- "vp90-2-02-size-66x18.webm", "vp90-2-02-size-66x32.webm",
- "vp90-2-02-size-66x34.webm", "vp90-2-02-size-66x64.webm",
- "vp90-2-02-size-66x66.webm", "vp90-2-03-size-196x196.webm",
- "vp90-2-03-size-196x198.webm", "vp90-2-03-size-196x200.webm",
- "vp90-2-03-size-196x202.webm", "vp90-2-03-size-196x208.webm",
- "vp90-2-03-size-196x210.webm", "vp90-2-03-size-196x224.webm",
- "vp90-2-03-size-196x226.webm", "vp90-2-03-size-198x196.webm",
- "vp90-2-03-size-198x198.webm", "vp90-2-03-size-198x200.webm",
- "vp90-2-03-size-198x202.webm", "vp90-2-03-size-198x208.webm",
- "vp90-2-03-size-198x210.webm", "vp90-2-03-size-198x224.webm",
- "vp90-2-03-size-198x226.webm", "vp90-2-03-size-200x196.webm",
- "vp90-2-03-size-200x198.webm", "vp90-2-03-size-200x200.webm",
- "vp90-2-03-size-200x202.webm", "vp90-2-03-size-200x208.webm",
- "vp90-2-03-size-200x210.webm", "vp90-2-03-size-200x224.webm",
- "vp90-2-03-size-200x226.webm", "vp90-2-03-size-202x196.webm",
- "vp90-2-03-size-202x198.webm", "vp90-2-03-size-202x200.webm",
- "vp90-2-03-size-202x202.webm", "vp90-2-03-size-202x208.webm",
- "vp90-2-03-size-202x210.webm", "vp90-2-03-size-202x224.webm",
- "vp90-2-03-size-202x226.webm", "vp90-2-03-size-208x196.webm",
- "vp90-2-03-size-208x198.webm", "vp90-2-03-size-208x200.webm",
- "vp90-2-03-size-208x202.webm", "vp90-2-03-size-208x208.webm",
- "vp90-2-03-size-208x210.webm", "vp90-2-03-size-208x224.webm",
- "vp90-2-03-size-208x226.webm", "vp90-2-03-size-210x196.webm",
- "vp90-2-03-size-210x198.webm", "vp90-2-03-size-210x200.webm",
- "vp90-2-03-size-210x202.webm", "vp90-2-03-size-210x208.webm",
- "vp90-2-03-size-210x210.webm", "vp90-2-03-size-210x224.webm",
- "vp90-2-03-size-210x226.webm", "vp90-2-03-size-224x196.webm",
- "vp90-2-03-size-224x198.webm", "vp90-2-03-size-224x200.webm",
- "vp90-2-03-size-224x202.webm", "vp90-2-03-size-224x208.webm",
- "vp90-2-03-size-224x210.webm", "vp90-2-03-size-224x224.webm",
- "vp90-2-03-size-224x226.webm", "vp90-2-03-size-226x196.webm",
- "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
- "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
- "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
- "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
- "vp90-2-05-resize.ivf", "vp90-2-06-bilinear.webm",
- "vp90-2-07-frame_parallel.webm",
- "vp90-2-08-tile_1x2_frame_parallel.webm", "vp90-2-08-tile_1x2.webm",
- "vp90-2-08-tile_1x4_frame_parallel.webm", "vp90-2-08-tile_1x4.webm",
-#if CONFIG_NON420
- "vp91-2-04-yv444.webm"
-#endif
-};
-#endif
class TestVectorTest : public ::libvpx_test::DecoderTest,
public ::libvpx_test::CodecTestWithParam<const char*> {
@@ -236,8 +91,8 @@ TEST_P(TestVectorTest, MD5Match) {
}
VP8_INSTANTIATE_TEST_CASE(TestVectorTest,
- ::testing::ValuesIn(kVP8TestVectors));
+ ::testing::ValuesIn(libvpx_test::kVP8TestVectors));
VP9_INSTANTIATE_TEST_CASE(TestVectorTest,
- ::testing::ValuesIn(kVP9TestVectors));
+ ::testing::ValuesIn(libvpx_test::kVP9TestVectors));
} // namespace
diff --git a/source/libvpx/test/test_vectors.cc b/source/libvpx/test/test_vectors.cc
new file mode 100644
index 0000000..7ffecf0
--- /dev/null
+++ b/source/libvpx/test/test_vectors.cc
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/test_vectors.h"
+
+namespace libvpx_test {
+
+#if CONFIG_VP8_DECODER
+const char *kVP8TestVectors[kNumVp8TestVectors] = {
+ "vp80-00-comprehensive-001.ivf",
+ "vp80-00-comprehensive-002.ivf", "vp80-00-comprehensive-003.ivf",
+ "vp80-00-comprehensive-004.ivf", "vp80-00-comprehensive-005.ivf",
+ "vp80-00-comprehensive-006.ivf", "vp80-00-comprehensive-007.ivf",
+ "vp80-00-comprehensive-008.ivf", "vp80-00-comprehensive-009.ivf",
+ "vp80-00-comprehensive-010.ivf", "vp80-00-comprehensive-011.ivf",
+ "vp80-00-comprehensive-012.ivf", "vp80-00-comprehensive-013.ivf",
+ "vp80-00-comprehensive-014.ivf", "vp80-00-comprehensive-015.ivf",
+ "vp80-00-comprehensive-016.ivf", "vp80-00-comprehensive-017.ivf",
+ "vp80-00-comprehensive-018.ivf", "vp80-01-intra-1400.ivf",
+ "vp80-01-intra-1411.ivf", "vp80-01-intra-1416.ivf",
+ "vp80-01-intra-1417.ivf", "vp80-02-inter-1402.ivf",
+ "vp80-02-inter-1412.ivf", "vp80-02-inter-1418.ivf",
+ "vp80-02-inter-1424.ivf", "vp80-03-segmentation-01.ivf",
+ "vp80-03-segmentation-02.ivf", "vp80-03-segmentation-03.ivf",
+ "vp80-03-segmentation-04.ivf", "vp80-03-segmentation-1401.ivf",
+ "vp80-03-segmentation-1403.ivf", "vp80-03-segmentation-1407.ivf",
+ "vp80-03-segmentation-1408.ivf", "vp80-03-segmentation-1409.ivf",
+ "vp80-03-segmentation-1410.ivf", "vp80-03-segmentation-1413.ivf",
+ "vp80-03-segmentation-1414.ivf", "vp80-03-segmentation-1415.ivf",
+ "vp80-03-segmentation-1425.ivf", "vp80-03-segmentation-1426.ivf",
+ "vp80-03-segmentation-1427.ivf", "vp80-03-segmentation-1432.ivf",
+ "vp80-03-segmentation-1435.ivf", "vp80-03-segmentation-1436.ivf",
+ "vp80-03-segmentation-1437.ivf", "vp80-03-segmentation-1441.ivf",
+ "vp80-03-segmentation-1442.ivf", "vp80-04-partitions-1404.ivf",
+ "vp80-04-partitions-1405.ivf", "vp80-04-partitions-1406.ivf",
+ "vp80-05-sharpness-1428.ivf", "vp80-05-sharpness-1429.ivf",
+ "vp80-05-sharpness-1430.ivf", "vp80-05-sharpness-1431.ivf",
+ "vp80-05-sharpness-1433.ivf", "vp80-05-sharpness-1434.ivf",
+ "vp80-05-sharpness-1438.ivf", "vp80-05-sharpness-1439.ivf",
+ "vp80-05-sharpness-1440.ivf", "vp80-05-sharpness-1443.ivf",
+ "vp80-06-smallsize.ivf"
+};
+#endif // CONFIG_VP8_DECODER
+#if CONFIG_VP9_DECODER
+const char *kVP9TestVectors[kNumVp9TestVectors] = {
+ "vp90-2-00-quantizer-00.webm", "vp90-2-00-quantizer-01.webm",
+ "vp90-2-00-quantizer-02.webm", "vp90-2-00-quantizer-03.webm",
+ "vp90-2-00-quantizer-04.webm", "vp90-2-00-quantizer-05.webm",
+ "vp90-2-00-quantizer-06.webm", "vp90-2-00-quantizer-07.webm",
+ "vp90-2-00-quantizer-08.webm", "vp90-2-00-quantizer-09.webm",
+ "vp90-2-00-quantizer-10.webm", "vp90-2-00-quantizer-11.webm",
+ "vp90-2-00-quantizer-12.webm", "vp90-2-00-quantizer-13.webm",
+ "vp90-2-00-quantizer-14.webm", "vp90-2-00-quantizer-15.webm",
+ "vp90-2-00-quantizer-16.webm", "vp90-2-00-quantizer-17.webm",
+ "vp90-2-00-quantizer-18.webm", "vp90-2-00-quantizer-19.webm",
+ "vp90-2-00-quantizer-20.webm", "vp90-2-00-quantizer-21.webm",
+ "vp90-2-00-quantizer-22.webm", "vp90-2-00-quantizer-23.webm",
+ "vp90-2-00-quantizer-24.webm", "vp90-2-00-quantizer-25.webm",
+ "vp90-2-00-quantizer-26.webm", "vp90-2-00-quantizer-27.webm",
+ "vp90-2-00-quantizer-28.webm", "vp90-2-00-quantizer-29.webm",
+ "vp90-2-00-quantizer-30.webm", "vp90-2-00-quantizer-31.webm",
+ "vp90-2-00-quantizer-32.webm", "vp90-2-00-quantizer-33.webm",
+ "vp90-2-00-quantizer-34.webm", "vp90-2-00-quantizer-35.webm",
+ "vp90-2-00-quantizer-36.webm", "vp90-2-00-quantizer-37.webm",
+ "vp90-2-00-quantizer-38.webm", "vp90-2-00-quantizer-39.webm",
+ "vp90-2-00-quantizer-40.webm", "vp90-2-00-quantizer-41.webm",
+ "vp90-2-00-quantizer-42.webm", "vp90-2-00-quantizer-43.webm",
+ "vp90-2-00-quantizer-44.webm", "vp90-2-00-quantizer-45.webm",
+ "vp90-2-00-quantizer-46.webm", "vp90-2-00-quantizer-47.webm",
+ "vp90-2-00-quantizer-48.webm", "vp90-2-00-quantizer-49.webm",
+ "vp90-2-00-quantizer-50.webm", "vp90-2-00-quantizer-51.webm",
+ "vp90-2-00-quantizer-52.webm", "vp90-2-00-quantizer-53.webm",
+ "vp90-2-00-quantizer-54.webm", "vp90-2-00-quantizer-55.webm",
+ "vp90-2-00-quantizer-56.webm", "vp90-2-00-quantizer-57.webm",
+ "vp90-2-00-quantizer-58.webm", "vp90-2-00-quantizer-59.webm",
+ "vp90-2-00-quantizer-60.webm", "vp90-2-00-quantizer-61.webm",
+ "vp90-2-00-quantizer-62.webm", "vp90-2-00-quantizer-63.webm",
+ "vp90-2-01-sharpness-1.webm", "vp90-2-01-sharpness-2.webm",
+ "vp90-2-01-sharpness-3.webm", "vp90-2-01-sharpness-4.webm",
+ "vp90-2-01-sharpness-5.webm", "vp90-2-01-sharpness-6.webm",
+ "vp90-2-01-sharpness-7.webm", "vp90-2-02-size-08x08.webm",
+ "vp90-2-02-size-08x10.webm", "vp90-2-02-size-08x16.webm",
+ "vp90-2-02-size-08x18.webm", "vp90-2-02-size-08x32.webm",
+ "vp90-2-02-size-08x34.webm", "vp90-2-02-size-08x64.webm",
+ "vp90-2-02-size-08x66.webm", "vp90-2-02-size-10x08.webm",
+ "vp90-2-02-size-10x10.webm", "vp90-2-02-size-10x16.webm",
+ "vp90-2-02-size-10x18.webm", "vp90-2-02-size-10x32.webm",
+ "vp90-2-02-size-10x34.webm", "vp90-2-02-size-10x64.webm",
+ "vp90-2-02-size-10x66.webm", "vp90-2-02-size-16x08.webm",
+ "vp90-2-02-size-16x10.webm", "vp90-2-02-size-16x16.webm",
+ "vp90-2-02-size-16x18.webm", "vp90-2-02-size-16x32.webm",
+ "vp90-2-02-size-16x34.webm", "vp90-2-02-size-16x64.webm",
+ "vp90-2-02-size-16x66.webm", "vp90-2-02-size-18x08.webm",
+ "vp90-2-02-size-18x10.webm", "vp90-2-02-size-18x16.webm",
+ "vp90-2-02-size-18x18.webm", "vp90-2-02-size-18x32.webm",
+ "vp90-2-02-size-18x34.webm", "vp90-2-02-size-18x64.webm",
+ "vp90-2-02-size-18x66.webm", "vp90-2-02-size-32x08.webm",
+ "vp90-2-02-size-32x10.webm", "vp90-2-02-size-32x16.webm",
+ "vp90-2-02-size-32x18.webm", "vp90-2-02-size-32x32.webm",
+ "vp90-2-02-size-32x34.webm", "vp90-2-02-size-32x64.webm",
+ "vp90-2-02-size-32x66.webm", "vp90-2-02-size-34x08.webm",
+ "vp90-2-02-size-34x10.webm", "vp90-2-02-size-34x16.webm",
+ "vp90-2-02-size-34x18.webm", "vp90-2-02-size-34x32.webm",
+ "vp90-2-02-size-34x34.webm", "vp90-2-02-size-34x64.webm",
+ "vp90-2-02-size-34x66.webm", "vp90-2-02-size-64x08.webm",
+ "vp90-2-02-size-64x10.webm", "vp90-2-02-size-64x16.webm",
+ "vp90-2-02-size-64x18.webm", "vp90-2-02-size-64x32.webm",
+ "vp90-2-02-size-64x34.webm", "vp90-2-02-size-64x64.webm",
+ "vp90-2-02-size-64x66.webm", "vp90-2-02-size-66x08.webm",
+ "vp90-2-02-size-66x10.webm", "vp90-2-02-size-66x16.webm",
+ "vp90-2-02-size-66x18.webm", "vp90-2-02-size-66x32.webm",
+ "vp90-2-02-size-66x34.webm", "vp90-2-02-size-66x64.webm",
+ "vp90-2-02-size-66x66.webm", "vp90-2-03-size-196x196.webm",
+ "vp90-2-03-size-196x198.webm", "vp90-2-03-size-196x200.webm",
+ "vp90-2-03-size-196x202.webm", "vp90-2-03-size-196x208.webm",
+ "vp90-2-03-size-196x210.webm", "vp90-2-03-size-196x224.webm",
+ "vp90-2-03-size-196x226.webm", "vp90-2-03-size-198x196.webm",
+ "vp90-2-03-size-198x198.webm", "vp90-2-03-size-198x200.webm",
+ "vp90-2-03-size-198x202.webm", "vp90-2-03-size-198x208.webm",
+ "vp90-2-03-size-198x210.webm", "vp90-2-03-size-198x224.webm",
+ "vp90-2-03-size-198x226.webm", "vp90-2-03-size-200x196.webm",
+ "vp90-2-03-size-200x198.webm", "vp90-2-03-size-200x200.webm",
+ "vp90-2-03-size-200x202.webm", "vp90-2-03-size-200x208.webm",
+ "vp90-2-03-size-200x210.webm", "vp90-2-03-size-200x224.webm",
+ "vp90-2-03-size-200x226.webm", "vp90-2-03-size-202x196.webm",
+ "vp90-2-03-size-202x198.webm", "vp90-2-03-size-202x200.webm",
+ "vp90-2-03-size-202x202.webm", "vp90-2-03-size-202x208.webm",
+ "vp90-2-03-size-202x210.webm", "vp90-2-03-size-202x224.webm",
+ "vp90-2-03-size-202x226.webm", "vp90-2-03-size-208x196.webm",
+ "vp90-2-03-size-208x198.webm", "vp90-2-03-size-208x200.webm",
+ "vp90-2-03-size-208x202.webm", "vp90-2-03-size-208x208.webm",
+ "vp90-2-03-size-208x210.webm", "vp90-2-03-size-208x224.webm",
+ "vp90-2-03-size-208x226.webm", "vp90-2-03-size-210x196.webm",
+ "vp90-2-03-size-210x198.webm", "vp90-2-03-size-210x200.webm",
+ "vp90-2-03-size-210x202.webm", "vp90-2-03-size-210x208.webm",
+ "vp90-2-03-size-210x210.webm", "vp90-2-03-size-210x224.webm",
+ "vp90-2-03-size-210x226.webm", "vp90-2-03-size-224x196.webm",
+ "vp90-2-03-size-224x198.webm", "vp90-2-03-size-224x200.webm",
+ "vp90-2-03-size-224x202.webm", "vp90-2-03-size-224x208.webm",
+ "vp90-2-03-size-224x210.webm", "vp90-2-03-size-224x224.webm",
+ "vp90-2-03-size-224x226.webm", "vp90-2-03-size-226x196.webm",
+ "vp90-2-03-size-226x198.webm", "vp90-2-03-size-226x200.webm",
+ "vp90-2-03-size-226x202.webm", "vp90-2-03-size-226x208.webm",
+ "vp90-2-03-size-226x210.webm", "vp90-2-03-size-226x224.webm",
+ "vp90-2-03-size-226x226.webm", "vp90-2-03-deltaq.webm",
+ "vp90-2-05-resize.ivf", "vp90-2-06-bilinear.webm",
+ "vp90-2-07-frame_parallel.webm",
+ "vp90-2-08-tile_1x2_frame_parallel.webm", "vp90-2-08-tile_1x2.webm",
+ "vp90-2-08-tile_1x4_frame_parallel.webm", "vp90-2-08-tile_1x4.webm",
+ "vp90-2-08-tile_1x8_frame_parallel.webm", "vp90-2-08-tile_1x8.webm",
+ "vp90-2-08-tile-4x4.webm", "vp90-2-08-tile-4x1.webm",
+ "vp90-2-09-subpixel-00.ivf",
+ "vp90-2-02-size-lf-1920x1080.webm",
+#if CONFIG_NON420
+ "vp91-2-04-yv444.webm"
+#endif
+};
+#endif // CONFIG_VP9_DECODER
+
+} // namespace libvpx_test
diff --git a/source/libvpx/test/test_vectors.h b/source/libvpx/test/test_vectors.h
new file mode 100644
index 0000000..942175a
--- /dev/null
+++ b/source/libvpx/test/test_vectors.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_TEST_VECTORS_H_
+#define TEST_TEST_VECTORS_H_
+
+#include "./vpx_config.h"
+
+namespace libvpx_test {
+
+#if CONFIG_VP8_DECODER
+const int kNumVp8TestVectors = 62;
+extern const char *kVP8TestVectors[kNumVp8TestVectors];
+#endif
+
+#if CONFIG_VP9_DECODER
+#if CONFIG_NON420
+const int kNumVp9TestVectors = 214;
+#else
+const int kNumVp9TestVectors = 213;
+#endif
+
+extern const char *kVP9TestVectors[kNumVp9TestVectors];
+#endif // CONFIG_VP9_DECODER
+
+} // namespace libvpx_test
+
+#endif // TEST_TEST_VECTORS_H_
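
test_vectors.h/.cc above replace the per-test string arrays with a single shared definition: the header declares extern arrays whose element counts (kNumVp8TestVectors, kNumVp9TestVectors) are fixed constants, and the .cc file provides the one definition every test links against, so ::testing::ValuesIn() still sees a complete array type. A stripped-down sketch of the pattern, with hypothetical names, which compiles as one translation unit when the two halves are concatenated:

// --- vectors.h (declaration only) ---
const int kNumVectors = 2;                 // array bound visible to every user
extern const char *kVectors[kNumVectors];  // complete type; defined exactly once elsewhere

// --- vectors.cc (the single definition) ---
const char *kVectors[kNumVectors] = {
  "stream-a.webm",
  "stream-b.ivf",
};
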
diff --git a/source/libvpx/test/tile_independence_test.cc b/source/libvpx/test/tile_independence_test.cc
index 403dbb6..863a366 100644
--- a/source/libvpx/test/tile_independence_test.cc
+++ b/source/libvpx/test/tile_independence_test.cc
@@ -1,11 +1,11 @@
/*
- Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-
- Use of this source code is governed by a BSD-style license
- that can be found in the LICENSE file in the root of the source
- tree. An additional intellectual property rights grant can be found
- in the file PATENTS. All contributing project authors may
- be found in the AUTHORS file in the root of the source tree.
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
*/
#include <cstdio>
diff --git a/source/libvpx/test/util.h b/source/libvpx/test/util.h
index 4d7f3d4..3c45721 100644
--- a/source/libvpx/test/util.h
+++ b/source/libvpx/test/util.h
@@ -17,7 +17,6 @@
#include "vpx/vpx_image.h"
// Macros
-#define PARAMS(...) ::testing::TestWithParam< std::tr1::tuple< __VA_ARGS__ > >
#define GET_PARAM(k) std::tr1::get< k >(GetParam())
static double compute_psnr(const vpx_image_t *img1,
diff --git a/source/libvpx/test/video_source.h b/source/libvpx/test/video_source.h
index 26d5328..3d01d39 100644
--- a/source/libvpx/test/video_source.h
+++ b/source/libvpx/test/video_source.h
@@ -18,16 +18,35 @@
namespace libvpx_test {
-static FILE *OpenTestDataFile(const std::string& file_name) {
- std::string path_to_source = file_name;
- const char *kDataPath = getenv("LIBVPX_TEST_DATA_PATH");
-
- if (kDataPath) {
- path_to_source = kDataPath;
- path_to_source += "/";
- path_to_source += file_name;
+// Helper macros to ensure LIBVPX_TEST_DATA_PATH is a quoted string.
+// They are #undef'd immediately below GetDataPath().
+// NOTE: LIBVPX_TEST_DATA_PATH must not already be a quoted string before
+// stringification, or GetDataPath() will fail at runtime.
+#define TO_STRING(S) #S
+#define STRINGIFY(S) TO_STRING(S)
+
+// Encapsulates cross-platform retrieval of the test data path.
+static std::string GetDataPath() {
+ const char *const data_path = getenv("LIBVPX_TEST_DATA_PATH");
+ if (data_path == NULL) {
+#ifdef LIBVPX_TEST_DATA_PATH
+    // In some environments we cannot set environment variables.
+    // Instead, the data path is supplied through a preprocessor symbol
+    // that can be set from the makefiles.
+ return STRINGIFY(LIBVPX_TEST_DATA_PATH);
+#else
+ return ".";
+#endif
}
+ return data_path;
+}
+// Undefine the stringification macros; they are not used elsewhere.
+#undef TO_STRING
+#undef STRINGIFY
+
+static FILE *OpenTestDataFile(const std::string& file_name) {
+ const std::string path_to_source = GetDataPath() + "/" + file_name;
return fopen(path_to_source.c_str(), "rb");
}
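
GetDataPath() above relies on the classic two-step stringification trick: TO_STRING() applies the # operator, and STRINGIFY() exists only to force its argument to be macro-expanded first, which is why LIBVPX_TEST_DATA_PATH must be passed on the compiler command line unquoted. A stand-alone illustration with a hypothetical DATA_DIR macro:

#include <iostream>

#define TO_STRING(S) #S
#define STRINGIFY(S) TO_STRING(S)

// Unquoted value, as a -D<symbol>=/some/path build flag would supply it.
#define DATA_DIR /tmp/libvpx-test-data

int main() {
  // TO_STRING(DATA_DIR) would yield the literal "DATA_DIR"; the extra level of
  // indirection expands the macro first, producing "/tmp/libvpx-test-data".
  std::cout << STRINGIFY(DATA_DIR) << "\n";
  return 0;
}
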
diff --git a/source/libvpx/test/vp8_fdct4x4_test.cc b/source/libvpx/test/vp8_fdct4x4_test.cc
index c823436..25465c5 100644
--- a/source/libvpx/test/vp8_fdct4x4_test.cc
+++ b/source/libvpx/test/vp8_fdct4x4_test.cc
@@ -1,13 +1,12 @@
/*
-* Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-*
-* Use of this source code is governed by a BSD-style license
-* that can be found in the LICENSE file in the root of the source
-* tree. An additional intellectual property rights grant can be found
-* in the file PATENTS. All contributing project authors may
-* be found in the AUTHORS file in the root of the source tree.
-*/
-
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
#include <math.h>
#include <stddef.h>
@@ -16,7 +15,6 @@
#include <string.h>
#include <sys/types.h>
-
extern "C" {
#include "./vp8_rtcd.h"
}
@@ -25,7 +23,6 @@ extern "C" {
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "vpx/vpx_integer.h"
-
namespace {
const int cospi8sqrt2minus1 = 20091;
diff --git a/source/libvpx/test/vp9_lossless_test.cc b/source/libvpx/test/vp9_lossless_test.cc
index 441cc44..03b89f8 100644
--- a/source/libvpx/test/vp9_lossless_test.cc
+++ b/source/libvpx/test/vp9_lossless_test.cc
@@ -1,12 +1,12 @@
/*
- Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-
- Use of this source code is governed by a BSD-style license
- that can be found in the LICENSE file in the root of the source
- tree. An additional intellectual property rights grant can be found
- in the file PATENTS. All contributing project authors may
- be found in the AUTHORS file in the root of the source tree.
-*/
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/codec_factory.h"
@@ -35,7 +35,7 @@ class LossLessTest : public ::libvpx_test::EncoderTest,
}
virtual void BeginPassHook(unsigned int /*pass*/) {
- psnr_ = 0.0;
+ psnr_ = kMaxPsnr;
nframes_ = 0;
}
@@ -65,9 +65,9 @@ TEST_P(LossLessTest, TestLossLessEncoding) {
init_flags_ = VPX_CODEC_USE_PSNR;
// intentionally changed the dimension for better testing coverage
- libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 356, 284,
- timebase.den, timebase.num, 0, 30);
-
+ libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
+ timebase.den, timebase.num, 0, 10);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
const double psnr_lossless = GetMinPsnr();
EXPECT_GE(psnr_lossless, kMaxPsnr);
}
diff --git a/source/libvpx/test/vp9_subtract_test.cc b/source/libvpx/test/vp9_subtract_test.cc
index 332a839..e4c4cfe 100644
--- a/source/libvpx/test/vp9_subtract_test.cc
+++ b/source/libvpx/test/vp9_subtract_test.cc
@@ -41,8 +41,8 @@ TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
// FIXME(rbultje) split in its own file
for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
- const int block_width = 4 << b_width_log2(bsize);
- const int block_height = 4 << b_height_log2(bsize);
+ const int block_width = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int block_height = 4 * num_4x4_blocks_high_lookup[bsize];
int16_t *diff = reinterpret_cast<int16_t *>(
vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2));
uint8_t *pred = reinterpret_cast<uint8_t *>(
diff --git a/source/libvpx/test/vp9_thread_test.cc b/source/libvpx/test/vp9_thread_test.cc
index a8ce6e4..a78cdea 100644
--- a/source/libvpx/test/vp9_thread_test.cc
+++ b/source/libvpx/test/vp9_thread_test.cc
@@ -141,10 +141,12 @@ TEST(VP9DecodeMTTest, MTDecode2) {
"68ede6abd66bae0a2edf2eb9232241b6" },
{ "vp90-2-08-tile_1x4_frame_parallel.webm",
"368ebc6ebf3a5e478d85b2c3149b2848" },
+ { "vp90-2-08-tile_1x8_frame_parallel.webm",
+ "17e439da2388aff3a0f69cb22579c6c1" },
};
for (int i = 0; i < static_cast<int>(sizeof(files) / sizeof(files[0])); ++i) {
- for (int t = 2; t <= 4; ++t) {
+ for (int t = 2; t <= 8; ++t) {
EXPECT_STREQ(files[i].expected_md5, DecodeFile(files[i].name, t).c_str())
<< "threads = " << t;
}
diff --git a/source/libvpx/test/webm_video_source.h b/source/libvpx/test/webm_video_source.h
index 9fc8545..53b0ba2 100644
--- a/source/libvpx/test/webm_video_source.h
+++ b/source/libvpx/test/webm_video_source.h
@@ -90,8 +90,12 @@ class WebMVideoSource : public CompressedVideoSource {
virtual ~WebMVideoSource() {
if (input_file_)
fclose(input_file_);
- if (nestegg_ctx_)
+ if (nestegg_ctx_ != NULL) {
+ if (pkt_ != NULL) {
+ nestegg_free_packet(pkt_);
+ }
nestegg_destroy(nestegg_ctx_);
+ }
}
virtual void Init() {
@@ -136,8 +140,10 @@ class WebMVideoSource : public CompressedVideoSource {
do {
/* End of this packet, get another. */
- if (pkt_)
+ if (pkt_ != NULL) {
nestegg_free_packet(pkt_);
+ pkt_ = NULL;
+ }
int again = nestegg_read_packet(nestegg_ctx_, &pkt_);
ASSERT_GE(again, 0) << "nestegg_read_packet failed";
diff --git a/source/libvpx/tools_common.c b/source/libvpx/tools_common.c
index 92de794..9c24983 100644
--- a/source/libvpx/tools_common.c
+++ b/source/libvpx/tools_common.c
@@ -7,8 +7,14 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <stdio.h>
+
#include "tools_common.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
#if defined(_WIN32) || defined(__OS2__)
#include <io.h>
#include <fcntl.h>
@@ -20,6 +26,18 @@
#endif
#endif
+#define LOG_ERROR(label) do {\
+ const char *l = label;\
+ va_list ap;\
+ va_start(ap, fmt);\
+ if (l)\
+ fprintf(stderr, "%s: ", l);\
+ vfprintf(stderr, fmt, ap);\
+ fprintf(stderr, "\n");\
+ va_end(ap);\
+} while (0)
+
+
FILE *set_binary_mode(FILE *stream) {
(void)stream;
#if defined(_WIN32) || defined(__OS2__)
@@ -27,3 +45,88 @@ FILE *set_binary_mode(FILE *stream) {
#endif
return stream;
}
+
+void die(const char *fmt, ...) {
+ LOG_ERROR(NULL);
+ usage_exit();
+}
+
+void fatal(const char *fmt, ...) {
+ LOG_ERROR("Fatal");
+ exit(EXIT_FAILURE);
+}
+
+void warn(const char *fmt, ...) {
+ LOG_ERROR("Warning");
+}
+
+uint16_t mem_get_le16(const void *data) {
+ uint16_t val;
+ const uint8_t *mem = (const uint8_t*)data;
+
+ val = mem[1] << 8;
+ val |= mem[0];
+ return val;
+}
+
+uint32_t mem_get_le32(const void *data) {
+ uint32_t val;
+ const uint8_t *mem = (const uint8_t*)data;
+
+ val = mem[3] << 24;
+ val |= mem[2] << 16;
+ val |= mem[1] << 8;
+ val |= mem[0];
+ return val;
+}
+
+int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
+ FILE *f = input_ctx->file;
+ struct FileTypeDetectionBuffer *detect = &input_ctx->detect;
+ int plane = 0;
+ int shortread = 0;
+
+ for (plane = 0; plane < 3; ++plane) {
+ uint8_t *ptr;
+ const int w = (plane ? (1 + yuv_frame->d_w) / 2 : yuv_frame->d_w);
+ const int h = (plane ? (1 + yuv_frame->d_h) / 2 : yuv_frame->d_h);
+ int r;
+
+ /* Determine the correct plane based on the image format. The for-loop
+ * always counts in Y,U,V order, but this may not match the order of
+ * the data on disk.
+ */
+ switch (plane) {
+ case 1:
+ ptr = yuv_frame->planes[
+ yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V : VPX_PLANE_U];
+ break;
+ case 2:
+ ptr = yuv_frame->planes[
+ yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_U : VPX_PLANE_V];
+ break;
+ default:
+ ptr = yuv_frame->planes[plane];
+ }
+
+ for (r = 0; r < h; ++r) {
+ size_t needed = w;
+ size_t buf_position = 0;
+ const size_t left = detect->buf_read - detect->position;
+ if (left > 0) {
+ const size_t more = (left < needed) ? left : needed;
+ memcpy(ptr, detect->buf + detect->position, more);
+ buf_position = more;
+ needed -= more;
+ detect->position += more;
+ }
+ if (needed > 0) {
+ shortread |= (fread(ptr + buf_position, 1, needed, f) < needed);
+ }
+
+ ptr += yuv_frame->stride[plane];
+ }
+ }
+
+ return shortread;
+}
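
As a usage note, this is a minimal sketch of how a tool is expected to call the error helpers added above. The messages and file handling are illustrative only, and the program assumes it is linked against tools_common.c:

#include <stdio.h>
#include <stdlib.h>

#include "tools_common.h"

/* Every tool that includes tools_common.h must provide usage_exit(). */
void usage_exit() {
  fprintf(stderr, "Usage: example_tool <infile>\n");
  exit(EXIT_FAILURE);
}

int main(int argc, char **argv) {
  FILE *infile;
  if (argc < 2)
    die("no input file given");            /* prints message, then usage_exit() */
  infile = fopen(argv[1], "rb");
  if (infile == NULL)
    fatal("failed to open %s", argv[1]);   /* prints "Fatal: ...", then exits   */
  warn("input is assumed to be raw I420"); /* prints "Warning: ...", continues  */
  fclose(infile);
  return 0;
}
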
diff --git a/source/libvpx/tools_common.h b/source/libvpx/tools_common.h
index 9e56149..7500523 100644
--- a/source/libvpx/tools_common.h
+++ b/source/libvpx/tools_common.h
@@ -7,10 +7,123 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef TOOLS_COMMON_H
-#define TOOLS_COMMON_H
+#ifndef TOOLS_COMMON_H_
+#define TOOLS_COMMON_H_
+
+#include <stdio.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_image.h"
+#include "vpx/vpx_integer.h"
+
+#if CONFIG_ENCODERS
+#include "./y4minput.h"
+#endif
+
+#if defined(_MSC_VER)
+/* MSVS doesn't define off_t, and uses _f{seek,tell}i64. */
+typedef __int64 off_t;
+#define fseeko _fseeki64
+#define ftello _ftelli64
+#elif defined(_WIN32)
+/* MinGW defines off_t as long and uses f{seek,tell}o64/off64_t for large
+ * files. */
+#define fseeko fseeko64
+#define ftello ftello64
+#define off_t off64_t
+#endif /* _WIN32 */
+
+#if CONFIG_OS_SUPPORT
+#if defined(_MSC_VER)
+#include <io.h> /* NOLINT */
+#define snprintf _snprintf
+#define isatty _isatty
+#define fileno _fileno
+#else
+#include <unistd.h> /* NOLINT */
+#endif /* _MSC_VER */
+#endif /* CONFIG_OS_SUPPORT */
+
+/* Use 32-bit file operations in WebM file format when building ARM
+ * executables (.axf) with RVCT. */
+#if !CONFIG_OS_SUPPORT
+typedef long off_t; /* NOLINT */
+#define fseeko fseek
+#define ftello ftell
+#endif /* CONFIG_OS_SUPPORT */
+
+#define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo)
+
+#ifndef PATH_MAX
+#define PATH_MAX 512
+#endif
+
+#define IVF_FRAME_HDR_SZ (4 + 8) /* 4 byte size + 8 byte timestamp */
+#define IVF_FILE_HDR_SZ 32
+
+#define RAW_FRAME_HDR_SZ sizeof(uint32_t)
+
+#define VP8_FOURCC (0x30385056)
+#define VP9_FOURCC (0x30395056)
+#define VP8_FOURCC_MASK (0x00385056)
+#define VP9_FOURCC_MASK (0x00395056)
+
+enum VideoFileType {
+ FILE_TYPE_RAW,
+ FILE_TYPE_IVF,
+ FILE_TYPE_Y4M,
+ FILE_TYPE_WEBM
+};
+
+struct FileTypeDetectionBuffer {
+ char buf[4];
+ size_t buf_read;
+ size_t position;
+};
+
+struct VpxRational {
+ int numerator;
+ int denominator;
+};
+
+struct VpxInputContext {
+ const char *filename;
+ FILE *file;
+ off_t length;
+ struct FileTypeDetectionBuffer detect;
+ enum VideoFileType file_type;
+ uint32_t width;
+ uint32_t height;
+ int use_i420;
+ int only_i420;
+ uint32_t fourcc;
+ struct VpxRational framerate;
+#if CONFIG_ENCODERS
+ y4m_input y4m;
+#endif
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
/* Sets a stdio stream into binary mode */
FILE *set_binary_mode(FILE *stream);
+void die(const char *fmt, ...);
+void fatal(const char *fmt, ...);
+void warn(const char *fmt, ...);
+
+/* The tool including this file must define usage_exit() */
+void usage_exit();
+
+uint16_t mem_get_le16(const void *data);
+uint32_t mem_get_le32(const void *data);
+
+int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
+
+#ifdef __cplusplus
+} /* extern "C" */
#endif
+
+#endif // TOOLS_COMMON_H_
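
FileTypeDetectionBuffer above holds the handful of bytes that were peeked at to sniff the container type; read_yuv_frame() in tools_common.c drains those bytes before falling back to fread(). A standalone sketch of that drain-then-read pattern (PeekBuffer and buffered_read are illustrative names, not part of the header):

#include <stdio.h>
#include <string.h>

struct PeekBuffer {
  char buf[4];       /* bytes consumed while detecting the file type */
  size_t buf_read;   /* how many of them are valid                   */
  size_t position;   /* how many have already been handed back       */
};

/* Fill dst with `needed` bytes, using leftover peeked bytes first. */
static size_t buffered_read(struct PeekBuffer *pb, FILE *f,
                            unsigned char *dst, size_t needed) {
  size_t got = 0;
  const size_t left = pb->buf_read - pb->position;
  if (left > 0) {
    const size_t more = (left < needed) ? left : needed;
    memcpy(dst, pb->buf + pb->position, more);
    pb->position += more;
    got = more;
    needed -= more;
  }
  if (needed > 0)
    got += fread(dst + got, 1, needed, f);
  return got;
}

This mirrors the inner loop of read_yuv_frame(), which applies the same logic once per row of each plane.
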
diff --git a/source/libvpx/vp8/common/postproc.c b/source/libvpx/vp8/common/postproc.c
index dd998f1..e3bee32 100644
--- a/source/libvpx/vp8/common/postproc.c
+++ b/source/libvpx/vp8/common/postproc.c
@@ -71,11 +71,6 @@ static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] =
};
#endif
-static const short kernel5[] =
-{
- 1, 1, 4, 1, 1
-};
-
const short vp8_rv[] =
{
8, 5, 2, 2, 8, 12, 4, 9, 8, 3,
diff --git a/source/libvpx/vp8/common/setupintrarecon.h b/source/libvpx/vp8/common/setupintrarecon.h
index e515c3a..8b6c50b 100644
--- a/source/libvpx/vp8/common/setupintrarecon.h
+++ b/source/libvpx/vp8/common/setupintrarecon.h
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#ifndef SETUPINTRARECON_H
+#define SETUPINTRARECON_H
#include "vpx_scale/yv12config.h"
extern void vp8_setup_intra_recon(YV12_BUFFER_CONFIG *ybf);
@@ -31,3 +33,5 @@ void setup_intra_recon_left(unsigned char *y_buffer,
for (i = 0; i < 8; i++)
v_buffer[uv_stride *i] = (unsigned char) 129;
}
+
+#endif
diff --git a/source/libvpx/vp8/decoder/decodframe.c b/source/libvpx/vp8/decoder/decodeframe.c
index 16da78a..bfde599 100644
--- a/source/libvpx/vp8/decoder/decodframe.c
+++ b/source/libvpx/vp8/decoder/decodeframe.c
@@ -680,7 +680,6 @@ static void decode_mb_rows(VP8D_COMP *pbi)
vp8_loop_filter_row_simple(pc, lf_mic, mb_row-1,
recon_y_stride, recon_uv_stride,
lf_dst[0], lf_dst[1], lf_dst[2]);
-
if(mb_row > 1)
{
yv12_extend_frame_left_right_c(yv12_fb_new,
@@ -691,10 +690,6 @@ static void decode_mb_rows(VP8D_COMP *pbi)
eb_dst[0] += recon_y_stride * 16;
eb_dst[1] += recon_uv_stride * 8;
eb_dst[2] += recon_uv_stride * 8;
-
- if(mb_row == 2)
- yv12_extend_frame_top_c(yv12_fb_new);
-
}
lf_dst[0] += recon_y_stride * 16;
@@ -713,13 +708,9 @@ static void decode_mb_rows(VP8D_COMP *pbi)
eb_dst[0],
eb_dst[1],
eb_dst[2]);
-
eb_dst[0] += recon_y_stride * 16;
eb_dst[1] += recon_uv_stride * 8;
eb_dst[2] += recon_uv_stride * 8;
-
- if(mb_row == 1)
- yv12_extend_frame_top_c(yv12_fb_new);
}
}
}
@@ -747,7 +738,7 @@ static void decode_mb_rows(VP8D_COMP *pbi)
eb_dst[0],
eb_dst[1],
eb_dst[2]);
-
+ yv12_extend_frame_top_c(yv12_fb_new);
yv12_extend_frame_bottom_c(yv12_fb_new);
}
diff --git a/source/libvpx/vp8/encoder/onyx_if.c b/source/libvpx/vp8/encoder/onyx_if.c
index 7c07975..de57c32 100644
--- a/source/libvpx/vp8/encoder/onyx_if.c
+++ b/source/libvpx/vp8/encoder/onyx_if.c
@@ -3574,7 +3574,8 @@ static void encode_frame_to_data_rate
for (i=cpi->current_layer+1; i<cpi->oxcf.number_of_layers; i++)
{
LAYER_CONTEXT *lc = &cpi->layer_context[i];
- lc->bits_off_target += cpi->av_per_frame_bandwidth;
+ lc->bits_off_target += (int)(lc->target_bandwidth /
+ lc->framerate);
if (lc->bits_off_target > lc->maximum_buffer_size)
lc->bits_off_target = lc->maximum_buffer_size;
lc->buffer_level = lc->bits_off_target;
@@ -3807,7 +3808,7 @@ static void encode_frame_to_data_rate
/* Setup background Q adjustment for error resilient mode.
* For multi-layer encodes only enable this for the base layer.
- */
+ */
if (cpi->cyclic_refresh_mode_enabled)
{
if (cpi->current_layer==0)
@@ -4620,45 +4621,43 @@ static void encode_frame_to_data_rate
vp8_clear_system_state();
if (cpi->twopass.total_left_stats.coded_error != 0.0)
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
- "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
- "%10.3f %8d\n",
+ fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64
+ "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d "
+ "%8.2lf %"PRId64" %10.3lf %10"PRId64" %8d\n",
cpi->common.current_video_frame, cpi->this_frame_target,
cpi->projected_frame_size,
(cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
+ cpi->total_target_vs_actual,
cpi->buffer_level,
(cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
- (int)cpi->total_actual_bits, cm->base_qindex,
+ cpi->total_actual_bits, cm->base_qindex,
cpi->active_best_quality, cpi->active_worst_quality,
cpi->ni_av_qi, cpi->cq_target_quality,
- cpi->zbin_over_quant,
cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
cm->frame_type, cpi->gfu_boost,
cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
+ cpi->twopass.bits_left,
cpi->twopass.total_left_stats.coded_error,
(double)cpi->twopass.bits_left /
cpi->twopass.total_left_stats.coded_error,
cpi->tot_recode_hits);
else
- fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %6d %6d"
- "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f"
- "%8d\n",
- cpi->common.current_video_frame,
- cpi->this_frame_target, cpi->projected_frame_size,
+ fprintf(f, "%10d %10d %10d %10d %10d %10"PRId64" %10"PRId64
+ "%10"PRId64" %10d %6d %6d %6d %6d %5d %5d %5d %8d "
+ "%8.2lf %"PRId64" %10.3lf %8d\n",
+ cpi->common.current_video_frame, cpi->this_frame_target,
+ cpi->projected_frame_size,
(cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
+ cpi->total_target_vs_actual,
cpi->buffer_level,
(cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
- (int)cpi->total_actual_bits, cm->base_qindex,
+ cpi->total_actual_bits, cm->base_qindex,
cpi->active_best_quality, cpi->active_worst_quality,
cpi->ni_av_qi, cpi->cq_target_quality,
- cpi->zbin_over_quant,
cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
cm->frame_type, cpi->gfu_boost,
cpi->twopass.est_max_qcorrection_factor,
- (int)cpi->twopass.bits_left,
+ cpi->twopass.bits_left,
cpi->twopass.total_left_stats.coded_error,
cpi->tot_recode_hits);
@@ -4666,7 +4665,6 @@ static void encode_frame_to_data_rate
{
FILE *fmodes = fopen("Modes.stt", "a");
- int i;
fprintf(fmodes, "%6d:%1d:%1d:%1d ",
cpi->common.current_video_frame,
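
The format-string changes above switch the stats logging from casting 64-bit counters down to int to printing them directly with the <inttypes.h> macros. A minimal reminder of the idiom (the values are made up):

#include <inttypes.h>
#include <stdio.h>

int main(void) {
  const int64_t total_actual_bits = 1234567890123LL;
  const double coded_error = 42.5;
  /* PRId64 expands to the correct conversion specifier for int64_t on the
   * target platform, so no narrowing cast is needed. */
  printf("bits=%" PRId64 " err=%.3lf\n", total_actual_bits, coded_error);
  return 0;
}
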
diff --git a/source/libvpx/vp8/encoder/ratectrl.c b/source/libvpx/vp8/encoder/ratectrl.c
index 1e8259c..c51650c 100644
--- a/source/libvpx/vp8/encoder/ratectrl.c
+++ b/source/libvpx/vp8/encoder/ratectrl.c
@@ -174,14 +174,6 @@ static const int kf_gf_boost_qlimits[QINDEX_RANGE] =
600, 600, 600, 600, 600, 600, 600, 600,
};
-/* % adjustment to target kf size based on separation from previous frame */
-static const int kf_boost_seperation_adjustment[16] =
-{
- 30, 40, 50, 55, 60, 65, 70, 75,
- 80, 85, 90, 95, 100, 100, 100, 100,
-};
-
-
static const int gf_adjust_table[101] =
{
100,
@@ -956,6 +948,21 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
cpi->bits_off_target = (int)cpi->oxcf.maximum_buffer_size;
cpi->buffer_level = cpi->bits_off_target;
+
+ if (cpi->oxcf.number_of_layers > 1) {
+ unsigned int i;
+
+ // Propagate bits saved by dropping the frame to higher layers.
+ for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers;
+ i++) {
+ LAYER_CONTEXT *lc = &cpi->layer_context[i];
+ lc->bits_off_target += (int)(lc->target_bandwidth /
+ lc->framerate);
+ if (lc->bits_off_target > lc->maximum_buffer_size)
+ lc->bits_off_target = lc->maximum_buffer_size;
+ lc->buffer_level = lc->bits_off_target;
+ }
+ }
}
}
@@ -1223,7 +1230,6 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
{
Q = cpi->oxcf.gold_q;
}
-
}
else
{
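
The calc_pframe_target_size() hunk above credits every higher temporal layer with one frame's worth of bits when a frame is dropped, then clamps to the layer's buffer size. Sketched in isolation (the struct fields loosely mirror LAYER_CONTEXT and are illustrative):

#include <stdint.h>

struct LayerSketch {
  int64_t bits_off_target;      /* leaky-bucket fullness            */
  int64_t maximum_buffer_size;
  int64_t buffer_level;
  int target_bandwidth;         /* bits per second for this layer   */
  double framerate;             /* frames per second for this layer */
};

static void credit_dropped_frame(struct LayerSketch *layers,
                                 int current_layer, int number_of_layers) {
  int i;
  for (i = current_layer + 1; i < number_of_layers; ++i) {
    struct LayerSketch *lc = &layers[i];
    /* One frame's worth of bits flows into the bucket...              */
    lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate);
    /* ...but the bucket never holds more than its configured maximum. */
    if (lc->bits_off_target > lc->maximum_buffer_size)
      lc->bits_off_target = lc->maximum_buffer_size;
    lc->buffer_level = lc->bits_off_target;
  }
}
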
diff --git a/source/libvpx/vp8/vp8_dx_iface.c b/source/libvpx/vp8/vp8_dx_iface.c
index 871b8d3..0b4c4cb 100644
--- a/source/libvpx/vp8/vp8_dx_iface.c
+++ b/source/libvpx/vp8/vp8_dx_iface.c
@@ -929,6 +929,7 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) =
vp8_get_si, /* vpx_codec_get_si_fn_t get_si; */
vp8_decode, /* vpx_codec_decode_fn_t decode; */
vp8_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
+ NOT_IMPLEMENTED,
},
{ /* encoder functions */
NOT_IMPLEMENTED,
diff --git a/source/libvpx/vp8/vp8dx.mk b/source/libvpx/vp8/vp8dx.mk
index 4a8f467..892ed70 100644
--- a/source/libvpx/vp8/vp8dx.mk
+++ b/source/libvpx/vp8/vp8dx.mk
@@ -22,7 +22,7 @@ VP8_DX_SRCS-yes += vp8_dx_iface.c
VP8_DX_SRCS-yes += decoder/dboolhuff.c
VP8_DX_SRCS-yes += decoder/decodemv.c
-VP8_DX_SRCS-yes += decoder/decodframe.c
+VP8_DX_SRCS-yes += decoder/decodeframe.c
VP8_DX_SRCS-yes += decoder/detokenize.c
VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/ec_types.h
VP8_DX_SRCS-$(CONFIG_ERROR_CONCEALMENT) += decoder/error_concealment.h
diff --git a/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm b/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
new file mode 100644
index 0000000..751bc74
--- /dev/null
+++ b/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.asm
@@ -0,0 +1,199 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_loop_filter_horizontal_edge_16_neon|
+ ARM
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;void vp9_loop_filter_horizontal_edge_16_neon(uint8_t *s, int p,
+; const uint8_t *blimit0,
+; const uint8_t *limit0,
+; const uint8_t *thresh0,
+; const uint8_t *blimit1,
+; const uint8_t *limit1,
+; const uint8_t *thresh1)
+; r0 uint8_t *s,
+; r1 int p,
+; r2 const uint8_t *blimit0,
+; r3 const uint8_t *limit0,
+; sp const uint8_t *thresh0,
+; sp+4 const uint8_t *blimit1,
+; sp+8 const uint8_t *limit1,
+; sp+12 const uint8_t *thresh1,
+
+|vp9_loop_filter_horizontal_edge_16_neon| PROC
+ push {lr}
+
+ ldr r12, [sp, #4] ; load thresh0
+ vld1.8 {d0}, [r2] ; load blimit0 to first half q
+ vld1.8 {d2}, [r3] ; load limit0 to first half q
+
+ add r1, r1, r1 ; double pitch
+ ldr r2, [sp, #8] ; load blimit1
+
+ vld1.8 {d4}, [r12] ; load thresh0 to first half q
+
+ ldr r3, [sp, #12] ; load limit1
+ ldr r12, [sp, #16] ; load thresh1
+ vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q
+
+ sub r2, r0, r1, lsl #1 ; s[-4 * p]
+
+ vld1.8 {d3}, [r3] ; load limit1 to 2nd half q
+ vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q
+
+ vpush {d8-d15} ; save neon registers
+
+ add r3, r2, r1, lsr #1 ; s[-3 * p]
+
+ vld1.u8 {q3}, [r2@64], r1 ; p3
+ vld1.u8 {q4}, [r3@64], r1 ; p2
+ vld1.u8 {q5}, [r2@64], r1 ; p1
+ vld1.u8 {q6}, [r3@64], r1 ; p0
+ vld1.u8 {q7}, [r2@64], r1 ; q0
+ vld1.u8 {q8}, [r3@64], r1 ; q1
+ vld1.u8 {q9}, [r2@64] ; q2
+ vld1.u8 {q10}, [r3@64] ; q3
+
+ sub r2, r2, r1, lsl #1
+ sub r3, r3, r1, lsl #1
+
+ bl vp9_loop_filter_neon_16
+
+ vst1.u8 {q5}, [r2@64], r1 ; store op1
+ vst1.u8 {q6}, [r3@64], r1 ; store op0
+ vst1.u8 {q7}, [r2@64], r1 ; store oq0
+ vst1.u8 {q8}, [r3@64], r1 ; store oq1
+
+ vpop {d8-d15} ; restore neon registers
+
+ pop {pc}
+ ENDP ; |vp9_loop_filter_horizontal_edge_16_neon|
+
+; void vp9_loop_filter_neon_16();
+; This is a helper function for the loopfilters. The individual functions do the
+; necessary load, transpose (if necessary) and store. This function uses
+; registers d8-d15, so the calling function must save those registers.
+;
+; r0-r3, r12 PRESERVE
+; q0 blimit
+; q1 limit
+; q2 thresh
+; q3 p3
+; q4 p2
+; q5 p1
+; q6 p0
+; q7 q0
+; q8 q1
+; q9 q2
+; q10 q3
+;
+; Outputs:
+; q5 op1
+; q6 op0
+; q7 oq0
+; q8 oq1
+|vp9_loop_filter_neon_16| PROC
+
+ ; filter_mask
+ vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2)
+ vabd.u8 q12, q4, q5 ; m2 = abs(p2 - p1)
+ vabd.u8 q13, q5, q6 ; m3 = abs(p1 - p0)
+ vabd.u8 q14, q8, q7 ; m4 = abs(q1 - q0)
+ vabd.u8 q3, q9, q8 ; m5 = abs(q2 - q1)
+ vabd.u8 q4, q10, q9 ; m6 = abs(q3 - q2)
+
+ ; only compare the largest value to limit
+ vmax.u8 q11, q11, q12 ; m7 = max(m1, m2)
+ vmax.u8 q12, q13, q14 ; m8 = max(m3, m4)
+
+ vabd.u8 q9, q6, q7 ; abs(p0 - q0)
+
+ vmax.u8 q3, q3, q4 ; m9 = max(m5, m6)
+
+ vmov.u8 q10, #0x80
+
+ vmax.u8 q15, q11, q12 ; m10 = max(m7, m8)
+
+ vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1
+ vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1
+ vmax.u8 q15, q15, q3 ; m11 = max(m10, m9)
+
+ vabd.u8 q2, q5, q8 ; a = abs(p1 - q1)
+ vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2
+
+ veor q7, q7, q10 ; qs0
+
+ vcge.u8 q15, q1, q15 ; abs(m11) > limit
+
+ vshr.u8 q2, q2, #1 ; a = a / 2
+ veor q6, q6, q10 ; ps0
+
+ veor q5, q5, q10 ; ps1
+ vqadd.u8 q9, q9, q2 ; a = b + a
+
+ veor q8, q8, q10 ; qs1
+
+ vmov.u16 q4, #3
+
+ vsubl.s8 q2, d14, d12 ; ( qs0 - ps0)
+ vsubl.s8 q11, d15, d13
+
+ vcge.u8 q9, q0, q9 ; a > blimit
+
+ vqsub.s8 q1, q5, q8 ; filter = clamp(ps1-qs1)
+ vorr q14, q13, q14 ; hev
+
+ vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0)
+ vmul.i16 q11, q11, q4
+
+ vand q1, q1, q14 ; filter &= hev
+ vand q15, q15, q9 ; mask
+
+ vmov.u8 q4, #3
+
+ vaddw.s8 q2, q2, d2 ; filter + 3 * (qs0 - ps0)
+ vaddw.s8 q11, q11, d3
+
+ vmov.u8 q9, #4
+
+ ; filter = clamp(filter + 3 * ( qs0 - ps0))
+ vqmovn.s16 d2, q2
+ vqmovn.s16 d3, q11
+ vand q1, q1, q15 ; filter &= mask
+
+ vqadd.s8 q2, q1, q4 ; filter2 = clamp(filter+3)
+ vqadd.s8 q1, q1, q9 ; filter1 = clamp(filter+4)
+ vshr.s8 q2, q2, #3 ; filter2 >>= 3
+ vshr.s8 q1, q1, #3 ; filter1 >>= 3
+
+
+ vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + filter2)
+ vqsub.s8 q0, q7, q1 ; u = clamp(qs0 - filter1)
+
+ ; outer tap adjustments
+ vrshr.s8 q1, q1, #1 ; filter = ++filter1 >> 1
+
+ veor q7, q0, q10 ; *oq0 = u^0x80
+
+ vbic q1, q1, q14 ; filter &= ~hev
+
+ vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + filter)
+ vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - filter)
+
+ veor q6, q11, q10 ; *op0 = u^0x80
+ veor q5, q13, q10 ; *op1 = u^0x80
+ veor q8, q12, q10 ; *oq1 = u^0x80
+
+ bx lr
+ ENDP ; |vp9_loop_filter_neon_16|
+
+ END
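
The comments in the helper above spell out the 4-tap filter math step by step; a scalar transcription of those same steps may make the vector code easier to follow. Here mask and hev are assumed to be 0 or -1 (all bits set), matching the filter_mask/hev values the NEON code derives from blimit, limit and thresh; clamp8 is an illustrative helper, not a library function:

#include <stdint.h>

static int8_t clamp8(int v) {
  return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

static void filter4_sketch(uint8_t *p1, uint8_t *p0, uint8_t *q0, uint8_t *q1,
                           int mask, int hev) {
  /* x ^ 0x80 maps the unsigned pixel range [0,255] onto [-128,127]. */
  const int8_t ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
  const int8_t qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);
  int8_t filter, filter1, filter2;

  filter = clamp8(ps1 - qs1) & (int8_t)hev;                /* filter &= hev */
  filter = clamp8(filter + 3 * (qs0 - ps0)) & (int8_t)mask;

  filter1 = clamp8(filter + 4) >> 3;                       /* inner taps    */
  filter2 = clamp8(filter + 3) >> 3;
  *q0 = (uint8_t)(clamp8(qs0 - filter1) ^ 0x80);
  *p0 = (uint8_t)(clamp8(ps0 + filter2) ^ 0x80);

  filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);          /* outer taps    */
  *q1 = (uint8_t)(clamp8(qs1 - filter) ^ 0x80);
  *p1 = (uint8_t)(clamp8(ps1 + filter) ^ 0x80);
}
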
diff --git a/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
new file mode 100644
index 0000000..b97e7aa
--- /dev/null
+++ b/source/libvpx/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+
+void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
+ vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_loop_filter_vertical_edge_16_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_loop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_loop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_neon(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_mbloop_filter_vertical_edge_neon(s, p, blimit0, limit0, thresh0, 1);
+ vp9_mbloop_filter_vertical_edge_neon(s + 8 * p, p, blimit1, limit1, thresh1,
+ 1);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_neon(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ vp9_mb_lpf_vertical_edge_w_neon(s, p, blimit, limit, thresh);
+ vp9_mb_lpf_vertical_edge_w_neon(s + 8 * p, p, blimit, limit, thresh);
+}
diff --git a/source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm b/source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm
new file mode 100644
index 0000000..d290d07
--- /dev/null
+++ b/source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_1_add_neon.asm
@@ -0,0 +1,144 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license and patent
+; grant that can be found in the LICENSE file in the root of the source
+; tree. All contributing project authors may be found in the AUTHORS
+; file in the root of the source tree.
+;
+
+ EXPORT |vp9_idct32x32_1_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ;TODO(hkuang): put the following macros in a separate
+    ;file so other idct functions can also use them.
+ MACRO
+ LD_16x8 $src, $stride
+ vld1.8 {q8}, [$src], $stride
+ vld1.8 {q9}, [$src], $stride
+ vld1.8 {q10}, [$src], $stride
+ vld1.8 {q11}, [$src], $stride
+ vld1.8 {q12}, [$src], $stride
+ vld1.8 {q13}, [$src], $stride
+ vld1.8 {q14}, [$src], $stride
+ vld1.8 {q15}, [$src], $stride
+ MEND
+
+ MACRO
+ ADD_DIFF_16x8 $diff
+ vqadd.u8 q8, q8, $diff
+ vqadd.u8 q9, q9, $diff
+ vqadd.u8 q10, q10, $diff
+ vqadd.u8 q11, q11, $diff
+ vqadd.u8 q12, q12, $diff
+ vqadd.u8 q13, q13, $diff
+ vqadd.u8 q14, q14, $diff
+ vqadd.u8 q15, q15, $diff
+ MEND
+
+ MACRO
+ SUB_DIFF_16x8 $diff
+ vqsub.u8 q8, q8, $diff
+ vqsub.u8 q9, q9, $diff
+ vqsub.u8 q10, q10, $diff
+ vqsub.u8 q11, q11, $diff
+ vqsub.u8 q12, q12, $diff
+ vqsub.u8 q13, q13, $diff
+ vqsub.u8 q14, q14, $diff
+ vqsub.u8 q15, q15, $diff
+ MEND
+
+ MACRO
+ ST_16x8 $dst, $stride
+ vst1.8 {q8}, [$dst], $stride
+ vst1.8 {q9}, [$dst], $stride
+ vst1.8 {q10},[$dst], $stride
+ vst1.8 {q11},[$dst], $stride
+ vst1.8 {q12},[$dst], $stride
+ vst1.8 {q13},[$dst], $stride
+ vst1.8 {q14},[$dst], $stride
+ vst1.8 {q15},[$dst], $stride
+ MEND
+
+;void vp9_idct32x32_1_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride)
+;
+; r0  int16_t *input
+; r1 uint8_t *dest
+; r2 int dest_stride
+
+|vp9_idct32x32_1_add_neon| PROC
+ push {lr}
+ pld [r1]
+ add r3, r1, #16 ; r3 dest + 16 for second loop
+ ldrsh r0, [r0]
+
+ ; generate cospi_16_64 = 11585
+ mov r12, #0x2d00
+ add r12, #0x41
+
+ ; out = dct_const_round_shift(input[0] * cospi_16_64)
+ mul r0, r0, r12 ; input[0] * cospi_16_64
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; out = dct_const_round_shift(out * cospi_16_64)
+ mul r0, r0, r12 ; out * cospi_16_64
+ mov r12, r1 ; save dest
+ add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
+ asr r0, r0, #14 ; >> DCT_CONST_BITS
+
+ ; a1 = ROUND_POWER_OF_TWO(out, 6)
+ add r0, r0, #32 ; + (1 <<((6) - 1))
+ asrs r0, r0, #6 ; >> 6
+ bge diff_positive_32_32
+
+diff_negative_32_32
+ neg r0, r0
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+ mov r0, #4
+
+diff_negative_32_32_loop
+ sub r0, #1
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r12, r2
+
+ LD_16x8 r1, r2
+ SUB_DIFF_16x8 q0
+ ST_16x8 r12, r2
+ cmp r0, #2
+ moveq r1, r3
+ moveq r12, r3
+ cmp r0, #0
+ bne diff_negative_32_32_loop
+ pop {pc}
+
+diff_positive_32_32
+ usat r0, #8, r0
+ vdup.u8 q0, r0
+ mov r0, #4
+
+diff_positive_32_32_loop
+ sub r0, #1
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r12, r2
+
+ LD_16x8 r1, r2
+ ADD_DIFF_16x8 q0
+ ST_16x8 r12, r2
+ cmp r0, #2
+ moveq r1, r3
+ moveq r12, r3
+ cmp r0, #0
+ bne diff_positive_32_32_loop
+ pop {pc}
+
+ ENDP ; |vp9_idct32x32_1_add_neon|
+ END
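
The register comments above reduce to a small amount of scalar math: the lone DC coefficient is scaled twice by cospi_16_64 (11585) with 14-bit rounding shifts, rounded down to a per-pixel offset, and added to the 32x32 destination with clamping. A scalar sketch of the same steps (clip_pixel here is an illustrative helper):

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static uint8_t clip_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void idct32x32_1_add_sketch(const int16_t *input, uint8_t *dest,
                                   int dest_stride) {
  const int cospi_16_64 = 11585;
  int r, c, a1;
  int out = ROUND_POWER_OF_TWO(input[0] * cospi_16_64, DCT_CONST_BITS);
  out = ROUND_POWER_OF_TWO(out * cospi_16_64, DCT_CONST_BITS);
  a1 = ROUND_POWER_OF_TWO(out, 6);   /* per-pixel DC offset */
  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c)
      dest[c] = clip_pixel(dest[c] + a1);
    dest += dest_stride;
  }
}

The assembly splits the add/subtract into the diff_positive/diff_negative paths so it can use saturating unsigned vector adds and subtracts instead of per-pixel clamping.
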
diff --git a/source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
index f00d027..388a7d7 100644
--- a/source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+++ b/source/libvpx/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
@@ -1145,7 +1145,7 @@ idct32_bands_end_1st_pass
; pass loop processing
add r5, r5, #1
- B idct32_pass_loop
+ b idct32_pass_loop
idct32_bands_end_2nd_pass
STORE_COMBINE_CENTER_RESULTS
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c b/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c
new file mode 100644
index 0000000..b0dc496
--- /dev/null
+++ b/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c
@@ -0,0 +1,332 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+ int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+
+ __asm__ __volatile__ (
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "lb %[tmp5], 4(%[left]) \n\t"
+ "lb %[tmp6], 5(%[left]) \n\t"
+ "lb %[tmp7], 6(%[left]) \n\t"
+ "lb %[tmp8], 7(%[left]) \n\t"
+ "lb %[tmp9], 8(%[left]) \n\t"
+ "lb %[tmp10], 9(%[left]) \n\t"
+ "lb %[tmp11], 10(%[left]) \n\t"
+ "lb %[tmp12], 11(%[left]) \n\t"
+ "lb %[tmp13], 12(%[left]) \n\t"
+ "lb %[tmp14], 13(%[left]) \n\t"
+ "lb %[tmp15], 14(%[left]) \n\t"
+ "lb %[tmp16], 15(%[left]) \n\t"
+
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "replv.qb %[tmp5], %[tmp5] \n\t"
+ "replv.qb %[tmp6], %[tmp6] \n\t"
+ "replv.qb %[tmp7], %[tmp7] \n\t"
+ "replv.qb %[tmp8], %[tmp8] \n\t"
+ "replv.qb %[tmp9], %[tmp9] \n\t"
+ "replv.qb %[tmp10], %[tmp10] \n\t"
+ "replv.qb %[tmp11], %[tmp11] \n\t"
+ "replv.qb %[tmp12], %[tmp12] \n\t"
+ "replv.qb %[tmp13], %[tmp13] \n\t"
+ "replv.qb %[tmp14], %[tmp14] \n\t"
+ "replv.qb %[tmp15], %[tmp15] \n\t"
+ "replv.qb %[tmp16], %[tmp16] \n\t"
+
+ "sw %[tmp1], (%[dst]) \n\t"
+ "sw %[tmp1], 4(%[dst]) \n\t"
+ "sw %[tmp1], 8(%[dst]) \n\t"
+ "sw %[tmp1], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "sw %[tmp2], 4(%[dst]) \n\t"
+ "sw %[tmp2], 8(%[dst]) \n\t"
+ "sw %[tmp2], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "sw %[tmp3], 4(%[dst]) \n\t"
+ "sw %[tmp3], 8(%[dst]) \n\t"
+ "sw %[tmp3], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+ "sw %[tmp4], 4(%[dst]) \n\t"
+ "sw %[tmp4], 8(%[dst]) \n\t"
+ "sw %[tmp4], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp5], (%[dst]) \n\t"
+ "sw %[tmp5], 4(%[dst]) \n\t"
+ "sw %[tmp5], 8(%[dst]) \n\t"
+ "sw %[tmp5], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp6], (%[dst]) \n\t"
+ "sw %[tmp6], 4(%[dst]) \n\t"
+ "sw %[tmp6], 8(%[dst]) \n\t"
+ "sw %[tmp6], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp7], (%[dst]) \n\t"
+ "sw %[tmp7], 4(%[dst]) \n\t"
+ "sw %[tmp7], 8(%[dst]) \n\t"
+ "sw %[tmp7], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp8], (%[dst]) \n\t"
+ "sw %[tmp8], 4(%[dst]) \n\t"
+ "sw %[tmp8], 8(%[dst]) \n\t"
+ "sw %[tmp8], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp9], (%[dst]) \n\t"
+ "sw %[tmp9], 4(%[dst]) \n\t"
+ "sw %[tmp9], 8(%[dst]) \n\t"
+ "sw %[tmp9], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp10], (%[dst]) \n\t"
+ "sw %[tmp10], 4(%[dst]) \n\t"
+ "sw %[tmp10], 8(%[dst]) \n\t"
+ "sw %[tmp10], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp11], (%[dst]) \n\t"
+ "sw %[tmp11], 4(%[dst]) \n\t"
+ "sw %[tmp11], 8(%[dst]) \n\t"
+ "sw %[tmp11], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp12], (%[dst]) \n\t"
+ "sw %[tmp12], 4(%[dst]) \n\t"
+ "sw %[tmp12], 8(%[dst]) \n\t"
+ "sw %[tmp12], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp13], (%[dst]) \n\t"
+ "sw %[tmp13], 4(%[dst]) \n\t"
+ "sw %[tmp13], 8(%[dst]) \n\t"
+ "sw %[tmp13], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp14], (%[dst]) \n\t"
+ "sw %[tmp14], 4(%[dst]) \n\t"
+ "sw %[tmp14], 8(%[dst]) \n\t"
+ "sw %[tmp14], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp15], (%[dst]) \n\t"
+ "sw %[tmp15], 4(%[dst]) \n\t"
+ "sw %[tmp15], 8(%[dst]) \n\t"
+ "sw %[tmp15], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp16], (%[dst]) \n\t"
+ "sw %[tmp16], 4(%[dst]) \n\t"
+ "sw %[tmp16], 8(%[dst]) \n\t"
+ "sw %[tmp16], 12(%[dst]) \n\t"
+
+ : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+ [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
+ [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7),
+ [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8),
+ [tmp9] "=&r" (tmp9), [tmp10] "=&r" (tmp10),
+ [tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12),
+ [tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14),
+ [tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16)
+ : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride)
+ );
+}
+
+void vp9_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, left2;
+
+ __asm__ __volatile__ (
+ "lw %[above1], (%[above]) \n\t"
+ "lw %[above2], 4(%[above]) \n\t"
+ "lw %[left1], (%[left]) \n\t"
+ "lw %[left2], 4(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left2] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "lw %[above1], 8(%[above]) \n\t"
+ "lw %[above2], 12(%[above]) \n\t"
+ "lw %[left1], 8(%[left]) \n\t"
+ "lw %[left2], 12(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left2] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[above_r1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "addiu %[average], %[average], 16 \n\t"
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 5 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+ "sw %[expected_dc], 8(%[dst]) \n\t"
+ "sw %[expected_dc], 12(%[dst]) \n\t"
+
+ : [left1] "=&r" (left1), [above1] "=&r" (above1),
+ [left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1),
+ [left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1),
+ [above2] "=&r" (above2), [left2] "=&r" (left2),
+ [average] "=&r" (average), [tmp] "=&r" (tmp),
+ [expected_dc] "=&r" (expected_dc)
+ : [above] "r" (above), [left] "r" (left),
+ [dst] "r" (dst), [stride] "r" (stride)
+ );
+}
+#endif // #if HAVE_DSPR2
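
Stripped of the DSPr2 packing, the DC predictor above is an average of the 16 above and 16 left neighbours, rounded and broadcast across the block. A plain C sketch of the same computation:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void dc_predictor_16x16_sketch(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  int i, sum = 0;
  uint8_t expected_dc;
  for (i = 0; i < 16; ++i)
    sum += above[i] + left[i];
  expected_dc = (uint8_t)((sum + 16) >> 5);  /* round, then divide by 32 */
  for (i = 0; i < 16; ++i)
    memset(dst + i * stride, expected_dc, 16);
}
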
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c b/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c
new file mode 100644
index 0000000..a53c623
--- /dev/null
+++ b/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4;
+
+ __asm__ __volatile__ (
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "sw %[tmp1], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+
+ : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+ [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4)
+ : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride)
+ );
+}
+
+void vp9_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
+
+ __asm__ __volatile__ (
+ "lw %[above_c], (%[above]) \n\t"
+ "lw %[left_c], (%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l], %[above_c] \n\t"
+ "preceu.ph.qbr %[above_r], %[above_c] \n\t"
+ "preceu.ph.qbl %[left_l], %[left_c] \n\t"
+ "preceu.ph.qbr %[left_r], %[left_c] \n\t"
+
+ "addu.ph %[average], %[above_r], %[above_l] \n\t"
+ "addu.ph %[average], %[average], %[left_l] \n\t"
+ "addu.ph %[average], %[average], %[left_r] \n\t"
+ "addiu %[average], %[average], 4 \n\t"
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 3 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+
+ : [above_c] "=&r" (above_c), [above_l] "=&r" (above_l),
+ [above_r] "=&r" (above_r), [left_c] "=&r" (left_c),
+ [left_l] "=&r" (left_l), [left_r] "=&r" (left_r),
+ [average] "=&r" (average), [tmp] "=&r" (tmp),
+ [expected_dc] "=&r" (expected_dc)
+ : [above] "r" (above), [left] "r" (left),
+ [dst] "r" (dst), [stride] "r" (stride)
+ );
+}
+
+void vp9_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t abovel, abover;
+ int32_t left0, left1, left2, left3;
+ int32_t res0, res1;
+ int32_t resl;
+ int32_t resr;
+ int32_t top_left;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ __asm__ __volatile__ (
+ "ulw %[resl], (%[above]) \n\t"
+
+ "lbu %[left0], (%[left]) \n\t"
+ "lbu %[left1], 1(%[left]) \n\t"
+ "lbu %[left2], 2(%[left]) \n\t"
+ "lbu %[left3], 3(%[left]) \n\t"
+
+ "lbu %[top_left], -1(%[above]) \n\t"
+
+ "preceu.ph.qbl %[abovel], %[resl] \n\t"
+ "preceu.ph.qbr %[abover], %[resl] \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "replv.ph %[left1], %[left1] \n\t"
+ "replv.ph %[left2], %[left2] \n\t"
+ "replv.ph %[left3], %[left3] \n\t"
+
+ "replv.ph %[top_left], %[top_left] \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left0] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left0] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left1] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left1] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left2] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left2] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "addu.ph %[resl], %[abovel], %[left3] \n\t"
+ "subu.ph %[resl], %[resl], %[top_left] \n\t"
+
+ "addu.ph %[resr], %[abover], %[left3] \n\t"
+ "subu.ph %[resr], %[resr], %[top_left] \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "sll %[res0], %[resr], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+
+ "sra %[res1], %[resr], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "sb %[res0], (%[dst]) \n\t"
+
+ "sll %[res0], %[resl], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+
+ "sra %[res1], %[resl], 16 \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+
+ "sb %[res0], 2(%[dst]) \n\t"
+ "sb %[res1], 3(%[dst]) \n\t"
+
+ : [abovel] "=&r" (abovel), [abover] "=&r" (abover),
+ [left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2),
+ [res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3),
+ [resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left)
+ : [above] "r" (above), [left] "r" (left),
+ [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm)
+ );
+}
+#endif // #if HAVE_DSPR2
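
The TM (TrueMotion) predictor above computes each output pixel as the left neighbour plus the above neighbour minus the top-left corner, clamped to 8 bits (the vp9_ff_cropTbl lookups perform that clamp). A scalar sketch, with clip_255 as an illustrative helper:

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void tm_predictor_4x4_sketch(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above,
                                    const uint8_t *left) {
  const int top_left = above[-1];  /* the "lbu ..., -1(above)" load above */
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c)
      dst[c] = clip_255(left[r] + above[c] - top_left);
    dst += stride;
  }
}
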
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c b/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c
new file mode 100644
index 0000000..40d93ae
--- /dev/null
+++ b/source/libvpx/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c
@@ -0,0 +1,610 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
+
+ __asm__ __volatile__ (
+ "lb %[tmp1], (%[left]) \n\t"
+ "lb %[tmp2], 1(%[left]) \n\t"
+ "lb %[tmp3], 2(%[left]) \n\t"
+ "lb %[tmp4], 3(%[left]) \n\t"
+ "lb %[tmp5], 4(%[left]) \n\t"
+ "lb %[tmp6], 5(%[left]) \n\t"
+ "lb %[tmp7], 6(%[left]) \n\t"
+ "lb %[tmp8], 7(%[left]) \n\t"
+
+ "replv.qb %[tmp1], %[tmp1] \n\t"
+ "replv.qb %[tmp2], %[tmp2] \n\t"
+ "replv.qb %[tmp3], %[tmp3] \n\t"
+ "replv.qb %[tmp4], %[tmp4] \n\t"
+ "replv.qb %[tmp5], %[tmp5] \n\t"
+ "replv.qb %[tmp6], %[tmp6] \n\t"
+ "replv.qb %[tmp7], %[tmp7] \n\t"
+ "replv.qb %[tmp8], %[tmp8] \n\t"
+
+ "sw %[tmp1], (%[dst]) \n\t"
+ "sw %[tmp1], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp2], (%[dst]) \n\t"
+ "sw %[tmp2], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp3], (%[dst]) \n\t"
+ "sw %[tmp3], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp4], (%[dst]) \n\t"
+ "sw %[tmp4], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp5], (%[dst]) \n\t"
+ "sw %[tmp5], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp6], (%[dst]) \n\t"
+ "sw %[tmp6], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp7], (%[dst]) \n\t"
+ "sw %[tmp7], 4(%[dst]) \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[tmp8], (%[dst]) \n\t"
+ "sw %[tmp8], 4(%[dst]) \n\t"
+
+ : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2),
+ [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4),
+ [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7),
+ [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8)
+ : [left] "r" (left), [dst] "r" (dst),
+ [stride] "r" (stride)
+ );
+}
+
+void vp9_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t expected_dc;
+ int32_t average;
+ int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
+ int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
+
+ __asm__ __volatile__ (
+ "lw %[above1], (%[above]) \n\t"
+ "lw %[above2], 4(%[above]) \n\t"
+ "lw %[left1], (%[left]) \n\t"
+ "lw %[left2], 4(%[left]) \n\t"
+
+ "preceu.ph.qbl %[above_l1], %[above1] \n\t"
+ "preceu.ph.qbr %[above_r1], %[above1] \n\t"
+ "preceu.ph.qbl %[left_l1], %[left1] \n\t"
+ "preceu.ph.qbr %[left_r1], %[left1] \n\t"
+
+ "preceu.ph.qbl %[above_l2], %[above2] \n\t"
+ "preceu.ph.qbr %[above_r2], %[above2] \n\t"
+ "preceu.ph.qbl %[left_l2], %[left2] \n\t"
+ "preceu.ph.qbr %[left_r2], %[left2] \n\t"
+
+ "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_l1] \n\t"
+ "addu.ph %[average], %[average], %[left_r1] \n\t"
+
+ "addu.ph %[average], %[average], %[above_l2] \n\t"
+ "addu.ph %[average], %[average], %[above_r2] \n\t"
+ "addu.ph %[average], %[average], %[left_l2] \n\t"
+ "addu.ph %[average], %[average], %[left_r2] \n\t"
+
+ "addiu %[average], %[average], 8 \n\t"
+
+ "srl %[tmp], %[average], 16 \n\t"
+ "addu.ph %[average], %[tmp], %[average] \n\t"
+ "srl %[expected_dc], %[average], 4 \n\t"
+ "replv.qb %[expected_dc], %[expected_dc] \n\t"
+
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ "add %[dst], %[dst], %[stride] \n\t"
+ "sw %[expected_dc], (%[dst]) \n\t"
+ "sw %[expected_dc], 4(%[dst]) \n\t"
+
+ : [above1] "=&r" (above1), [above_l1] "=&r" (above_l1),
+ [above_r1] "=&r" (above_r1), [left1] "=&r" (left1),
+ [left_l1] "=&r" (left_l1), [left_r1] "=&r" (left_r1),
+ [above2] "=&r" (above2), [above_l2] "=&r" (above_l2),
+ [above_r2] "=&r" (above_r2), [left2] "=&r" (left2),
+ [left_l2] "=&r" (left_l2), [left_r2] "=&r" (left_r2),
+ [average] "=&r" (average), [tmp] "=&r" (tmp),
+ [expected_dc] "=&r" (expected_dc)
+ : [above] "r" (above), [left] "r" (left), [dst] "r" (dst),
+ [stride] "r" (stride)
+ );
+}
+
+void vp9_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ int32_t abovel, abover;
+ int32_t abovel_1, abover_1;
+ int32_t left0;
+ int32_t res0, res1, res2, res3;
+ int32_t reshw;
+ int32_t top_left;
+ uint8_t *cm = vp9_ff_cropTbl;
+
+ __asm__ __volatile__ (
+ "ulw %[reshw], (%[above]) \n\t"
+ "ulw %[top_left], 4(%[above]) \n\t"
+
+ "lbu %[left0], (%[left]) \n\t"
+
+ "preceu.ph.qbl %[abovel], %[reshw] \n\t"
+ "preceu.ph.qbr %[abover], %[reshw] \n\t"
+ "preceu.ph.qbl %[abovel_1], %[top_left] \n\t"
+ "preceu.ph.qbr %[abover_1], %[top_left] \n\t"
+
+ "lbu %[top_left], -1(%[above]) \n\t"
+ "replv.ph %[left0], %[left0] \n\t"
+
+ "replv.ph %[top_left], %[top_left] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 1(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 2(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 3(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 4(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 5(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 6(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbu %[left0], 7(%[left]) \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ "replv.ph %[left0], %[left0] \n\t"
+ "add %[dst], %[dst], %[stride] \n\t"
+
+ "addu.ph %[reshw], %[abovel], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], (%[dst]) \n\t"
+ "sb %[res1], 1(%[dst]) \n\t"
+ "sb %[res2], 2(%[dst]) \n\t"
+ "sb %[res3], 3(%[dst]) \n\t"
+
+ "addu.ph %[reshw], %[abovel_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res2], %[reshw], 16 \n\t"
+ "sra %[res2], %[res2], 16 \n\t"
+ "sra %[res3], %[reshw], 16 \n\t"
+
+ "addu.ph %[reshw], %[abover_1], %[left0] \n\t"
+ "subu.ph %[reshw], %[reshw], %[top_left] \n\t"
+
+ "sll %[res0], %[reshw], 16 \n\t"
+ "sra %[res0], %[res0], 16 \n\t"
+ "sra %[res1], %[reshw], 16 \n\t"
+
+ "lbux %[res0], %[res0](%[cm]) \n\t"
+ "lbux %[res1], %[res1](%[cm]) \n\t"
+ "lbux %[res2], %[res2](%[cm]) \n\t"
+ "lbux %[res3], %[res3](%[cm]) \n\t"
+
+ "sb %[res0], 4(%[dst]) \n\t"
+ "sb %[res1], 5(%[dst]) \n\t"
+ "sb %[res2], 6(%[dst]) \n\t"
+ "sb %[res3], 7(%[dst]) \n\t"
+
+ : [abovel] "=&r" (abovel), [abover] "=&r" (abover),
+ [abovel_1] "=&r" (abovel_1), [abover_1] "=&r" (abover_1),
+ [left0] "=&r" (left0), [res2] "=&r" (res2), [res3] "=&r" (res3),
+ [res0] "=&r" (res0), [res1] "=&r" (res1),
+ [reshw] "=&r" (reshw), [top_left] "=&r" (top_left)
+ : [above] "r" (above), [left] "r" (left),
+ [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm)
+ );
+}
+#endif // #if HAVE_DSPR2
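
The unrolled block above appears to be the 8x8 TM (true-motion) intra predictor: each addu.ph/subu.ph pair forms above[c] + left[r] - top_left for two pixels, and the lbux lookups through cm clamp the sums back to valid 8-bit pixels. A minimal scalar sketch of that arithmetic (illustrative only; the clamp is written inline here instead of going through the clip table, and the function name is hypothetical):

#include <stdint.h>

static void tm_predictor_8x8_sketch(uint8_t *dst, int stride,
                                    const uint8_t *above,
                                    const uint8_t *left) {
  const int top_left = above[-1];  /* pixel above and to the left of the block */
  int r, c;

  for (r = 0; r < 8; r++) {
    for (c = 0; c < 8; c++) {
      const int v = above[c] + left[r] - top_left;
      dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  /* clip to 8 bits */
    }
    dst += stride;
  }
}
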
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c b/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
index d3aee73..bc67594 100644
--- a/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
+++ b/source/libvpx/vp9/common/mips/dspr2/vp9_itrans32_dspr2.c
@@ -19,7 +19,8 @@
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
#if HAVE_DSPR2
-static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) {
+static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output,
+ uint32_t no_rows) {
int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
@@ -42,7 +43,7 @@ static void idct32_1d_rows_dspr2(const int16_t *input, int16_t *output) {
const int const_2_power_13 = 8192;
const int32_t *input_int;
- for (i = 32; i--; ) {
+ for (i = no_rows; i--; ) {
input_int = (const int32_t *)input;
if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
@@ -881,12 +882,74 @@ void vp9_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
);
// Rows
- idct32_1d_rows_dspr2(input, outptr);
+ idct32_1d_rows_dspr2(input, outptr, 32);
// Columns
vp9_idct32_1d_cols_add_blk_dspr2(out, dest, dest_stride);
}
+void vp9_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
+ int stride) {
+ DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
+ int16_t *outptr = out;
+ uint32_t i;
+ uint32_t pos = 45;
+
+  /* bit position for extract from acc */
+ __asm__ __volatile__ (
+ "wrdsp %[pos], 1 \n\t"
+ :
+ : [pos] "r" (pos)
+ );
+
+ // Rows
+ idct32_1d_rows_dspr2(input, outptr, 8);
+
+ outptr += 8;
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 8(%[outptr]) \n\t"
+ "sw $zero, 12(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 24(%[outptr]) \n\t"
+ "sw $zero, 28(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 40(%[outptr]) \n\t"
+ "sw $zero, 44(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+
+ for (i = 0; i < 31; ++i) {
+ outptr += 32;
+
+ __asm__ __volatile__ (
+ "sw $zero, 0(%[outptr]) \n\t"
+ "sw $zero, 4(%[outptr]) \n\t"
+ "sw $zero, 8(%[outptr]) \n\t"
+ "sw $zero, 12(%[outptr]) \n\t"
+ "sw $zero, 16(%[outptr]) \n\t"
+ "sw $zero, 20(%[outptr]) \n\t"
+ "sw $zero, 24(%[outptr]) \n\t"
+ "sw $zero, 28(%[outptr]) \n\t"
+ "sw $zero, 32(%[outptr]) \n\t"
+ "sw $zero, 36(%[outptr]) \n\t"
+ "sw $zero, 40(%[outptr]) \n\t"
+ "sw $zero, 44(%[outptr]) \n\t"
+
+ :
+ : [outptr] "r" (outptr)
+ );
+ }
+
+ // Columns
+ vp9_idct32_1d_cols_add_blk_dspr2(out, dest, stride);
+}
+
void vp9_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
int stride) {
int r, out;
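
In the 34-coefficient path added above, idct32_1d_rows_dspr2() is only run for the first eight input rows, so every word of the 32x32 intermediate buffer that the row pass never writes is cleared before the column pass; the twelve "sw $zero" stores per line do exactly that. A plain-C sketch of the same zero-fill (illustrative only, assuming the usual 32x32 int16_t buffer layout):

#include <string.h>
#include <stdint.h>

/* Clear elements 8..31 of every line of the 32x32 buffer, matching the
 * 48-byte store runs above (out + 8 + 32 * k for k = 0..31). */
static void clear_untransformed_words(int16_t *out) {
  int16_t *outptr = out + 8;
  int i;

  for (i = 0; i < 32; ++i) {
    memset(outptr, 0, 24 * sizeof(int16_t));
    outptr += 32;
  }
}
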
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c b/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
new file mode 100644
index 0000000..0c0f155
--- /dev/null
+++ b/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_loop_filter_horizontal_edge_dspr2(unsigned char *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count) {
+ uint8_t i;
+ uint32_t mask;
+ uint32_t hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ /* prefetch data for store */
+ vp9_prefetch_store(s);
+
+ /* loop filter designed to work using chars so that we can make maximum use
+ of 8 bit simd instructions. */
+ for (i = 0; i < 2; i++) {
+ sm1 = s - (pitch << 2);
+ s0 = sm1 + pitch;
+ s1 = s0 + pitch;
+ s2 = s - pitch;
+ s3 = s;
+ s4 = s + pitch;
+ s5 = s4 + pitch;
+ s6 = s5 + pitch;
+
+ __asm__ __volatile__ (
+ "lw %[p1], (%[s1]) \n\t"
+ "lw %[p2], (%[s2]) \n\t"
+ "lw %[p3], (%[s3]) \n\t"
+ "lw %[p4], (%[s4]) \n\t"
+
+ : [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4)
+ : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ mask will be zero and filtering is not needed */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ __asm__ __volatile__ (
+ "lw %[pm1], (%[sm1]) \n\t"
+ "lw %[p0], (%[s0]) \n\t"
+ "lw %[p5], (%[s5]) \n\t"
+ "lw %[p6], (%[s6]) \n\t"
+
+ : [pm1] "=&r" (pm1), [p0] "=&r" (p0), [p5] "=&r" (p5),
+ [p6] "=&r" (p6)
+ : [sm1] "r" (sm1), [s0] "r" (s0), [s5] "r" (s5), [s6] "r" (s6)
+ );
+
+ vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2,
+ pm1, p0, p3, p4, p5, p6,
+ thresh_vec, &hev, &mask);
+
+      /* if mask == 0, filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+ __asm__ __volatile__ (
+ "sw %[p1], (%[s1]) \n\t"
+ "sw %[p2], (%[s2]) \n\t"
+ "sw %[p3], (%[s3]) \n\t"
+ "sw %[p4], (%[s4]) \n\t"
+
+ :
+ : [p1] "r" (p1), [p2] "r" (p2), [p3] "r" (p3), [p4] "r" (p4),
+ [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+ );
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
+void vp9_loop_filter_vertical_edge_dspr2(unsigned char *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count) {
+ uint8_t i;
+ uint32_t mask, hev;
+ uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ /* prefetch data for store */
+ vp9_prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ /* load quad-byte vectors
+ * memory is 4 byte aligned
+ */
+ p2 = *((uint32_t *)(s1 - 4));
+ p6 = *((uint32_t *)(s1));
+ p1 = *((uint32_t *)(s2 - 4));
+ p5 = *((uint32_t *)(s2));
+ p0 = *((uint32_t *)(s3 - 4));
+ p4 = *((uint32_t *)(s3));
+ pm1 = *((uint32_t *)(s4 - 4));
+ p3 = *((uint32_t *)(s4));
+
+ /* transpose pm1, p0, p1, p2 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
+ "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[pm1], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0), [pm1] "+r" (pm1),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p3, p4, p5, p6 */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p6] "+r" (p6), [p5] "+r" (p5), [p4] "+r" (p4), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* if (p1 - p4 == 0) and (p2 - p3 == 0)
+ * mask will be zero and filtering is not needed
+ */
+ if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
+ vp9_filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1,
+ p0, p3, p4, p5, p6, thresh_vec,
+ &hev, &mask);
+
+      /* if mask == 0, filtering is not needed */
+ if (mask) {
+ /* filtering */
+ vp9_filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
+
+ /* unpack processed 4x4 neighborhood
+ * don't use transpose on output data
+ * because memory isn't aligned
+ */
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s4]) \n\t"
+ "sb %[p3], 0(%[s4]) \n\t"
+ "sb %[p2], -1(%[s4]) \n\t"
+ "sb %[p1], -2(%[s4]) \n\t"
+
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+ [s4] "r" (s4)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s3]) \n\t"
+ "sb %[p3], 0(%[s3]) \n\t"
+ "sb %[p2], -1(%[s3]) \n\t"
+ "sb %[p1], -2(%[s3]) \n\t"
+
+ : [p1] "+r" (p1)
+ : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [s3] "r" (s3)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s2]) \n\t"
+ "sb %[p3], 0(%[s2]) \n\t"
+ "sb %[p2], -1(%[s2]) \n\t"
+ "sb %[p1], -2(%[s2]) \n\t"
+
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+ [s2] "r" (s2)
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p4], %[p4], 8 \n\t"
+ "srl %[p3], %[p3], 8 \n\t"
+ "srl %[p2], %[p2], 8 \n\t"
+ "srl %[p1], %[p1], 8 \n\t"
+
+ : [p4] "+r" (p4), [p3] "+r" (p3), [p2] "+r" (p2), [p1] "+r" (p1)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "sb %[p4], 1(%[s1]) \n\t"
+ "sb %[p3], 0(%[s1]) \n\t"
+ "sb %[p2], -1(%[s1]) \n\t"
+ "sb %[p1], -2(%[s1]) \n\t"
+
+ :
+ : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1),
+ [s1] "r" (s1)
+ );
+ }
+ }
+ }
+}
+
+void vp9_loop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_loop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+ vp9_loop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_mbloop_filter_horizontal_edge_16_dspr2(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_mbloop_filter_horizontal_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+ vp9_mbloop_filter_horizontal_edge_dspr2(s + 8, p, blimit1, limit1, thresh1,
+ 1);
+}
+
+void vp9_loop_filter_vertical_edge_16_dspr2(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_loop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+ vp9_loop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+ 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_dspr2(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_mbloop_filter_vertical_edge_dspr2(s, p, blimit0, limit0, thresh0, 1);
+ vp9_mbloop_filter_vertical_edge_dspr2(s + 8 * p, p, blimit1, limit1, thresh1,
+ 1);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_dspr2(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ vp9_mb_lpf_vertical_edge_w_dspr2(s, p, blimit, limit, thresh);
+ vp9_mb_lpf_vertical_edge_w_dspr2(s + 8 * p, p, blimit, limit, thresh);
+}
+#endif // #if HAVE_DSPR2
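
Both edge filters above only do work when the hev/mask vectors from vp9_filter_hev_mask_dspr2() are non-zero, and then hand two pixels per halfword pair to vp9_filter_dspr2() (defined in the companion header added below). A scalar sketch of the per-pixel 4-tap filter that routine vectorizes (signed_clamp() is a hypothetical helper, not part of this patch; pixels are biased to signed with ^ 0x80 exactly as in the DSPR2 code):

#include <stdint.h>

static int8_t signed_clamp(int v) {
  return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

/* mask/hev are 0x00 or 0xff per pixel, as produced by the hev/mask step. */
static void filter4_sketch(int8_t mask, int8_t hev,
                           uint8_t *p1, uint8_t *p0,
                           uint8_t *q0, uint8_t *q1) {
  int8_t ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
  int8_t qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);
  int8_t filter, filter1, filter2;

  filter = signed_clamp(ps1 - qs1) & hev;
  filter = signed_clamp(filter + 3 * (qs0 - ps0)) & mask;
  filter1 = signed_clamp(filter + 4) >> 3;          /* round one side by +4 */
  filter2 = signed_clamp(filter + 3) >> 3;          /* ... the other by +3 */
  qs0 = signed_clamp(qs0 - filter1);
  ps0 = signed_clamp(ps0 + filter2);
  filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);   /* outer taps skip hev pixels */
  qs1 = signed_clamp(qs1 - filter);
  ps1 = signed_clamp(ps1 + filter);

  *p1 = (uint8_t)(ps1 ^ 0x80);
  *p0 = (uint8_t)(ps0 ^ 0x80);
  *q0 = (uint8_t)(qs0 ^ 0x80);
  *q1 = (uint8_t)(qs1 ^ 0x80);
}
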
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h b/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
new file mode 100644
index 0000000..98bfcfa
--- /dev/null
+++ b/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
@@ -0,0 +1,755 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#if HAVE_DSPR2
+/* inputs & outputs are quad-byte vectors */
+static INLINE void vp9_filter_dspr2(uint32_t mask, uint32_t hev,
+ uint32_t *ps1, uint32_t *ps0,
+ uint32_t *qs0, uint32_t *qs1) {
+ int32_t vp9_filter_l, vp9_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
+ vps0 = (*ps0) ^ N128;
+ vps1 = (*ps1) ^ N128;
+ vqs0 = (*qs0) ^ N128;
+ vqs1 = (*qs1) ^ N128;
+
+  /* use halfword pairs instead of quad-bytes for accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__ (
+ /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vp9_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vp9_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vp9_filter &= hev; */
+ "and %[vp9_filter_l], %[vp9_filter_l], %[hev_l] \n\t"
+ "and %[vp9_filter_r], %[vp9_filter_r], %[hev_r] \n\t"
+
+ /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+
+ /* vp9_filter &= mask; */
+ "and %[vp9_filter_l], %[vp9_filter_l], %[mask_l] \n\t"
+ "and %[vp9_filter_r], %[vp9_filter_r], %[mask_r] \n\t"
+
+ : [vp9_filter_l] "=&r" (vp9_filter_l),
+ [vp9_filter_r] "=&r" (vp9_filter_r),
+ [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+ [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+ : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+ [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+ [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+ [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+ [hev_l] "r" (hev_l), [hev_r] "r" (hev_r),
+ [HWM] "r" (HWM)
+ );
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__ (
+      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
+ "addq_s.ph %[Filter1_l], %[vp9_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[vp9_filter_r], %[t2] \n\t"
+
+      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
+ "addq_s.ph %[Filter2_l], %[vp9_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[vp9_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+ [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+ [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+ : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
+ [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+ );
+
+ __asm__ __volatile__ (
+ /* (vp9_filter += 1) >>= 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* vp9_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+ [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+ [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+ : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+ );
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__ (
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+ [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+ :
+ );
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *ps0 = vps0 ^ N128;
+ *ps1 = vps1 ^ N128;
+ *qs0 = vqs0 ^ N128;
+ *qs1 = vqs1 ^ N128;
+}
+
+static INLINE void vp9_filter1_dspr2(uint32_t mask, uint32_t hev,
+ uint32_t ps1, uint32_t ps0,
+ uint32_t qs0, uint32_t qs1,
+ uint32_t *p1_f0, uint32_t *p0_f0,
+ uint32_t *q0_f0, uint32_t *q1_f0) {
+ int32_t vp9_filter_l, vp9_filter_r;
+ int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
+ int32_t subr_r, subr_l;
+ uint32_t t1, t2, HWM, t3;
+ uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
+ int32_t vps1, vps0, vqs0, vqs1;
+ int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
+ uint32_t N128;
+
+ N128 = 0x80808080;
+ t1 = 0x03000300;
+ t2 = 0x04000400;
+ t3 = 0x01000100;
+ HWM = 0xFF00FF00;
+
+ vps0 = (ps0) ^ N128;
+ vps1 = (ps1) ^ N128;
+ vqs0 = (qs0) ^ N128;
+ vqs1 = (qs1) ^ N128;
+
+  /* use halfword pairs instead of quad-bytes for accuracy */
+ vps0_l = vps0 & HWM;
+ vps0_r = vps0 << 8;
+ vps0_r = vps0_r & HWM;
+
+ vps1_l = vps1 & HWM;
+ vps1_r = vps1 << 8;
+ vps1_r = vps1_r & HWM;
+
+ vqs0_l = vqs0 & HWM;
+ vqs0_r = vqs0 << 8;
+ vqs0_r = vqs0_r & HWM;
+
+ vqs1_l = vqs1 & HWM;
+ vqs1_r = vqs1 << 8;
+ vqs1_r = vqs1_r & HWM;
+
+ mask_l = mask & HWM;
+ mask_r = mask << 8;
+ mask_r = mask_r & HWM;
+
+ hev_l = hev & HWM;
+ hev_r = hev << 8;
+ hev_r = hev_r & HWM;
+
+ __asm__ __volatile__ (
+ /* vp9_filter = vp8_signed_char_clamp(ps1 - qs1); */
+ "subq_s.ph %[vp9_filter_l], %[vps1_l], %[vqs1_l] \n\t"
+ "subq_s.ph %[vp9_filter_r], %[vps1_r], %[vqs1_r] \n\t"
+
+ /* qs0 - ps0 */
+ "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
+ "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
+
+ /* vp9_filter &= hev; */
+ "and %[vp9_filter_l], %[vp9_filter_l], %[hev_l] \n\t"
+ "and %[vp9_filter_r], %[vp9_filter_r], %[hev_r] \n\t"
+
+ /* vp9_filter = vp8_signed_char_clamp(vp9_filter + 3 * (qs0 - ps0)); */
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+ "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
+ "addq_s.ph %[vp9_filter_l], %[vp9_filter_l], %[subr_l] \n\t"
+ "addq_s.ph %[vp9_filter_r], %[vp9_filter_r], %[subr_r] \n\t"
+
+ /* vp9_filter &= mask; */
+ "and %[vp9_filter_l], %[vp9_filter_l], %[mask_l] \n\t"
+ "and %[vp9_filter_r], %[vp9_filter_r], %[mask_r] \n\t"
+
+ : [vp9_filter_l] "=&r" (vp9_filter_l),
+ [vp9_filter_r] "=&r" (vp9_filter_r),
+ [subr_l] "=&r" (subr_l), [subr_r] "=&r" (subr_r),
+ [invhev_l] "=&r" (invhev_l), [invhev_r] "=&r" (invhev_r)
+ : [vps0_l] "r" (vps0_l), [vps0_r] "r" (vps0_r), [vps1_l] "r" (vps1_l),
+ [vps1_r] "r" (vps1_r), [vqs0_l] "r" (vqs0_l), [vqs0_r] "r" (vqs0_r),
+ [vqs1_l] "r" (vqs1_l), [vqs1_r] "r" (vqs1_r),
+ [mask_l] "r" (mask_l), [mask_r] "r" (mask_r),
+ [hev_l] "r" (hev_l), [hev_r] "r" (hev_r), [HWM] "r" (HWM)
+ );
+
+ /* save bottom 3 bits so that we round one side +4 and the other +3 */
+ __asm__ __volatile__ (
+      /* Filter1 = vp8_signed_char_clamp(vp9_filter + 4) >>= 3; */
+ "addq_s.ph %[Filter1_l], %[vp9_filter_l], %[t2] \n\t"
+ "addq_s.ph %[Filter1_r], %[vp9_filter_r], %[t2] \n\t"
+
+      /* Filter2 = vp8_signed_char_clamp(vp9_filter + 3) >>= 3; */
+ "addq_s.ph %[Filter2_l], %[vp9_filter_l], %[t1] \n\t"
+ "addq_s.ph %[Filter2_r], %[vp9_filter_r], %[t1] \n\t"
+ "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
+ "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
+
+ "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
+ "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
+
+ "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
+
+ /* vps0 = vp8_signed_char_clamp(ps0 + Filter2); */
+ "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
+ "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
+
+ /* vqs0 = vp8_signed_char_clamp(qs0 - Filter1); */
+ "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "=&r" (Filter1_l), [Filter1_r] "=&r" (Filter1_r),
+ [Filter2_l] "=&r" (Filter2_l), [Filter2_r] "=&r" (Filter2_r),
+ [vps0_l] "+r" (vps0_l), [vps0_r] "+r" (vps0_r),
+ [vqs0_l] "+r" (vqs0_l), [vqs0_r] "+r" (vqs0_r)
+ : [t1] "r" (t1), [t2] "r" (t2), [HWM] "r" (HWM),
+ [vp9_filter_l] "r" (vp9_filter_l), [vp9_filter_r] "r" (vp9_filter_r)
+ );
+
+ __asm__ __volatile__ (
+ /* (vp9_filter += 1) >>= 1 */
+ "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
+ "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
+
+ /* vp9_filter &= ~hev; */
+ "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
+ "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
+
+ /* vps1 = vp8_signed_char_clamp(ps1 + vp9_filter); */
+ "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
+ "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
+
+ /* vqs1 = vp8_signed_char_clamp(qs1 - vp9_filter); */
+ "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
+ "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
+
+ : [Filter1_l] "+r" (Filter1_l), [Filter1_r] "+r" (Filter1_r),
+ [vps1_l] "+r" (vps1_l), [vps1_r] "+r" (vps1_r),
+ [vqs1_l] "+r" (vqs1_l), [vqs1_r] "+r" (vqs1_r)
+ : [t3] "r" (t3), [invhev_l] "r" (invhev_l), [invhev_r] "r" (invhev_r)
+ );
+
+ /* Create quad-bytes from halfword pairs */
+ vqs0_l = vqs0_l & HWM;
+ vqs1_l = vqs1_l & HWM;
+ vps0_l = vps0_l & HWM;
+ vps1_l = vps1_l & HWM;
+
+ __asm__ __volatile__ (
+ "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
+ "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
+ "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
+ "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
+
+ : [vps1_r] "+r" (vps1_r), [vqs1_r] "+r" (vqs1_r),
+ [vps0_r] "+r" (vps0_r), [vqs0_r] "+r" (vqs0_r)
+ :
+ );
+
+ vqs0 = vqs0_l | vqs0_r;
+ vqs1 = vqs1_l | vqs1_r;
+ vps0 = vps0_l | vps0_r;
+ vps1 = vps1_l | vps1_r;
+
+ *p0_f0 = vps0 ^ N128;
+ *p1_f0 = vps1 ^ N128;
+ *q0_f0 = vqs0 ^ N128;
+ *q1_f0 = vqs1 ^ N128;
+}
+
+static INLINE void vp9_mbfilter_dspr2(uint32_t *op3, uint32_t *op2,
+ uint32_t *op1, uint32_t *op0,
+ uint32_t *oq0, uint32_t *oq1,
+ uint32_t *oq2, uint32_t *oq3) {
+ /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
+
+ /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
+ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
+ /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
+ /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
+ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
+ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
+
+ __asm__ __volatile__ (
+ "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
+
+ "shll.ph %[tmp], %[p3], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op1], %[p3], %[p3] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
+ "addu.ph %[res_op0], %[p3], %[p0] \n\t"
+ "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
+ "shll.ph %[tmp], %[q3], 1 \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
+
+ : [add_p210_q012] "=&r" (add_p210_q012),
+ [tmp] "=&r" (tmp), [res_op2] "=&r" (res_op2),
+ [res_op1] "=&r" (res_op1), [res_op0] "=&r" (res_op0),
+ [res_oq0] "=&r" (res_oq0), [res_oq1] "=&r" (res_oq1),
+ [res_oq2] "=&r" (res_oq2)
+ : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
+ [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
+ [u32Four] "r" (u32Four)
+ );
+
+ *op2 = res_op2;
+ *op1 = res_op1;
+ *op0 = res_op0;
+ *oq0 = res_oq0;
+ *oq1 = res_oq1;
+ *oq2 = res_oq2;
+}
+
+static INLINE void vp9_mbfilter1_dspr2(uint32_t p3, uint32_t p2,
+ uint32_t p1, uint32_t p0,
+ uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3,
+ uint32_t *op2_f1,
+ uint32_t *op1_f1, uint32_t *op0_f1,
+ uint32_t *oq0_f1, uint32_t *oq1_f1,
+ uint32_t *oq2_f1) {
+ /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
+ uint32_t res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2;
+ uint32_t tmp;
+ uint32_t add_p210_q012;
+ uint32_t u32Four = 0x00040004;
+
+ /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
+ /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
+ /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
+ /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
+ /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
+ /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
+
+ __asm__ __volatile__ (
+ "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
+ "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
+
+ "shll.ph %[tmp], %[p3], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op1], %[p3], %[p3] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
+ "addu.ph %[res_op0], %[p3], %[p0] \n\t"
+ "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
+ "shll.ph %[tmp], %[q3], 1 \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
+
+ : [add_p210_q012] "=&r" (add_p210_q012), [tmp] "=&r" (tmp),
+ [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
+ [res_op0] "=&r" (res_op0), [res_oq0] "=&r" (res_oq0),
+ [res_oq1] "=&r" (res_oq1), [res_oq2] "=&r" (res_oq2)
+ : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [q1] "r" (q1),
+ [p2] "r" (p2), [q2] "r" (q2), [p3] "r" (p3), [q3] "r" (q3),
+ [u32Four] "r" (u32Four)
+ );
+
+ *op2_f1 = res_op2;
+ *op1_f1 = res_op1;
+ *op0_f1 = res_op0;
+ *oq0_f1 = res_oq0;
+ *oq1_f1 = res_oq1;
+ *oq2_f1 = res_oq2;
+}
+
+static INLINE void vp9_wide_mbfilter_dspr2(uint32_t *op7, uint32_t *op6,
+ uint32_t *op5, uint32_t *op4,
+ uint32_t *op3, uint32_t *op2,
+ uint32_t *op1, uint32_t *op0,
+ uint32_t *oq0, uint32_t *oq1,
+ uint32_t *oq2, uint32_t *oq3,
+ uint32_t *oq4, uint32_t *oq5,
+ uint32_t *oq6, uint32_t *oq7) {
+ const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
+ const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+ const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
+ uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
+ uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
+ uint32_t tmp;
+ uint32_t add_p6toq6;
+ uint32_t u32Eight = 0x00080008;
+
+ __asm__ __volatile__ (
+ /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
+ which is used most of the time */
+ "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t"
+ "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t"
+
+ : [add_p6toq6] "=&r" (add_p6toq6)
+ : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
+ [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+ [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3),
+ [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
+ [u32Eight] "r" (u32Eight)
+ );
+
+ __asm__ __volatile__ (
+ /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
+ p3 + p2 + p1 + p0 + q0, 4) */
+ "shll.ph %[tmp], %[p7], 3 \n\t"
+ "subu.ph %[res_op6], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op6], %[res_op6], %[p6] \n\t"
+ "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q1] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q2] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q3] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q4] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q5] \n\t"
+ "subu.ph %[res_op6], %[res_op6], %[q6] \n\t"
+ "shrl.ph %[res_op6], %[res_op6], 4 \n\t"
+
+ /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
+ p2 + p1 + p0 + q0 + q1, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op5], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[p7] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[p5] \n\t"
+ "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q2] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q3] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q4] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q5] \n\t"
+ "subu.ph %[res_op5], %[res_op5], %[q6] \n\t"
+ "shrl.ph %[res_op5], %[res_op5], 4 \n\t"
+
+ /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
+ p1 + p0 + q0 + q1 + q2, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op4], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op4], %[res_op4], %[p4] \n\t"
+ "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q3] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q4] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q5] \n\t"
+ "subu.ph %[res_op4], %[res_op4], %[q6] \n\t"
+ "shrl.ph %[res_op4], %[res_op4], 4 \n\t"
+
+ /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
+ p1 + p0 + q0 + q1 + q2 + q3, 4) */
+ "shll.ph %[tmp], %[p7], 2 \n\t"
+ "addu.ph %[res_op3], %[tmp], %[p3] \n\t"
+ "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q4] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q5] \n\t"
+ "subu.ph %[res_op3], %[res_op3], %[q6] \n\t"
+ "shrl.ph %[res_op3], %[res_op3], 4 \n\t"
+
+ /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
+ p0 + q0 + q1 + q2 + q3 + q4, 4) */
+ "shll.ph %[tmp], %[p7], 1 \n\t"
+ "addu.ph %[res_op2], %[tmp], %[p7] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
+ "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q5] \n\t"
+ "subu.ph %[res_op2], %[res_op2], %[q6] \n\t"
+ "shrl.ph %[res_op2], %[res_op2], 4 \n\t"
+
+ /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
+ p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
+ "shll.ph %[tmp], %[p7], 1 \n\t"
+ "addu.ph %[res_op1], %[tmp], %[p1] \n\t"
+ "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t"
+ "subu.ph %[res_op1], %[res_op1], %[q6] \n\t"
+ "shrl.ph %[res_op1], %[res_op1], 4 \n\t"
+
+ /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
+ q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
+ "addu.ph %[res_op0], %[p7], %[p0] \n\t"
+ "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t"
+ "shrl.ph %[res_op0], %[res_op0], 4 \n\t"
+
+ : [res_op6] "=&r" (res_op6), [res_op5] "=&r" (res_op5),
+ [res_op4] "=&r" (res_op4), [res_op3] "=&r" (res_op3),
+ [res_op2] "=&r" (res_op2), [res_op1] "=&r" (res_op1),
+ [res_op0] "=&r" (res_op0), [tmp] "=&r" (tmp)
+ : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4),
+ [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+ [q2] "r" (q2), [q1] "r" (q1),
+ [q3] "r" (q3), [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6),
+ [add_p6toq6] "r" (add_p6toq6)
+ );
+
+ *op6 = res_op6;
+ *op5 = res_op5;
+ *op4 = res_op4;
+ *op3 = res_op3;
+ *op2 = res_op2;
+ *op1 = res_op1;
+ *op0 = res_op0;
+
+ __asm__ __volatile__ (
+ /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
+ q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
+ "addu.ph %[res_oq0], %[q7], %[q0] \n\t"
+ "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t"
+ "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t"
+
+ /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
+ q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
+ "shll.ph %[tmp], %[q7], 1 \n\t"
+ "addu.ph %[res_oq1], %[tmp], %[q1] \n\t"
+ "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t"
+ "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t"
+
+ /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
+ q3 + q4 + q5 + q6 + q7 * 3, 4) */
+ "shll.ph %[tmp], %[q7], 1 \n\t"
+ "addu.ph %[res_oq2], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
+ "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t"
+ "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t"
+ "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t"
+
+ /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
+ q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq3], %[tmp], %[q3] \n\t"
+ "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t"
+ "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t"
+ "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t"
+
+ /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
+ q4 * 2 + q5 + q6 + q7 * 5, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq4], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t"
+ "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t"
+ "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t"
+ "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t"
+
+ /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
+ q5 * 2 + q6 + q7 * 6, 4) */
+ "shll.ph %[tmp], %[q7], 2 \n\t"
+ "addu.ph %[res_oq5], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t"
+ "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t"
+ "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t"
+ "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t"
+
+ /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
+ q4 + q5 + q6 * 2 + q7 * 7, 4) */
+ "shll.ph %[tmp], %[q7], 3 \n\t"
+ "subu.ph %[res_oq6], %[tmp], %[q7] \n\t"
+ "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t"
+ "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t"
+ "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t"
+ "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t"
+
+ : [res_oq6] "=&r" (res_oq6), [res_oq5] "=&r" (res_oq5),
+ [res_oq4] "=&r" (res_oq4), [res_oq3] "=&r" (res_oq3),
+ [res_oq2] "=&r" (res_oq2), [res_oq1] "=&r" (res_oq1),
+ [res_oq0] "=&r" (res_oq0), [tmp] "=&r" (tmp)
+ : [q7] "r" (q7), [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4),
+ [q3] "r" (q3), [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
+ [p1] "r" (p1), [p2] "r" (p2),
+ [p3] "r" (p3), [p4] "r" (p4), [p5] "r" (p5), [p6] "r" (p6),
+ [add_p6toq6] "r" (add_p6toq6)
+ );
+
+ *oq0 = res_oq0;
+ *oq1 = res_oq1;
+ *oq2 = res_oq2;
+ *oq3 = res_oq3;
+ *oq4 = res_oq4;
+ *oq5 = res_oq5;
+ *oq6 = res_oq6;
+}
+#endif // #if HAVE_DSPR2
+#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_FILTERS_DSPR2_H_
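
vp9_mbfilter_dspr2(), vp9_mbfilter1_dspr2() and vp9_wide_mbfilter_dspr2() above all follow the same pattern: build the sum shared by every output once (add_p210_q012 or add_p6toq6, rounding constant included), then add or subtract the few taps that differ for each output and shift. A scalar sketch of the 8-pixel flat filter those ROUND_POWER_OF_TWO comments describe (illustrative only; names are hypothetical):

#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

/* [1, 1, 1, 2, 1, 1, 1] tap across p3..q3, exactly as listed in the comments. */
static void flat_filter8_sketch(int p3, int p2, int p1, int p0,
                                int q0, int q1, int q2, int q3,
                                uint8_t *op2, uint8_t *op1, uint8_t *op0,
                                uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
  *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3);
  *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3);
  *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3);
  *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3);
  *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3);
  *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3);
}
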
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h b/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
new file mode 100644
index 0000000..4cb2ebb
--- /dev/null
+++ b/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
@@ -0,0 +1,470 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#if HAVE_DSPR2
+#define STORE_F0() { \
+ __asm__ __volatile__ ( \
+ "sb %[q1_f0], 1(%[s4]) \n\t" \
+ "sb %[q0_f0], 0(%[s4]) \n\t" \
+ "sb %[p0_f0], -1(%[s4]) \n\t" \
+ "sb %[p1_f0], -2(%[s4]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
+ [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \
+ [s4] "r" (s4) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \
+ [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q1_f0], 1(%[s3]) \n\t" \
+ "sb %[q0_f0], 0(%[s3]) \n\t" \
+ "sb %[p0_f0], -1(%[s3]) \n\t" \
+ "sb %[p1_f0], -2(%[s3]) \n\t" \
+ \
+ : [p1_f0] "+r" (p1_f0) \
+ : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
+ [s3] "r" (s3), [p0_f0] "r" (p0_f0) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \
+ [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q1_f0], 1(%[s2]) \n\t" \
+ "sb %[q0_f0], 0(%[s2]) \n\t" \
+ "sb %[p0_f0], -1(%[s2]) \n\t" \
+ "sb %[p1_f0], -2(%[s2]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
+ [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \
+ [s2] "r" (s2) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q1_f0], %[q1_f0], 8 \n\t" \
+ "srl %[q0_f0], %[q0_f0], 8 \n\t" \
+ "srl %[p0_f0], %[p0_f0], 8 \n\t" \
+ "srl %[p1_f0], %[p1_f0], 8 \n\t" \
+ \
+ : [q1_f0] "+r" (q1_f0), [q0_f0] "+r" (q0_f0), \
+ [p0_f0] "+r" (p0_f0), [p1_f0] "+r" (p1_f0) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q1_f0], 1(%[s1]) \n\t" \
+ "sb %[q0_f0], 0(%[s1]) \n\t" \
+ "sb %[p0_f0], -1(%[s1]) \n\t" \
+ "sb %[p1_f0], -2(%[s1]) \n\t" \
+ \
+ : \
+ : [q1_f0] "r" (q1_f0), [q0_f0] "r" (q0_f0), \
+ [p0_f0] "r" (p0_f0), [p1_f0] "r" (p1_f0), \
+ [s1] "r" (s1) \
+ ); \
+}
+
+#define STORE_F1() { \
+ __asm__ __volatile__ ( \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ \
+ : \
+ : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \
+ [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
+ [s4] "r" (s4) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ \
+ : [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), [q0_r] "+r" (q0_r), \
+ [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ \
+ : \
+ : [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), [q0_r] "r" (q0_r), \
+ [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
+ [s3] "r" (s3) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ \
+ : \
+ : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \
+ [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
+ [s2] "r" (s2) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ \
+ : [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), [q0_l] "+r" (q0_l), \
+ [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ \
+ : \
+ : [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), [q0_l] "r" (q0_l), \
+ [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
+ [s1] "r" (s1) \
+ ); \
+}
+
+#define STORE_F2() { \
+ __asm__ __volatile__ ( \
+ "sb %[q6_r], 6(%[s4]) \n\t" \
+ "sb %[q5_r], 5(%[s4]) \n\t" \
+ "sb %[q4_r], 4(%[s4]) \n\t" \
+ "sb %[q3_r], 3(%[s4]) \n\t" \
+ "sb %[q2_r], 2(%[s4]) \n\t" \
+ "sb %[q1_r], 1(%[s4]) \n\t" \
+ "sb %[q0_r], 0(%[s4]) \n\t" \
+ "sb %[p0_r], -1(%[s4]) \n\t" \
+ "sb %[p1_r], -2(%[s4]) \n\t" \
+ "sb %[p2_r], -3(%[s4]) \n\t" \
+ "sb %[p3_r], -4(%[s4]) \n\t" \
+ "sb %[p4_r], -5(%[s4]) \n\t" \
+ "sb %[p5_r], -6(%[s4]) \n\t" \
+ "sb %[p6_r], -7(%[s4]) \n\t" \
+ \
+ : \
+ : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \
+ [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \
+ [q0_r] "r" (q0_r), \
+ [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
+ [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \
+ [p6_r] "r" (p6_r), \
+ [s4] "r" (s4) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q6_r], %[q6_r], 16 \n\t" \
+ "srl %[q5_r], %[q5_r], 16 \n\t" \
+ "srl %[q4_r], %[q4_r], 16 \n\t" \
+ "srl %[q3_r], %[q3_r], 16 \n\t" \
+ "srl %[q2_r], %[q2_r], 16 \n\t" \
+ "srl %[q1_r], %[q1_r], 16 \n\t" \
+ "srl %[q0_r], %[q0_r], 16 \n\t" \
+ "srl %[p0_r], %[p0_r], 16 \n\t" \
+ "srl %[p1_r], %[p1_r], 16 \n\t" \
+ "srl %[p2_r], %[p2_r], 16 \n\t" \
+ "srl %[p3_r], %[p3_r], 16 \n\t" \
+ "srl %[p4_r], %[p4_r], 16 \n\t" \
+ "srl %[p5_r], %[p5_r], 16 \n\t" \
+ "srl %[p6_r], %[p6_r], 16 \n\t" \
+ \
+ : [q6_r] "+r" (q6_r), [q5_r] "+r" (q5_r), [q4_r] "+r" (q4_r), \
+ [q3_r] "+r" (q3_r), [q2_r] "+r" (q2_r), [q1_r] "+r" (q1_r), \
+ [q0_r] "+r" (q0_r), \
+ [p0_r] "+r" (p0_r), [p1_r] "+r" (p1_r), [p2_r] "+r" (p2_r), \
+ [p3_r] "+r" (p3_r), [p4_r] "+r" (p4_r), [p5_r] "+r" (p5_r), \
+ [p6_r] "+r" (p6_r) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q6_r], 6(%[s3]) \n\t" \
+ "sb %[q5_r], 5(%[s3]) \n\t" \
+ "sb %[q4_r], 4(%[s3]) \n\t" \
+ "sb %[q3_r], 3(%[s3]) \n\t" \
+ "sb %[q2_r], 2(%[s3]) \n\t" \
+ "sb %[q1_r], 1(%[s3]) \n\t" \
+ "sb %[q0_r], 0(%[s3]) \n\t" \
+ "sb %[p0_r], -1(%[s3]) \n\t" \
+ "sb %[p1_r], -2(%[s3]) \n\t" \
+ "sb %[p2_r], -3(%[s3]) \n\t" \
+ "sb %[p3_r], -4(%[s3]) \n\t" \
+ "sb %[p4_r], -5(%[s3]) \n\t" \
+ "sb %[p5_r], -6(%[s3]) \n\t" \
+ "sb %[p6_r], -7(%[s3]) \n\t" \
+ \
+ : \
+ : [q6_r] "r" (q6_r), [q5_r] "r" (q5_r), [q4_r] "r" (q4_r), \
+ [q3_r] "r" (q3_r), [q2_r] "r" (q2_r), [q1_r] "r" (q1_r), \
+ [q0_r] "r" (q0_r), \
+ [p0_r] "r" (p0_r), [p1_r] "r" (p1_r), [p2_r] "r" (p2_r), \
+ [p3_r] "r" (p3_r), [p4_r] "r" (p4_r), [p5_r] "r" (p5_r), \
+ [p6_r] "r" (p6_r), \
+ [s3] "r" (s3) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q6_l], 6(%[s2]) \n\t" \
+ "sb %[q5_l], 5(%[s2]) \n\t" \
+ "sb %[q4_l], 4(%[s2]) \n\t" \
+ "sb %[q3_l], 3(%[s2]) \n\t" \
+ "sb %[q2_l], 2(%[s2]) \n\t" \
+ "sb %[q1_l], 1(%[s2]) \n\t" \
+ "sb %[q0_l], 0(%[s2]) \n\t" \
+ "sb %[p0_l], -1(%[s2]) \n\t" \
+ "sb %[p1_l], -2(%[s2]) \n\t" \
+ "sb %[p2_l], -3(%[s2]) \n\t" \
+ "sb %[p3_l], -4(%[s2]) \n\t" \
+ "sb %[p4_l], -5(%[s2]) \n\t" \
+ "sb %[p5_l], -6(%[s2]) \n\t" \
+ "sb %[p6_l], -7(%[s2]) \n\t" \
+ \
+ : \
+ : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \
+ [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \
+ [q0_l] "r" (q0_l), \
+ [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
+ [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \
+ [p6_l] "r" (p6_l), \
+ [s2] "r" (s2) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "srl %[q6_l], %[q6_l], 16 \n\t" \
+ "srl %[q5_l], %[q5_l], 16 \n\t" \
+ "srl %[q4_l], %[q4_l], 16 \n\t" \
+ "srl %[q3_l], %[q3_l], 16 \n\t" \
+ "srl %[q2_l], %[q2_l], 16 \n\t" \
+ "srl %[q1_l], %[q1_l], 16 \n\t" \
+ "srl %[q0_l], %[q0_l], 16 \n\t" \
+ "srl %[p0_l], %[p0_l], 16 \n\t" \
+ "srl %[p1_l], %[p1_l], 16 \n\t" \
+ "srl %[p2_l], %[p2_l], 16 \n\t" \
+ "srl %[p3_l], %[p3_l], 16 \n\t" \
+ "srl %[p4_l], %[p4_l], 16 \n\t" \
+ "srl %[p5_l], %[p5_l], 16 \n\t" \
+ "srl %[p6_l], %[p6_l], 16 \n\t" \
+ \
+ : [q6_l] "+r" (q6_l), [q5_l] "+r" (q5_l), [q4_l] "+r" (q4_l), \
+ [q3_l] "+r" (q3_l), [q2_l] "+r" (q2_l), [q1_l] "+r" (q1_l), \
+ [q0_l] "+r" (q0_l), \
+ [p0_l] "+r" (p0_l), [p1_l] "+r" (p1_l), [p2_l] "+r" (p2_l), \
+ [p3_l] "+r" (p3_l), [p4_l] "+r" (p4_l), [p5_l] "+r" (p5_l), \
+ [p6_l] "+r" (p6_l) \
+ : \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "sb %[q6_l], 6(%[s1]) \n\t" \
+ "sb %[q5_l], 5(%[s1]) \n\t" \
+ "sb %[q4_l], 4(%[s1]) \n\t" \
+ "sb %[q3_l], 3(%[s1]) \n\t" \
+ "sb %[q2_l], 2(%[s1]) \n\t" \
+ "sb %[q1_l], 1(%[s1]) \n\t" \
+ "sb %[q0_l], 0(%[s1]) \n\t" \
+ "sb %[p0_l], -1(%[s1]) \n\t" \
+ "sb %[p1_l], -2(%[s1]) \n\t" \
+ "sb %[p2_l], -3(%[s1]) \n\t" \
+ "sb %[p3_l], -4(%[s1]) \n\t" \
+ "sb %[p4_l], -5(%[s1]) \n\t" \
+ "sb %[p5_l], -6(%[s1]) \n\t" \
+ "sb %[p6_l], -7(%[s1]) \n\t" \
+ \
+ : \
+ : [q6_l] "r" (q6_l), [q5_l] "r" (q5_l), [q4_l] "r" (q4_l), \
+ [q3_l] "r" (q3_l), [q2_l] "r" (q2_l), [q1_l] "r" (q1_l), \
+ [q0_l] "r" (q0_l), \
+ [p0_l] "r" (p0_l), [p1_l] "r" (p1_l), [p2_l] "r" (p2_l), \
+ [p3_l] "r" (p3_l), [p4_l] "r" (p4_l), [p5_l] "r" (p5_l), \
+ [p6_l] "r" (p6_l), \
+ [s1] "r" (s1) \
+ ); \
+}
+
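+/* preceu.ph.qbl zero-extends the two most-significant bytes of each packed
+ * word into halfwords, giving the "left" pair of pixels 16 bits of headroom
+ * for the filter arithmetic. */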
+#define PACK_LEFT_0TO3() { \
+ __asm__ __volatile__ ( \
+ "preceu.ph.qbl %[p3_l], %[p3] \n\t" \
+ "preceu.ph.qbl %[p2_l], %[p2] \n\t" \
+ "preceu.ph.qbl %[p1_l], %[p1] \n\t" \
+ "preceu.ph.qbl %[p0_l], %[p0] \n\t" \
+ "preceu.ph.qbl %[q0_l], %[q0] \n\t" \
+ "preceu.ph.qbl %[q1_l], %[q1] \n\t" \
+ "preceu.ph.qbl %[q2_l], %[q2] \n\t" \
+ "preceu.ph.qbl %[q3_l], %[q3] \n\t" \
+ \
+ : [p3_l] "=&r" (p3_l), [p2_l] "=&r" (p2_l), \
+ [p1_l] "=&r" (p1_l), [p0_l] "=&r" (p0_l), \
+ [q0_l] "=&r" (q0_l), [q1_l] "=&r" (q1_l), \
+ [q2_l] "=&r" (q2_l), [q3_l] "=&r" (q3_l) \
+ : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \
+ [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \
+ ); \
+}
+
+#define PACK_LEFT_4TO7() { \
+ __asm__ __volatile__ ( \
+ "preceu.ph.qbl %[p7_l], %[p7] \n\t" \
+ "preceu.ph.qbl %[p6_l], %[p6] \n\t" \
+ "preceu.ph.qbl %[p5_l], %[p5] \n\t" \
+ "preceu.ph.qbl %[p4_l], %[p4] \n\t" \
+ "preceu.ph.qbl %[q4_l], %[q4] \n\t" \
+ "preceu.ph.qbl %[q5_l], %[q5] \n\t" \
+ "preceu.ph.qbl %[q6_l], %[q6] \n\t" \
+ "preceu.ph.qbl %[q7_l], %[q7] \n\t" \
+ \
+ : [p7_l] "=&r" (p7_l), [p6_l] "=&r" (p6_l), \
+ [p5_l] "=&r" (p5_l), [p4_l] "=&r" (p4_l), \
+ [q4_l] "=&r" (q4_l), [q5_l] "=&r" (q5_l), \
+ [q6_l] "=&r" (q6_l), [q7_l] "=&r" (q7_l) \
+ : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \
+ [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \
+ ); \
+}
+
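+/* preceu.ph.qbr does the same for the two least-significant bytes (the
+ * "right" pair of pixels). */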
+#define PACK_RIGHT_0TO3() { \
+ __asm__ __volatile__ ( \
+ "preceu.ph.qbr %[p3_r], %[p3] \n\t" \
+ "preceu.ph.qbr %[p2_r], %[p2] \n\t" \
+ "preceu.ph.qbr %[p1_r], %[p1] \n\t" \
+ "preceu.ph.qbr %[p0_r], %[p0] \n\t" \
+ "preceu.ph.qbr %[q0_r], %[q0] \n\t" \
+ "preceu.ph.qbr %[q1_r], %[q1] \n\t" \
+ "preceu.ph.qbr %[q2_r], %[q2] \n\t" \
+ "preceu.ph.qbr %[q3_r], %[q3] \n\t" \
+ \
+ : [p3_r] "=&r" (p3_r), [p2_r] "=&r" (p2_r), \
+ [p1_r] "=&r" (p1_r), [p0_r] "=&r" (p0_r), \
+ [q0_r] "=&r" (q0_r), [q1_r] "=&r" (q1_r), \
+ [q2_r] "=&r" (q2_r), [q3_r] "=&r" (q3_r) \
+ : [p3] "r" (p3), [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0), \
+ [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2), [q3] "r" (q3) \
+ ); \
+}
+
+#define PACK_RIGHT_4TO7() { \
+ __asm__ __volatile__ ( \
+ "preceu.ph.qbr %[p7_r], %[p7] \n\t" \
+ "preceu.ph.qbr %[p6_r], %[p6] \n\t" \
+ "preceu.ph.qbr %[p5_r], %[p5] \n\t" \
+ "preceu.ph.qbr %[p4_r], %[p4] \n\t" \
+ "preceu.ph.qbr %[q4_r], %[q4] \n\t" \
+ "preceu.ph.qbr %[q5_r], %[q5] \n\t" \
+ "preceu.ph.qbr %[q6_r], %[q6] \n\t" \
+ "preceu.ph.qbr %[q7_r], %[q7] \n\t" \
+ \
+ : [p7_r] "=&r" (p7_r), [p6_r] "=&r" (p6_r), \
+ [p5_r] "=&r" (p5_r), [p4_r] "=&r" (p4_r), \
+ [q4_r] "=&r" (q4_r), [q5_r] "=&r" (q5_r), \
+ [q6_r] "=&r" (q6_r), [q7_r] "=&r" (q7_r) \
+ : [p7] "r" (p7), [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), \
+ [q4] "r" (q4), [q5] "r" (q5), [q6] "r" (q6), [q7] "r" (q7) \
+ ); \
+}
+
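+/* precr.qb.ph takes the low byte of each halfword result and repacks the
+ * left and right pairs back into four pixels per word. */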
+#define COMBINE_LEFT_RIGHT_0TO2() { \
+ __asm__ __volatile__ ( \
+ "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \
+ "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \
+ "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \
+ "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \
+ "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \
+ "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \
+ \
+ : [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0), \
+ [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2) \
+ : [p2_l] "r" (p2_l), [p2_r] "r" (p2_r), \
+ [p1_l] "r" (p1_l), [p1_r] "r" (p1_r), \
+ [p0_l] "r" (p0_l), [p0_r] "r" (p0_r), \
+ [q0_l] "r" (q0_l), [q0_r] "r" (q0_r), \
+ [q1_l] "r" (q1_l), [q1_r] "r" (q1_r), \
+ [q2_l] "r" (q2_l), [q2_r] "r" (q2_r) \
+ ); \
+}
+
+#define COMBINE_LEFT_RIGHT_3TO6() { \
+ __asm__ __volatile__ ( \
+ "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \
+ "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \
+ "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \
+ "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \
+ "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \
+ "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \
+ "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \
+ "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \
+ \
+ : [p6] "=&r" (p6),[p5] "=&r" (p5), \
+ [p4] "=&r" (p4),[p3] "=&r" (p3), \
+ [q3] "=&r" (q3),[q4] "=&r" (q4), \
+ [q5] "=&r" (q5),[q6] "=&r" (q6) \
+ : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), \
+ [p4_l] "r" (p4_l), [p3_l] "r" (p3_l), \
+ [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), \
+ [p4_r] "r" (p4_r), [p3_r] "r" (p3_r), \
+ [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), \
+ [q5_l] "r" (q5_l), [q6_l] "r" (q6_l), \
+ [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), \
+ [q5_r] "r" (q5_r), [q6_r] "r" (q6_r) \
+ ); \
+}
+
+#endif // #if HAVE_DSPR2
+#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MACROS_DSPR2_H_
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h b/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
new file mode 100644
index 0000000..b9e0aca
--- /dev/null
+++ b/source/libvpx/vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
+#define VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_onyxc_int.h"
+
+#if HAVE_DSPR2
+/* processing 4 pixels at the same time
+ * compute hev and mask in the same function */
+static INLINE void vp9_filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
+ uint32_t p1, uint32_t p0,
+ uint32_t p3, uint32_t p2,
+ uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3,
+ uint32_t thresh, uint32_t *hev,
+ uint32_t *mask) {
+ uint32_t c, r, r3, r_k;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t hev1;
+
+ __asm__ __volatile__ (
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k),
+ [r] "=&r" (r), [r3] "=&r" (r3)
+ : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+ [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+ [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh)
+ );
+
+ __asm__ __volatile__ (
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+ [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+ : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+ [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+ );
+
+ *hev = hev1;
+ *mask = s2;
+}
+
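+/* Same hev/mask computation as above, additionally building the flat mask:
+ * a byte of flat is set only when |p1-p0|, |q1-q0|, |p0-p2|, |q0-q2|,
+ * |p3-p0| and |q3-q0| are all <= 1 for that pixel. */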
+static INLINE void vp9_filter_hev_mask_flatmask4_dspr2(uint32_t limit,
+ uint32_t flimit,
+ uint32_t thresh,
+ uint32_t p1, uint32_t p0,
+ uint32_t p3, uint32_t p2,
+ uint32_t q0, uint32_t q1,
+ uint32_t q2, uint32_t q3,
+ uint32_t *hev,
+ uint32_t *mask,
+ uint32_t *flat) {
+ uint32_t c, r, r3, r_k, r_flat;
+ uint32_t s1, s2, s3;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t hev1;
+ uint32_t flat1;
+
+ __asm__ __volatile__ (
+ /* mask |= (abs(p3 - p2) > limit) */
+ "subu_s.qb %[c], %[p3], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* mask |= (abs(p2 - p1) > limit) */
+ "subu_s.qb %[c], %[p2], %[p1] \n\t"
+ "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ /* mask |= (abs(p1 - p0) > limit)
+ * hev |= (abs(p1 - p0) > thresh)
+ * flat |= (abs(p1 - p0) > thresh)
+ */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], $0, %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], $0, %[c] \n\t"
+
+ /* mask |= (abs(q1 - q0) > limit)
+ * hev |= (abs(q1 - q0) > thresh)
+ * flat |= (abs(q1 - q0) > thresh)
+ */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
+ "or %[r3], %[r3], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p0 - p2) > thresh) */
+ "subu_s.qb %[c], %[p0], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q0 - q2) > thresh) */
+ "subu_s.qb %[c], %[q0], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p3 - p0) > thresh) */
+ "subu_s.qb %[c], %[p3], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q3 - q0) > thresh) */
+ "subu_s.qb %[c], %[q3], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+ "sll %[r_flat], %[r_flat], 24 \n\t"
+ /* look at stall here */
+ "wrdsp %[r_flat] \n\t"
+ "pick.qb %[flat1], $0, %[ones] \n\t"
+
+ /* mask |= (abs(q2 - q1) > limit) */
+ "subu_s.qb %[c], %[q2], %[q1] \n\t"
+ "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r3], %[r3], 24 \n\t"
+
+ /* mask |= (abs(q3 - q2) > limit) */
+ "subu_s.qb %[c], %[q3], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r), [r3] "=&r" (r3),
+ [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1)
+ : [limit] "r" (limit), [p3] "r" (p3), [p2] "r" (p2),
+ [p1] "r" (p1), [p0] "r" (p0), [q1] "r" (q1), [q0] "r" (q0),
+ [q2] "r" (q2), [q3] "r" (q3), [thresh] "r" (thresh),
+ [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
+ );
+
+ __asm__ __volatile__ (
+ /* abs(p0 - q0) */
+ "subu_s.qb %[c], %[p0], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
+ "wrdsp %[r3] \n\t"
+ "or %[s1], %[r_k], %[c] \n\t"
+
+ /* abs(p1 - q1) */
+ "subu_s.qb %[c], %[p1], %[q1] \n\t"
+ "addu_s.qb %[s3], %[s1], %[s1] \n\t"
+ "pick.qb %[hev1], %[ones], $0 \n\t"
+ "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
+ "or %[s2], %[r_k], %[c] \n\t"
+
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
+ "shrl.qb %[s2], %[s2], 1 \n\t"
+ "addu_s.qb %[s1], %[s2], %[s3] \n\t"
+ "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+
+ "wrdsp %[r] \n\t"
+ "pick.qb %[s2], $0, %[ones] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k), [s1] "=&r" (s1), [hev1] "=&r" (hev1),
+ [s2] "=&r" (s2), [r] "+r" (r), [s3] "=&r" (s3)
+ : [p0] "r" (p0), [q0] "r" (q0), [p1] "r" (p1), [r3] "r" (r3),
+ [q1] "r" (q1), [ones] "r" (ones), [flimit] "r" (flimit)
+ );
+
+ *hev = hev1;
+ *mask = s2;
+ *flat = flat1;
+}
+
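+/* flat2 mask for the wide filter: a byte is set only when |p4-p0|, |q4-q0|
+ * and every |pX-p0| / |qX-q0| for X = 1..3 are all <= 1. */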
+static INLINE void vp9_flatmask5(uint32_t p4, uint32_t p3,
+ uint32_t p2, uint32_t p1,
+ uint32_t p0, uint32_t q0,
+ uint32_t q1, uint32_t q2,
+ uint32_t q3, uint32_t q4,
+ uint32_t *flat2) {
+ uint32_t c, r, r_k, r_flat;
+ uint32_t ones = 0xFFFFFFFF;
+ uint32_t flat_thresh = 0x01010101;
+ uint32_t flat1, flat3;
+
+ __asm__ __volatile__ (
+ /* flat |= (abs(p4 - p0) > thresh) */
+ "subu_s.qb %[c], %[p4], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p4] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r], $0, %[c] \n\t"
+
+ /* flat |= (abs(q4 - q0) > thresh) */
+ "subu_s.qb %[c], %[q4], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q4] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r], %[r], %[c] \n\t"
+ "sll %[r], %[r], 24 \n\t"
+ "wrdsp %[r] \n\t"
+ "pick.qb %[flat3], $0, %[ones] \n\t"
+
+ /* flat |= (abs(p1 - p0) > thresh) */
+ "subu_s.qb %[c], %[p1], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], $0, %[c] \n\t"
+
+ /* flat |= (abs(q1 - q0) > thresh) */
+ "subu_s.qb %[c], %[q1], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p0 - p2) > thresh) */
+ "subu_s.qb %[c], %[p0], %[p2] \n\t"
+ "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q0 - q2) > thresh) */
+ "subu_s.qb %[c], %[q0], %[q2] \n\t"
+ "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(p3 - p0) > thresh) */
+ "subu_s.qb %[c], %[p3], %[p0] \n\t"
+ "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+
+ /* flat |= (abs(q3 - q0) > thresh) */
+ "subu_s.qb %[c], %[q3], %[q0] \n\t"
+ "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
+ "or %[r_k], %[r_k], %[c] \n\t"
+ "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
+ "or %[r_flat], %[r_flat], %[c] \n\t"
+ "sll %[r_flat], %[r_flat], 24 \n\t"
+ "wrdsp %[r_flat] \n\t"
+ "pick.qb %[flat1], $0, %[ones] \n\t"
+ /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
+ "and %[flat1], %[flat3], %[flat1] \n\t"
+
+ : [c] "=&r" (c), [r_k] "=&r" (r_k), [r] "=&r" (r),
+ [r_flat] "=&r" (r_flat), [flat1] "=&r" (flat1), [flat3] "=&r" (flat3)
+ : [p4] "r" (p4), [p3] "r" (p3), [p2] "r" (p2),
+ [p1] "r" (p1), [p0] "r" (p0), [q0] "r" (q0), [q1] "r" (q1),
+ [q2] "r" (q2), [q3] "r" (q3), [q4] "r" (q4),
+ [flat_thresh] "r" (flat_thresh), [ones] "r" (ones)
+ );
+
+ *flat2 = flat1;
+}
+#endif // #if HAVE_DSPR2
+#endif // VP9_COMMON_MIPS_DSPR2_VP9_LOOPFILTER_MASKS_DSPR2_H_
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c b/source/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
new file mode 100644
index 0000000..adfd755
--- /dev/null
+++ b/source/libvpx/vp9/common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_mbloop_filter_horizontal_edge_dspr2(unsigned char *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count) {
+ uint32_t mask;
+ uint32_t hev, flat;
+ uint8_t i;
+ uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ /* prefetch data for store */
+ vp9_prefetch_store(s);
+
+ for (i = 0; i < 2; i++) {
+ sp3 = s - (pitch << 2);
+ sp2 = sp3 + pitch;
+ sp1 = sp2 + pitch;
+ sp0 = sp1 + pitch;
+ sq0 = s;
+ sq1 = s + pitch;
+ sq2 = sq1 + pitch;
+ sq3 = sq2 + pitch;
+
+ __asm__ __volatile__ (
+ "lw %[p3], (%[sp3]) \n\t"
+ "lw %[p2], (%[sp2]) \n\t"
+ "lw %[p1], (%[sp1]) \n\t"
+ "lw %[p0], (%[sp0]) \n\t"
+ "lw %[q0], (%[sq0]) \n\t"
+ "lw %[q1], (%[sq1]) \n\t"
+ "lw %[q2], (%[sq2]) \n\t"
+ "lw %[q3], (%[sq3]) \n\t"
+
+ : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+ [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0)
+ : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
+ );
+
+ vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+ p1, p0, p3, p2, q0, q1, q2, q3,
+ &hev, &mask, &flat);
+
+ if ((flat == 0) && (mask != 0)) {
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ __asm__ __volatile__ (
+ "sw %[p1_f0], (%[sp1]) \n\t"
+ "sw %[p0_f0], (%[sp0]) \n\t"
+ "sw %[q0_f0], (%[sq0]) \n\t"
+ "sw %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ } else if ((mask & flat) == 0xFFFFFFFF) {
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+
+ __asm__ __volatile__ (
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+
+ :
+ : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+ [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if ((flat != 0) && (mask != 0)) {
+ /* filtering */
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
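+      /* Each *_r register holds two 16-bit results and each *_f0 register
+       * four 8-bit results per word; shift by 16 / 8 to move the value for
+       * the next pixel into the low byte before the next store. */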
+ __asm__ __volatile__ (
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+ [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
+ [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+ [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+ }
+
+ s = s + 4;
+ }
+}
+
+void vp9_mbloop_filter_vertical_edge_dspr2(unsigned char *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count) {
+ uint8_t i;
+ uint32_t mask, hev, flat;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
+ uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ vp9_prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ __asm__ __volatile__ (
+ "lw %[p0], -4(%[s1]) \n\t"
+ "lw %[p1], -4(%[s2]) \n\t"
+ "lw %[p2], -4(%[s3]) \n\t"
+ "lw %[p3], -4(%[s4]) \n\t"
+ "lw %[q3], (%[s1]) \n\t"
+ "lw %[q2], (%[s2]) \n\t"
+ "lw %[q1], (%[s3]) \n\t"
+ "lw %[q0], (%[s4]) \n\t"
+
+ : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+ [q0] "=&r" (q0), [q1] "=&r" (q1), [q2] "=&r" (q2), [q3] "=&r" (q3)
+ : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+ );
+
+ /* transpose p3, p2, p1, p0
+ original (when loaded from memory)
+ register -4 -3 -2 -1
+ p0 p0_0 p0_1 p0_2 p0_3
+ p1 p1_0 p1_1 p1_2 p1_3
+ p2 p2_0 p2_1 p2_2 p2_3
+ p3 p3_0 p3_1 p3_2 p3_3
+
+ after transpose
+ register
+ p0 p3_3 p2_3 p1_3 p0_3
+ p1 p3_2 p2_2 p1_2 p0_2
+ p2 p3_1 p2_1 p1_1 p0_1
+ p3 p3_0 p2_0 p1_0 p0_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose q0, q1, q2, q3
+ original (when loaded from memory)
+ register +1 +2 +3 +4
+ q3 q3_0 q3_1 q3_2 q3_3
+ q2 q2_0 q2_1 q2_2 q2_3
+ q1 q1_0 q1_1 q1_2 q1_3
+ q0 q0_0 q0_1 q0_2 q0_3
+
+ after transpose
+ register
+ q3 q0_3 q1_3 q2_3 q3_3
+ q2 q0_2 q1_2 q2_2 q3_2
+ q1 q0_1 q1_1 q2_1 q3_1
+ q0 q0_0 q1_0 q2_0 q3_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
+ "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
+ "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
+ "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
+
+ "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
+ "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
+ "append %[q2], %[sec3], 16 \n\t"
+ "append %[q0], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+ p1, p0, p3, p2, q0, q1, q2, q3,
+ &hev, &mask, &flat);
+
+ if ((flat == 0) && (mask != 0)) {
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ STORE_F0()
+ } else if ((mask & flat) == 0xFFFFFFFF) {
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ STORE_F1()
+ } else if ((flat != 0) && (mask != 0)) {
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [s4] "r" (s4)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s4] "r" (s4)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+ [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [s3] "r" (s3)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s3] "r" (s3)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2] "+r" (p2), [p1] "+r" (p1), [p0] "+r" (p0),
+ [q0] "+r" (q0), [q1] "+r" (q1), [q2] "+r" (q2),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [s2] "r" (s2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s2] "r" (s2)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+ [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], +1(%[s1]) \n\t"
+ "sb %[q2_l], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [s1] "r" (s1)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+ [q1_f0] "r" (q1_f0), [s1] "r" (s1)
+ );
+ }
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c b/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
new file mode 100644
index 0000000..0759755
--- /dev/null
+++ b/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
@@ -0,0 +1,795 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_mb_lpf_horizontal_edge_w_dspr2(unsigned char *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count) {
+ uint32_t mask;
+ uint32_t hev, flat, flat2;
+ uint8_t i;
+ uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
+ uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ /* prefetch data for store */
+ vp9_prefetch_store(s);
+
+ for (i = 0; i < (2 * count); i++) {
+ sp7 = s - (pitch << 3);
+ sp6 = sp7 + pitch;
+ sp5 = sp6 + pitch;
+ sp4 = sp5 + pitch;
+ sp3 = sp4 + pitch;
+ sp2 = sp3 + pitch;
+ sp1 = sp2 + pitch;
+ sp0 = sp1 + pitch;
+ sq0 = s;
+ sq1 = s + pitch;
+ sq2 = sq1 + pitch;
+ sq3 = sq2 + pitch;
+ sq4 = sq3 + pitch;
+ sq5 = sq4 + pitch;
+ sq6 = sq5 + pitch;
+ sq7 = sq6 + pitch;
+
+ __asm__ __volatile__ (
+ "lw %[p7], (%[sp7]) \n\t"
+ "lw %[p6], (%[sp6]) \n\t"
+ "lw %[p5], (%[sp5]) \n\t"
+ "lw %[p4], (%[sp4]) \n\t"
+ "lw %[p3], (%[sp3]) \n\t"
+ "lw %[p2], (%[sp2]) \n\t"
+ "lw %[p1], (%[sp1]) \n\t"
+ "lw %[p0], (%[sp0]) \n\t"
+
+ : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1), [p0] "=&r" (p0),
+ [p7] "=&r" (p7), [p6] "=&r" (p6), [p5] "=&r" (p5), [p4] "=&r" (p4)
+ : [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sp4] "r" (sp4), [sp5] "r" (sp5), [sp6] "r" (sp6), [sp7] "r" (sp7)
+ );
+
+ __asm__ __volatile__ (
+ "lw %[q0], (%[sq0]) \n\t"
+ "lw %[q1], (%[sq1]) \n\t"
+ "lw %[q2], (%[sq2]) \n\t"
+ "lw %[q3], (%[sq3]) \n\t"
+ "lw %[q4], (%[sq4]) \n\t"
+ "lw %[q5], (%[sq5]) \n\t"
+ "lw %[q6], (%[sq6]) \n\t"
+ "lw %[q7], (%[sq7]) \n\t"
+
+ : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1), [q0] "=&r" (q0),
+ [q7] "=&r" (q7), [q6] "=&r" (q6), [q5] "=&r" (q5), [q4] "=&r" (q4)
+ : [sq3] "r" (sq3), [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0),
+ [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6), [sq7] "r" (sq7)
+ );
+
+ vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+ p1, p0, p3, p2, q0, q1, q2, q3,
+ &hev, &mask, &flat);
+
+ vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
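+    /* Pick the filter per 4-pixel group: f0 is the narrow 4-tap filter,
+     * f1 the flat (p3..q3) filter and f2 the wide (p7..q7) filter used
+     * when flat2 is also set. */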
+ /* f0 */
+ if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+ ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ __asm__ __volatile__ (
+ "sw %[p1_f0], (%[sp1]) \n\t"
+ "sw %[p0_f0], (%[sp0]) \n\t"
+ "sw %[q0_f0], (%[sq0]) \n\t"
+ "sw %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+ (mask == 0xFFFFFFFF)) {
+ /* f2 */
+ PACK_LEFT_0TO3()
+ PACK_LEFT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+ &p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l,
+ &q4_l, &q5_l, &q6_l, &q7_l);
+
+ PACK_RIGHT_0TO3()
+ PACK_RIGHT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+ &p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r,
+ &q4_r, &q5_r, &q6_r, &q7_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+ COMBINE_LEFT_RIGHT_3TO6()
+
+ __asm__ __volatile__ (
+ "sw %[p6], (%[sp6]) \n\t"
+ "sw %[p5], (%[sp5]) \n\t"
+ "sw %[p4], (%[sp4]) \n\t"
+ "sw %[p3], (%[sp3]) \n\t"
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+
+ :
+ : [p6] "r" (p6), [p5] "r" (p5), [p4] "r" (p4), [p3] "r" (p3),
+ [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+ [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4), [sp3] "r" (sp3),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+ );
+
+ __asm__ __volatile__ (
+ "sw %[q6], (%[sq6]) \n\t"
+ "sw %[q5], (%[sq5]) \n\t"
+ "sw %[q4], (%[sq4]) \n\t"
+ "sw %[q3], (%[sq3]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+
+ :
+ : [q6] "r" (q6), [q5] "r" (q5), [q4] "r" (q4), [q3] "r" (q3),
+ [q2] "r" (q2), [q1] "r" (q1), [q0] "r" (q0),
+ [sq6] "r" (sq6), [sq5] "r" (sq5), [sq4] "r" (sq4), [sq3] "r" (sq3),
+ [sq2] "r" (sq2), [sq1] "r" (sq1), [sq0] "r" (sq0)
+ );
+ } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+ /* f1 */
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ COMBINE_LEFT_RIGHT_0TO2()
+
+ __asm__ __volatile__ (
+ "sw %[p2], (%[sp2]) \n\t"
+ "sw %[p1], (%[sp1]) \n\t"
+ "sw %[p0], (%[sp0]) \n\t"
+ "sw %[q0], (%[sq0]) \n\t"
+ "sw %[q1], (%[sq1]) \n\t"
+ "sw %[q2], (%[sq2]) \n\t"
+
+ :
+ : [p2] "r" (p2), [p1] "r" (p1), [p0] "r" (p0),
+ [q0] "r" (q0), [q1] "r" (q1), [q2] "r" (q2),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+ /* f0+f1 */
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+ [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+ [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+ } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+ /* f0 + f1 + f2 */
+ /* f0 function */
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* f1 function */
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+ q0_l, q1_l, q2_l, q3_l,
+ &p2_l_f1, &p1_l_f1, &p0_l_f1,
+ &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+ q0_r, q1_r, q2_r, q3_r,
+ &p2_r_f1, &p1_r_f1, &p0_r_f1,
+ &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+ /* f2 function */
+ PACK_LEFT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+ &p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l,
+ &q4_l, &q5_l, &q6_l, &q7_l);
+
+ PACK_RIGHT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+ &p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r,
+ &q4_r, &q5_r, &q6_r, &q7_r);
+
+ if (mask & flat & flat2 & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p6_r], (%[sp6]) \n\t"
+ "sb %[p5_r], (%[sp5]) \n\t"
+ "sb %[p4_r], (%[sp4]) \n\t"
+ "sb %[p3_r], (%[sp3]) \n\t"
+ "sb %[p2_r], (%[sp2]) \n\t"
+ "sb %[p1_r], (%[sp1]) \n\t"
+ "sb %[p0_r], (%[sp0]) \n\t"
+
+ :
+ : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+ [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+ [sp6] "r" (sp6), [sp5] "r" (sp5), [sp4] "r" (sp4),
+ [sp3] "r" (sp3), [sp2] "r" (sp2), [sp1] "r" (sp1),
+ [p0_r] "r" (p0_r), [sp0] "r" (sp0)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_r], (%[sq0]) \n\t"
+ "sb %[q1_r], (%[sq1]) \n\t"
+ "sb %[q2_r], (%[sq2]) \n\t"
+ "sb %[q3_r], (%[sq3]) \n\t"
+ "sb %[q4_r], (%[sq4]) \n\t"
+ "sb %[q5_r], (%[sq5]) \n\t"
+ "sb %[q6_r], (%[sq6]) \n\t"
+
+ :
+ : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+ [q6_r] "r" (q6_r),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
+ [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
+ [sq6] "r" (sq6)
+ );
+ } else if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r_f1], (%[sp2]) \n\t"
+ "sb %[p1_r_f1], (%[sp1]) \n\t"
+ "sb %[p0_r_f1], (%[sp0]) \n\t"
+ "sb %[q0_r_f1], (%[sq0]) \n\t"
+ "sb %[q1_r_f1], (%[sq1]) \n\t"
+ "sb %[q2_r_f1], (%[sq2]) \n\t"
+
+ :
+ : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+ [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+ [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], (%[sp1]) \n\t"
+ "sb %[p0_f0], (%[sp0]) \n\t"
+ "sb %[q0_f0], (%[sq0]) \n\t"
+ "sb %[q1_f0], (%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+ [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p6_r], %[p6_r], 16 \n\t"
+ "srl %[p5_r], %[p5_r], 16 \n\t"
+ "srl %[p4_r], %[p4_r], 16 \n\t"
+ "srl %[p3_r], %[p3_r], 16 \n\t"
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[q3_r], %[q3_r], 16 \n\t"
+ "srl %[q4_r], %[q4_r], 16 \n\t"
+ "srl %[q5_r], %[q5_r], 16 \n\t"
+ "srl %[q6_r], %[q6_r], 16 \n\t"
+
+ : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+ [q3_r] "+r" (q3_r), [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
+ [p6_r] "+r" (p6_r), [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
+ [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r),
+ [q6_r] "+r" (q6_r), [p0_r] "+r" (p0_r)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
+ "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
+ "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
+ "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
+ "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
+ "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
+ [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
+ [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p6_r], +1(%[sp6]) \n\t"
+ "sb %[p5_r], +1(%[sp5]) \n\t"
+ "sb %[p4_r], +1(%[sp4]) \n\t"
+ "sb %[p3_r], +1(%[sp3]) \n\t"
+ "sb %[p2_r], +1(%[sp2]) \n\t"
+ "sb %[p1_r], +1(%[sp1]) \n\t"
+ "sb %[p0_r], +1(%[sp0]) \n\t"
+
+ :
+ : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+ [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+ [p0_r] "r" (p0_r), [sp6] "r" (sp6), [sp5] "r" (sp5),
+ [sp4] "r" (sp4), [sp3] "r" (sp3),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_r], +1(%[sq0]) \n\t"
+ "sb %[q1_r], +1(%[sq1]) \n\t"
+ "sb %[q2_r], +1(%[sq2]) \n\t"
+ "sb %[q3_r], +1(%[sq3]) \n\t"
+ "sb %[q4_r], +1(%[sq4]) \n\t"
+ "sb %[q5_r], +1(%[sq5]) \n\t"
+ "sb %[q6_r], +1(%[sq6]) \n\t"
+
+ :
+ : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [q3_r] "r" (q3_r), [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+ [q6_r] "r" (q6_r), [sq0] "r" (sq0), [sq1] "r" (sq1),
+ [sq2] "r" (sq2), [sq3] "r" (sq3),
+ [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
+ );
+ } else if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r_f1], +1(%[sp2]) \n\t"
+ "sb %[p1_r_f1], +1(%[sp1]) \n\t"
+ "sb %[p0_r_f1], +1(%[sp0]) \n\t"
+ "sb %[q0_r_f1], +1(%[sq0]) \n\t"
+ "sb %[q1_r_f1], +1(%[sq1]) \n\t"
+ "sb %[q2_r_f1], +1(%[sq2]) \n\t"
+
+ :
+ : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+ [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+ [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +1(%[sp1]) \n\t"
+ "sb %[p0_f0], +1(%[sp0]) \n\t"
+ "sb %[q0_f0], +1(%[sq0]) \n\t"
+ "sb %[q1_f0], +1(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+ [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p6_l], +2(%[sp6]) \n\t"
+ "sb %[p5_l], +2(%[sp5]) \n\t"
+ "sb %[p4_l], +2(%[sp4]) \n\t"
+ "sb %[p3_l], +2(%[sp3]) \n\t"
+ "sb %[p2_l], +2(%[sp2]) \n\t"
+ "sb %[p1_l], +2(%[sp1]) \n\t"
+ "sb %[p0_l], +2(%[sp0]) \n\t"
+
+ :
+ : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+ [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+ [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
+ [sp4] "r" (sp4), [sp3] "r" (sp3),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_l], +2(%[sq0]) \n\t"
+ "sb %[q1_l], +2(%[sq1]) \n\t"
+ "sb %[q2_l], +2(%[sq2]) \n\t"
+ "sb %[q3_l], +2(%[sq3]) \n\t"
+ "sb %[q4_l], +2(%[sq4]) \n\t"
+ "sb %[q5_l], +2(%[sq5]) \n\t"
+ "sb %[q6_l], +2(%[sq6]) \n\t"
+
+ :
+ : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+ [q6_l] "r" (q6_l), [sq0] "r" (sq0), [sq1] "r" (sq1),
+ [sq2] "r" (sq2), [sq3] "r" (sq3),
+ [sq4] "r" (sq4), [sq5] "r" (sq5), [sq6] "r" (sq6)
+ );
+ } else if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l_f1], +2(%[sp2]) \n\t"
+ "sb %[p1_l_f1], +2(%[sp1]) \n\t"
+ "sb %[p0_l_f1], +2(%[sp0]) \n\t"
+ "sb %[q0_l_f1], +2(%[sq0]) \n\t"
+ "sb %[q1_l_f1], +2(%[sq1]) \n\t"
+ "sb %[q2_l_f1], +2(%[sq2]) \n\t"
+
+ :
+ : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+ [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+ [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +2(%[sp1]) \n\t"
+ "sb %[p0_f0], +2(%[sp0]) \n\t"
+ "sb %[q0_f0], +2(%[sq0]) \n\t"
+ "sb %[q1_f0], +2(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0), [q0_f0] "r" (q0_f0),
+ [q1_f0] "r" (q1_f0), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p6_l], %[p6_l], 16 \n\t"
+ "srl %[p5_l], %[p5_l], 16 \n\t"
+ "srl %[p4_l], %[p4_l], 16 \n\t"
+ "srl %[p3_l], %[p3_l], 16 \n\t"
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[q3_l], %[q3_l], 16 \n\t"
+ "srl %[q4_l], %[q4_l], 16 \n\t"
+ "srl %[q5_l], %[q5_l], 16 \n\t"
+ "srl %[q6_l], %[q6_l], 16 \n\t"
+
+ : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
+ [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
+ [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
+ [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
+ "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
+ "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
+ "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
+ "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
+ "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
+ [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
+ [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p6_l], +3(%[sp6]) \n\t"
+ "sb %[p5_l], +3(%[sp5]) \n\t"
+ "sb %[p4_l], +3(%[sp4]) \n\t"
+ "sb %[p3_l], +3(%[sp3]) \n\t"
+ "sb %[p2_l], +3(%[sp2]) \n\t"
+ "sb %[p1_l], +3(%[sp1]) \n\t"
+ "sb %[p0_l], +3(%[sp0]) \n\t"
+
+ :
+ : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+ [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+ [p0_l] "r" (p0_l), [sp6] "r" (sp6), [sp5] "r" (sp5),
+ [sp4] "r" (sp4), [sp3] "r" (sp3), [sp2] "r" (sp2),
+ [sp1] "r" (sp1), [sp0] "r" (sp0)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_l], +3(%[sq0]) \n\t"
+ "sb %[q1_l], +3(%[sq1]) \n\t"
+ "sb %[q2_l], +3(%[sq2]) \n\t"
+ "sb %[q3_l], +3(%[sq3]) \n\t"
+ "sb %[q4_l], +3(%[sq4]) \n\t"
+ "sb %[q5_l], +3(%[sq5]) \n\t"
+ "sb %[q6_l], +3(%[sq6]) \n\t"
+
+ :
+ : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l),
+ [q2_l] "r" (q2_l), [q3_l] "r" (q3_l),
+ [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2),
+ [sq3] "r" (sq3), [sq4] "r" (sq4), [sq5] "r" (sq5),
+ [q6_l] "r" (q6_l), [sq6] "r" (sq6)
+ );
+ } else if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l_f1], +3(%[sp2]) \n\t"
+ "sb %[p1_l_f1], +3(%[sp1]) \n\t"
+ "sb %[p0_l_f1], +3(%[sp0]) \n\t"
+ "sb %[q0_l_f1], +3(%[sq0]) \n\t"
+ "sb %[q1_l_f1], +3(%[sq1]) \n\t"
+ "sb %[q2_l_f1], +3(%[sq2]) \n\t"
+
+ :
+ : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+ [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+ [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+ [sp2] "r" (sp2), [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1), [sq2] "r" (sq2)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], +3(%[sp1]) \n\t"
+ "sb %[p0_f0], +3(%[sp0]) \n\t"
+ "sb %[q0_f0], +3(%[sq0]) \n\t"
+ "sb %[q1_f0], +3(%[sq1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [sp1] "r" (sp1), [sp0] "r" (sp0),
+ [sq0] "r" (sq0), [sq1] "r" (sq1)
+ );
+ }
+ }
+
+ s = s + 4;
+ }
+}
+#endif // #if HAVE_DSPR2
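
The store sequences above pick one of three filter outputs for every byte lane of the packed 32-bit registers: the wide (f2) output when the corresponding bits of mask, flat and flat2 are all set, the flat (f1) output when only mask and flat are set, and the basic (f0) output when only mask is set; the srl blocks then shift the packed values so the next lane sits in the low byte. The following scalar sketch only models that per-lane selection in plain C; the function and argument names are placeholders and do not exist in libvpx.

/* Illustrative sketch: a scalar model of the per-lane store logic above.
 * The DSPr2 code keeps four lanes packed in 32-bit registers and peels one
 * byte per store; this model operates on plain values instead. */
#include <stdint.h>

static void store_lane(uint8_t *dst, int lane,
                       uint32_t mask, uint32_t flat, uint32_t flat2,
                       uint8_t wide, uint8_t flat_out, uint8_t basic) {
  const uint32_t lane_bits = 0xFFu << (8 * lane);  /* 0x000000FF, 0x0000FF00, ... */
  if (mask & flat & flat2 & lane_bits)
    dst[lane] = wide;        /* f2: wide mb filter output */
  else if (mask & flat & lane_bits)
    dst[lane] = flat_out;    /* f1: flat filter output */
  else if (mask & lane_bits)
    dst[lane] = basic;       /* f0: basic 4-tap filter output */
  /* otherwise the pixel is left untouched */
}
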
diff --git a/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c b/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
new file mode 100644
index 0000000..9e9171c
--- /dev/null
+++ b/source/libvpx/vp9/common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
@@ -0,0 +1,840 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_macros_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_masks_dspr2.h"
+#include "vp9/common/mips/dspr2/vp9_loopfilter_filters_dspr2.h"
+
+#if HAVE_DSPR2
+void vp9_mb_lpf_vertical_edge_w_dspr2(uint8_t *s,
+ int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ uint8_t i;
+ uint32_t mask, hev, flat, flat2;
+ uint8_t *s1, *s2, *s3, *s4;
+ uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
+ uint32_t thresh_vec, flimit_vec, limit_vec;
+ uint32_t uflimit, ulimit, uthresh;
+ uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
+ uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
+ uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
+ uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
+ uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
+ uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
+ uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
+
+ uflimit = *blimit;
+ ulimit = *limit;
+ uthresh = *thresh;
+
+ /* create quad-byte */
+ __asm__ __volatile__ (
+ "replv.qb %[thresh_vec], %[uthresh] \n\t"
+ "replv.qb %[flimit_vec], %[uflimit] \n\t"
+ "replv.qb %[limit_vec], %[ulimit] \n\t"
+
+ : [thresh_vec] "=&r" (thresh_vec), [flimit_vec] "=&r" (flimit_vec),
+ [limit_vec] "=r" (limit_vec)
+ : [uthresh] "r" (uthresh), [uflimit] "r" (uflimit), [ulimit] "r" (ulimit)
+ );
+
+ vp9_prefetch_store(s + pitch);
+
+ for (i = 0; i < 2; i++) {
+ s1 = s;
+ s2 = s + pitch;
+ s3 = s2 + pitch;
+ s4 = s3 + pitch;
+ s = s4 + pitch;
+
+ __asm__ __volatile__ (
+ "lw %[p0], -4(%[s1]) \n\t"
+ "lw %[p1], -4(%[s2]) \n\t"
+ "lw %[p2], -4(%[s3]) \n\t"
+ "lw %[p3], -4(%[s4]) \n\t"
+ "lw %[p4], -8(%[s1]) \n\t"
+ "lw %[p5], -8(%[s2]) \n\t"
+ "lw %[p6], -8(%[s3]) \n\t"
+ "lw %[p7], -8(%[s4]) \n\t"
+
+ : [p3] "=&r" (p3), [p2] "=&r" (p2), [p1] "=&r" (p1),
+ [p0] "=&r" (p0), [p7] "=&r" (p7), [p6] "=&r" (p6),
+ [p5] "=&r" (p5), [p4] "=&r" (p4)
+ : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+ );
+
+ __asm__ __volatile__ (
+ "lw %[q3], (%[s1]) \n\t"
+ "lw %[q2], (%[s2]) \n\t"
+ "lw %[q1], (%[s3]) \n\t"
+ "lw %[q0], (%[s4]) \n\t"
+ "lw %[q7], +4(%[s1]) \n\t"
+ "lw %[q6], +4(%[s2]) \n\t"
+ "lw %[q5], +4(%[s3]) \n\t"
+ "lw %[q4], +4(%[s4]) \n\t"
+
+ : [q3] "=&r" (q3), [q2] "=&r" (q2), [q1] "=&r" (q1),
+ [q0] "=&r" (q0), [q7] "=&r" (q7), [q6] "=&r" (q6),
+ [q5] "=&r" (q5), [q4] "=&r" (q4)
+ : [s1] "r" (s1), [s2] "r" (s2), [s3] "r" (s3), [s4] "r" (s4)
+ );
+
+ /* transpose p3, p2, p1, p0
+ original (when loaded from memory)
+ register -4 -3 -2 -1
+ p0 p0_0 p0_1 p0_2 p0_3
+ p1 p1_0 p1_1 p1_2 p1_3
+ p2 p2_0 p2_1 p2_2 p2_3
+ p3 p3_0 p3_1 p3_2 p3_3
+
+ after transpose
+ register
+ p0 p3_3 p2_3 p1_3 p0_3
+ p1 p3_2 p2_2 p1_2 p0_2
+ p2 p3_1 p2_1 p1_1 p0_1
+ p3 p3_0 p2_0 p1_0 p0_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
+ "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
+ "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
+ "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
+
+ "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
+ "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
+ "append %[p1], %[sec3], 16 \n\t"
+ "append %[p3], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p0] "+r" (p0), [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose q0, q1, q2, q3
+ original (when loaded from memory)
+ register +1 +2 +3 +4
+ q3 q3_0 q3_1 q3_2 q3_3
+ q2 q2_0 q2_1 q2_2 q2_3
+ q1 q1_0 q1_1 q1_2 q1_3
+ q0 q0_0 q0_1 q0_2 q0_3
+
+ after transpose
+ register
+ q3 q0_3 q1_3 q2_3 q3_3
+ q2 q0_2 q1_2 q2_2 q3_2
+ q1 q0_1 q1_1 q2_1 q3_1
+ q0 q0_0 q1_0 q2_0 q3_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
+ "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
+ "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
+ "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
+
+ "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
+ "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
+ "append %[q2], %[sec3], 16 \n\t"
+ "append %[q0], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [q3] "+r" (q3), [q2] "+r" (q2), [q1] "+r" (q1), [q0] "+r" (q0),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose p7, p6, p5, p4
+ original (when loaded from memory)
+ register -8 -7 -6 -5
+ p4 p4_0 p4_1 p4_2 p4_3
+ p5 p5_0 p5_1 p5_2 p5_3
+ p6 p6_0 p6_1 p6_2 p6_3
+ p7 p7_0 p7_1 p7_2 p7_3
+
+ after transpose
+ register
+ p4 p7_3 p6_3 p5_3 p4_3
+ p5 p7_2 p6_2 p5_2 p4_2
+ p6 p7_1 p6_1 p5_1 p4_1
+ p7 p7_0 p6_0 p5_0 p4_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t"
+ "precr.qb.ph %[prim2], %[p4], %[p5] \n\t"
+ "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t"
+ "precr.qb.ph %[prim4], %[p6], %[p7] \n\t"
+
+ "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[p4], %[p5], %[sec3] \n\t"
+ "precrq.ph.w %[p6], %[p7], %[sec4] \n\t"
+ "append %[p5], %[sec3], 16 \n\t"
+ "append %[p7], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [p4] "+r" (p4), [p5] "+r" (p5), [p6] "+r" (p6), [p7] "+r" (p7),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ /* transpose q4, q5, q6, q7
+ original (when loaded from memory)
+ register +5 +6 +7 +8
+ q7 q7_0 q7_1 q7_2 q7_3
+ q6 q6_0 q6_1 q6_2 q6_3
+ q5 q5_0 q5_1 q5_2 q5_3
+ q4 q4_0 q4_1 q4_2 q4_3
+
+ after transpose
+ register
+       q7         q4_3  q5_3  q6_3  q7_3
+       q6         q4_2  q5_2  q6_2  q7_2
+       q5         q4_1  q5_1  q6_1  q7_1
+       q4         q4_0  q5_0  q6_0  q7_0
+ */
+ __asm__ __volatile__ (
+ "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t"
+ "precr.qb.ph %[prim2], %[q7], %[q6] \n\t"
+ "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t"
+ "precr.qb.ph %[prim4], %[q5], %[q4] \n\t"
+
+ "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t"
+ "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t"
+ "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
+ "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
+
+ "precrq.ph.w %[q7], %[q6], %[sec3] \n\t"
+ "precrq.ph.w %[q5], %[q4], %[sec4] \n\t"
+ "append %[q6], %[sec3], 16 \n\t"
+ "append %[q4], %[sec4], 16 \n\t"
+
+ : [prim1] "=&r" (prim1), [prim2] "=&r" (prim2),
+ [prim3] "=&r" (prim3), [prim4] "=&r" (prim4),
+ [q7] "+r" (q7), [q6] "+r" (q6), [q5] "+r" (q5), [q4] "+r" (q4),
+ [sec3] "=&r" (sec3), [sec4] "=&r" (sec4)
+ :
+ );
+
+ vp9_filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec,
+ p1, p0, p3, p2, q0, q1, q2, q3,
+ &hev, &mask, &flat);
+
+ vp9_flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
+
+ /* f0 */
+ if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
+ ((flat2 != 0) && (flat == 0) && (mask != 0))) {
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+ STORE_F0()
+    } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
+ (mask == 0xFFFFFFFF)) {
+ /* f2 */
+ PACK_LEFT_0TO3()
+ PACK_LEFT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+ &p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l,
+ &q4_l, &q5_l, &q6_l, &q7_l);
+
+ PACK_RIGHT_0TO3()
+ PACK_RIGHT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+ &p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r,
+ &q4_r, &q5_r, &q6_r, &q7_r);
+
+ STORE_F2()
+ } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
+ /* f1 */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ STORE_F1()
+ } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
+ /* f0 + f1 */
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ /* left 2 element operation */
+ PACK_LEFT_0TO3()
+ vp9_mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l);
+
+ /* right 2 element operation */
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r);
+
+ if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [s4] "r" (s4)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s4] "r" (s4)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r] "+r" (p2_r), [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r),
+ [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r), [q2_r] "+r" (q2_r),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r] "r" (p2_r), [p1_r] "r" (p1_r), [p0_r] "r" (p0_r),
+ [q0_r] "r" (q0_r), [q1_r] "r" (q1_r), [q2_r] "r" (q2_r),
+ [s3] "r" (s3)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s3] "r" (s3)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [s2] "r" (s2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s2] "r" (s2)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l] "+r" (p2_l), [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l),
+ [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], +1(%[s1]) \n\t"
+ "sb %[q2_l], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l] "r" (p2_l), [p1_l] "r" (p1_l), [p0_l] "r" (p0_l),
+ [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [s1] "r" (s1)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s1] "r" (s1)
+ );
+ }
+ } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
+ /* f0+f1+f2 */
+ vp9_filter1_dspr2(mask, hev, p1, p0, q0, q1,
+ &p1_f0, &p0_f0, &q0_f0, &q1_f0);
+
+ PACK_LEFT_0TO3()
+ vp9_mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l,
+ q0_l, q1_l, q2_l, q3_l,
+ &p2_l_f1, &p1_l_f1, &p0_l_f1,
+ &q0_l_f1, &q1_l_f1, &q2_l_f1);
+
+ PACK_RIGHT_0TO3()
+ vp9_mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r,
+ q0_r, q1_r, q2_r, q3_r,
+ &p2_r_f1, &p1_r_f1, &p0_r_f1,
+ &q0_r_f1, &q1_r_f1, &q2_r_f1);
+
+ PACK_LEFT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l,
+ &p3_l, &p2_l, &p1_l, &p0_l,
+ &q0_l, &q1_l, &q2_l, &q3_l,
+ &q4_l, &q5_l, &q6_l, &q7_l);
+
+ PACK_RIGHT_4TO7()
+ vp9_wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r,
+ &p3_r, &p2_r, &p1_r, &p0_r,
+ &q0_r, &q1_r, &q2_r, &q3_r,
+ &q4_r, &q5_r, &q6_r, &q7_r);
+
+ if (mask & flat & flat2 & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p6_r], -7(%[s4]) \n\t"
+ "sb %[p5_r], -6(%[s4]) \n\t"
+ "sb %[p4_r], -5(%[s4]) \n\t"
+ "sb %[p3_r], -4(%[s4]) \n\t"
+ "sb %[p2_r], -3(%[s4]) \n\t"
+ "sb %[p1_r], -2(%[s4]) \n\t"
+ "sb %[p0_r], -1(%[s4]) \n\t"
+
+ :
+ : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r),
+ [p4_r] "r" (p4_r), [p3_r] "r" (p3_r),
+ [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+ [p0_r] "r" (p0_r), [s4] "r" (s4)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_r], (%[s4]) \n\t"
+ "sb %[q1_r], +1(%[s4]) \n\t"
+ "sb %[q2_r], +2(%[s4]) \n\t"
+ "sb %[q3_r], +3(%[s4]) \n\t"
+ "sb %[q4_r], +4(%[s4]) \n\t"
+ "sb %[q5_r], +5(%[s4]) \n\t"
+ "sb %[q6_r], +6(%[s4]) \n\t"
+
+ :
+ : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
+ [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
+ [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+ [q6_r] "r" (q6_r), [s4] "r" (s4)
+ );
+ } else if (mask & flat & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p2_r_f1], -3(%[s4]) \n\t"
+ "sb %[p1_r_f1], -2(%[s4]) \n\t"
+ "sb %[p0_r_f1], -1(%[s4]) \n\t"
+ "sb %[q0_r_f1], (%[s4]) \n\t"
+ "sb %[q1_r_f1], +1(%[s4]) \n\t"
+ "sb %[q2_r_f1], +2(%[s4]) \n\t"
+
+ :
+ : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+ [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+ [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+ [s4] "r" (s4)
+ );
+ } else if (mask & 0x000000FF) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s4]) \n\t"
+ "sb %[p0_f0], -1(%[s4]) \n\t"
+ "sb %[q0_f0], (%[s4]) \n\t"
+ "sb %[q1_f0], +1(%[s4]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s4] "r" (s4)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p6_r], %[p6_r], 16 \n\t"
+ "srl %[p5_r], %[p5_r], 16 \n\t"
+ "srl %[p4_r], %[p4_r], 16 \n\t"
+ "srl %[p3_r], %[p3_r], 16 \n\t"
+ "srl %[p2_r], %[p2_r], 16 \n\t"
+ "srl %[p1_r], %[p1_r], 16 \n\t"
+ "srl %[p0_r], %[p0_r], 16 \n\t"
+ "srl %[q0_r], %[q0_r], 16 \n\t"
+ "srl %[q1_r], %[q1_r], 16 \n\t"
+ "srl %[q2_r], %[q2_r], 16 \n\t"
+ "srl %[q3_r], %[q3_r], 16 \n\t"
+ "srl %[q4_r], %[q4_r], 16 \n\t"
+ "srl %[q5_r], %[q5_r], 16 \n\t"
+ "srl %[q6_r], %[q6_r], 16 \n\t"
+
+ : [q0_r] "+r" (q0_r), [q1_r] "+r" (q1_r),
+ [q2_r] "+r" (q2_r), [q3_r] "+r" (q3_r),
+ [q4_r] "+r" (q4_r), [q5_r] "+r" (q5_r),
+ [q6_r] "+r" (q6_r), [p6_r] "+r" (p6_r),
+ [p5_r] "+r" (p5_r), [p4_r] "+r" (p4_r),
+ [p3_r] "+r" (p3_r), [p2_r] "+r" (p2_r),
+ [p1_r] "+r" (p1_r), [p0_r] "+r" (p0_r)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
+ "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
+ "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
+ "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
+ "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
+ "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_r_f1] "+r" (p2_r_f1), [p1_r_f1] "+r" (p1_r_f1),
+ [p0_r_f1] "+r" (p0_r_f1), [q0_r_f1] "+r" (q0_r_f1),
+ [q1_r_f1] "+r" (q1_r_f1), [q2_r_f1] "+r" (q2_r_f1),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p6_r], -7(%[s3]) \n\t"
+ "sb %[p5_r], -6(%[s3]) \n\t"
+ "sb %[p4_r], -5(%[s3]) \n\t"
+ "sb %[p3_r], -4(%[s3]) \n\t"
+ "sb %[p2_r], -3(%[s3]) \n\t"
+ "sb %[p1_r], -2(%[s3]) \n\t"
+ "sb %[p0_r], -1(%[s3]) \n\t"
+
+ :
+ : [p6_r] "r" (p6_r), [p5_r] "r" (p5_r), [p4_r] "r" (p4_r),
+ [p3_r] "r" (p3_r), [p2_r] "r" (p2_r), [p1_r] "r" (p1_r),
+ [p0_r] "r" (p0_r), [s3] "r" (s3)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_r], (%[s3]) \n\t"
+ "sb %[q1_r], +1(%[s3]) \n\t"
+ "sb %[q2_r], +2(%[s3]) \n\t"
+ "sb %[q3_r], +3(%[s3]) \n\t"
+ "sb %[q4_r], +4(%[s3]) \n\t"
+ "sb %[q5_r], +5(%[s3]) \n\t"
+ "sb %[q6_r], +6(%[s3]) \n\t"
+
+ :
+ : [q0_r] "r" (q0_r), [q1_r] "r" (q1_r),
+ [q2_r] "r" (q2_r), [q3_r] "r" (q3_r),
+ [q4_r] "r" (q4_r), [q5_r] "r" (q5_r),
+ [q6_r] "r" (q6_r), [s3] "r" (s3)
+ );
+ } else if (mask & flat & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p2_r_f1], -3(%[s3]) \n\t"
+ "sb %[p1_r_f1], -2(%[s3]) \n\t"
+ "sb %[p0_r_f1], -1(%[s3]) \n\t"
+ "sb %[q0_r_f1], (%[s3]) \n\t"
+ "sb %[q1_r_f1], +1(%[s3]) \n\t"
+ "sb %[q2_r_f1], +2(%[s3]) \n\t"
+
+ :
+ : [p2_r_f1] "r" (p2_r_f1), [p1_r_f1] "r" (p1_r_f1),
+ [p0_r_f1] "r" (p0_r_f1), [q0_r_f1] "r" (q0_r_f1),
+ [q1_r_f1] "r" (q1_r_f1), [q2_r_f1] "r" (q2_r_f1),
+ [s3] "r" (s3)
+ );
+ } else if (mask & 0x0000FF00) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s3]) \n\t"
+ "sb %[p0_f0], -1(%[s3]) \n\t"
+ "sb %[q0_f0], (%[s3]) \n\t"
+ "sb %[q1_f0], +1(%[s3]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s3] "r" (s3)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p6_l], -7(%[s2]) \n\t"
+ "sb %[p5_l], -6(%[s2]) \n\t"
+ "sb %[p4_l], -5(%[s2]) \n\t"
+ "sb %[p3_l], -4(%[s2]) \n\t"
+ "sb %[p2_l], -3(%[s2]) \n\t"
+ "sb %[p1_l], -2(%[s2]) \n\t"
+ "sb %[p0_l], -1(%[s2]) \n\t"
+
+ :
+ : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+ [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+ [p0_l] "r" (p0_l), [s2] "r" (s2)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_l], (%[s2]) \n\t"
+ "sb %[q1_l], +1(%[s2]) \n\t"
+ "sb %[q2_l], +2(%[s2]) \n\t"
+ "sb %[q3_l], +3(%[s2]) \n\t"
+ "sb %[q4_l], +4(%[s2]) \n\t"
+ "sb %[q5_l], +5(%[s2]) \n\t"
+ "sb %[q6_l], +6(%[s2]) \n\t"
+
+ :
+ : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+ [q6_l] "r" (q6_l), [s2] "r" (s2)
+ );
+ } else if (mask & flat & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l_f1], -3(%[s2]) \n\t"
+ "sb %[p1_l_f1], -2(%[s2]) \n\t"
+ "sb %[p0_l_f1], -1(%[s2]) \n\t"
+ "sb %[q0_l_f1], (%[s2]) \n\t"
+ "sb %[q1_l_f1], +1(%[s2]) \n\t"
+ "sb %[q2_l_f1], +2(%[s2]) \n\t"
+
+ :
+ : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+ [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+ [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+ [s2] "r" (s2)
+ );
+ } else if (mask & 0x00FF0000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s2]) \n\t"
+ "sb %[p0_f0], -1(%[s2]) \n\t"
+ "sb %[q0_f0], (%[s2]) \n\t"
+ "sb %[q1_f0], +1(%[s2]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s2] "r" (s2)
+ );
+ }
+
+ __asm__ __volatile__ (
+ "srl %[p6_l], %[p6_l], 16 \n\t"
+ "srl %[p5_l], %[p5_l], 16 \n\t"
+ "srl %[p4_l], %[p4_l], 16 \n\t"
+ "srl %[p3_l], %[p3_l], 16 \n\t"
+ "srl %[p2_l], %[p2_l], 16 \n\t"
+ "srl %[p1_l], %[p1_l], 16 \n\t"
+ "srl %[p0_l], %[p0_l], 16 \n\t"
+ "srl %[q0_l], %[q0_l], 16 \n\t"
+ "srl %[q1_l], %[q1_l], 16 \n\t"
+ "srl %[q2_l], %[q2_l], 16 \n\t"
+ "srl %[q3_l], %[q3_l], 16 \n\t"
+ "srl %[q4_l], %[q4_l], 16 \n\t"
+ "srl %[q5_l], %[q5_l], 16 \n\t"
+ "srl %[q6_l], %[q6_l], 16 \n\t"
+
+ : [q0_l] "+r" (q0_l), [q1_l] "+r" (q1_l), [q2_l] "+r" (q2_l),
+ [q3_l] "+r" (q3_l), [q4_l] "+r" (q4_l), [q5_l] "+r" (q5_l),
+ [q6_l] "+r" (q6_l), [p6_l] "+r" (p6_l), [p5_l] "+r" (p5_l),
+ [p4_l] "+r" (p4_l), [p3_l] "+r" (p3_l), [p2_l] "+r" (p2_l),
+ [p1_l] "+r" (p1_l), [p0_l] "+r" (p0_l)
+ :
+ );
+
+ __asm__ __volatile__ (
+ "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
+ "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
+ "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
+ "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
+ "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
+ "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
+ "srl %[p1_f0], %[p1_f0], 8 \n\t"
+ "srl %[p0_f0], %[p0_f0], 8 \n\t"
+ "srl %[q0_f0], %[q0_f0], 8 \n\t"
+ "srl %[q1_f0], %[q1_f0], 8 \n\t"
+
+ : [p2_l_f1] "+r" (p2_l_f1), [p1_l_f1] "+r" (p1_l_f1),
+ [p0_l_f1] "+r" (p0_l_f1), [q0_l_f1] "+r" (q0_l_f1),
+ [q1_l_f1] "+r" (q1_l_f1), [q2_l_f1] "+r" (q2_l_f1),
+ [p1_f0] "+r" (p1_f0), [p0_f0] "+r" (p0_f0),
+ [q0_f0] "+r" (q0_f0), [q1_f0] "+r" (q1_f0)
+ :
+ );
+
+ if (mask & flat & flat2 & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p6_l], -7(%[s1]) \n\t"
+ "sb %[p5_l], -6(%[s1]) \n\t"
+ "sb %[p4_l], -5(%[s1]) \n\t"
+ "sb %[p3_l], -4(%[s1]) \n\t"
+ "sb %[p2_l], -3(%[s1]) \n\t"
+ "sb %[p1_l], -2(%[s1]) \n\t"
+ "sb %[p0_l], -1(%[s1]) \n\t"
+
+ :
+ : [p6_l] "r" (p6_l), [p5_l] "r" (p5_l), [p4_l] "r" (p4_l),
+ [p3_l] "r" (p3_l), [p2_l] "r" (p2_l), [p1_l] "r" (p1_l),
+ [p0_l] "r" (p0_l),
+ [s1] "r" (s1)
+ );
+
+ __asm__ __volatile__ (
+ "sb %[q0_l], (%[s1]) \n\t"
+ "sb %[q1_l], 1(%[s1]) \n\t"
+ "sb %[q2_l], 2(%[s1]) \n\t"
+ "sb %[q3_l], 3(%[s1]) \n\t"
+ "sb %[q4_l], 4(%[s1]) \n\t"
+ "sb %[q5_l], 5(%[s1]) \n\t"
+ "sb %[q6_l], 6(%[s1]) \n\t"
+
+ :
+ : [q0_l] "r" (q0_l), [q1_l] "r" (q1_l), [q2_l] "r" (q2_l),
+ [q3_l] "r" (q3_l), [q4_l] "r" (q4_l), [q5_l] "r" (q5_l),
+ [q6_l] "r" (q6_l),
+ [s1] "r" (s1)
+ );
+ } else if (mask & flat & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p2_l_f1], -3(%[s1]) \n\t"
+ "sb %[p1_l_f1], -2(%[s1]) \n\t"
+ "sb %[p0_l_f1], -1(%[s1]) \n\t"
+ "sb %[q0_l_f1], (%[s1]) \n\t"
+ "sb %[q1_l_f1], +1(%[s1]) \n\t"
+ "sb %[q2_l_f1], +2(%[s1]) \n\t"
+
+ :
+ : [p2_l_f1] "r" (p2_l_f1), [p1_l_f1] "r" (p1_l_f1),
+ [p0_l_f1] "r" (p0_l_f1), [q0_l_f1] "r" (q0_l_f1),
+ [q1_l_f1] "r" (q1_l_f1), [q2_l_f1] "r" (q2_l_f1),
+ [s1] "r" (s1)
+ );
+ } else if (mask & 0xFF000000) {
+ __asm__ __volatile__ (
+ "sb %[p1_f0], -2(%[s1]) \n\t"
+ "sb %[p0_f0], -1(%[s1]) \n\t"
+ "sb %[q0_f0], (%[s1]) \n\t"
+ "sb %[q1_f0], +1(%[s1]) \n\t"
+
+ :
+ : [p1_f0] "r" (p1_f0), [p0_f0] "r" (p0_f0),
+ [q0_f0] "r" (q0_f0), [q1_f0] "r" (q1_f0),
+ [s1] "r" (s1)
+ );
+ }
+ }
+ }
+}
+#endif // #if HAVE_DSPR2
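
The precrq/precr/append sequences above implement the 4x4 byte transposes described in the comments: four words that each hold one image row are rearranged so that each word holds one column, which lets the vertical edge filter reuse the same packed arithmetic as the horizontal one. A plain-C model of that byte shuffle, assuming little-endian packing (byte 0 in the low bits), might look like the sketch below; it is an illustration, not the DSPr2 implementation.

/* Sketch only: the byte-level effect of the transposes above, written with
 * plain C shifts.  Each row[] word holds four adjacent pixels from one image
 * row; after the transpose each col[] word holds the same column from four
 * rows, which is what the vertical filter needs. */
#include <stdint.h>

static void transpose_4x4_bytes(const uint32_t row[4], uint32_t col[4]) {
  int i, j;
  for (i = 0; i < 4; ++i) {
    uint32_t c = 0;
    for (j = 0; j < 4; ++j) {
      const uint32_t byte = (row[j] >> (8 * i)) & 0xFF;
      c |= byte << (8 * j);
    }
    col[i] = c;
  }
}
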
diff --git a/source/libvpx/vp9/common/vp9_alloccommon.c b/source/libvpx/vp9/common/vp9_alloccommon.c
index 0d65651..80c48d1 100644
--- a/source/libvpx/vp9/common/vp9_alloccommon.c
+++ b/source/libvpx/vp9/common/vp9_alloccommon.c
@@ -34,7 +34,7 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) {
void vp9_free_frame_buffers(VP9_COMMON *cm) {
int i;
- for (i = 0; i < NUM_YV12_BUFFERS; i++)
+ for (i = 0; i < cm->fb_count; i++)
vp9_free_frame_buffer(&cm->yv12_fb[i]);
vp9_free_frame_buffer(&cm->post_proc_buffer);
@@ -75,10 +75,60 @@ static void setup_mi(VP9_COMMON *cm) {
cm->mode_info_stride * (cm->mi_rows + 1) *
sizeof(*cm->mi_grid_base));
- vp9_update_mode_info_border(cm, cm->mip);
vp9_update_mode_info_border(cm, cm->prev_mip);
}
+int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
+ const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
+ const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+ const int ss_x = cm->subsampling_x;
+ const int ss_y = cm->subsampling_y;
+ int mi_size;
+
+ if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
+ VP9BORDERINPIXELS, NULL, NULL, NULL) < 0)
+ goto fail;
+
+ set_mb_mi(cm, aligned_width, aligned_height);
+
+ // Allocation
+ mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE);
+
+ vpx_free(cm->mip);
+ cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+ if (!cm->mip)
+ goto fail;
+
+ vpx_free(cm->prev_mip);
+ cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+ if (!cm->prev_mip)
+ goto fail;
+
+ vpx_free(cm->mi_grid_base);
+ cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
+ if (!cm->mi_grid_base)
+ goto fail;
+
+ vpx_free(cm->prev_mi_grid_base);
+ cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base));
+ if (!cm->prev_mi_grid_base)
+ goto fail;
+
+ setup_mi(cm);
+
+ // Create the segmentation map structure and set to 0.
+ vpx_free(cm->last_frame_seg_map);
+ cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
+ if (!cm->last_frame_seg_map)
+ goto fail;
+
+ return 0;
+
+ fail:
+ vp9_free_frame_buffers(cm);
+ return 1;
+}
+
int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
int i;
@@ -88,22 +138,34 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
const int ss_y = cm->subsampling_y;
int mi_size;
+ if (cm->fb_count == 0) {
+ cm->fb_count = FRAME_BUFFERS;
+ CHECK_MEM_ERROR(cm, cm->yv12_fb,
+ vpx_calloc(cm->fb_count, sizeof(*cm->yv12_fb)));
+ CHECK_MEM_ERROR(cm, cm->fb_idx_ref_cnt,
+ vpx_calloc(cm->fb_count, sizeof(*cm->fb_idx_ref_cnt)));
+ if (cm->fb_lru) {
+ CHECK_MEM_ERROR(cm, cm->fb_idx_ref_lru,
+ vpx_calloc(cm->fb_count, sizeof(*cm->fb_idx_ref_lru)));
+ }
+ }
+
vp9_free_frame_buffers(cm);
- for (i = 0; i < NUM_YV12_BUFFERS; i++) {
+ for (i = 0; i < cm->fb_count; i++) {
cm->fb_idx_ref_cnt[i] = 0;
if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y,
VP9BORDERINPIXELS) < 0)
goto fail;
}
- cm->new_fb_idx = NUM_YV12_BUFFERS - 1;
+ cm->new_fb_idx = cm->fb_count - 1;
cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1;
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++)
+ for (i = 0; i < REFS_PER_FRAME; i++)
cm->active_ref_idx[i] = i;
- for (i = 0; i < NUM_REF_FRAMES; i++) {
+ for (i = 0; i < REF_FRAMES; i++) {
cm->ref_frame_map[i] = i;
cm->fb_idx_ref_cnt[i] = 1;
}
@@ -149,20 +211,22 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
void vp9_create_common(VP9_COMMON *cm) {
vp9_machine_specific_config(cm);
-
- cm->tx_mode = ONLY_4X4;
- cm->comp_pred_mode = HYBRID_PREDICTION;
}
void vp9_remove_common(VP9_COMMON *cm) {
vp9_free_frame_buffers(cm);
+
+ vpx_free(cm->yv12_fb);
+ vpx_free(cm->fb_idx_ref_cnt);
+ vpx_free(cm->fb_idx_ref_lru);
+
+ cm->yv12_fb = NULL;
+ cm->fb_idx_ref_cnt = NULL;
+ cm->fb_idx_ref_lru = NULL;
}
void vp9_initialize_common() {
vp9_init_neighbors();
- vp9_coef_tree_initialize();
- vp9_entropy_mode_init();
- vp9_entropy_mv_init();
}
void vp9_update_frame_size(VP9_COMMON *cm) {
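
The reworked allocation path above sizes the frame-buffer pool from cm->fb_count instead of a fixed constant and introduces vp9_resize_frame_buffers(), which reallocates the mode-info and segmentation arrays and unwinds everything through vp9_free_frame_buffers() on any failure, returning 0 on success and 1 on error. The allocate-or-unwind idiom it relies on is sketched below with placeholder types; nothing here is libvpx API.

/* Sketch of the goto-fail idiom used by vp9_resize_frame_buffers() above:
 * every allocation funnels into one failure label so partially built state
 * is torn down in a single place.  Struct and field names are placeholders. */
#include <stdlib.h>

struct ctx { void *a, *b; };

static void ctx_free(struct ctx *c) {
  free(c->a); c->a = NULL;
  free(c->b); c->b = NULL;
}

static int ctx_alloc(struct ctx *c, size_t n) {
  c->a = calloc(n, 1);
  if (!c->a) goto fail;
  c->b = calloc(n, 1);
  if (!c->b) goto fail;
  return 0;            /* success, mirroring the 0 return above */
 fail:
  ctx_free(c);
  return 1;            /* failure, mirroring vp9_resize_frame_buffers() */
}

int main(void) {
  struct ctx c = {0, 0};
  const int err = ctx_alloc(&c, 64);
  if (!err) ctx_free(&c);
  return err;
}
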
diff --git a/source/libvpx/vp9/common/vp9_alloccommon.h b/source/libvpx/vp9/common/vp9_alloccommon.h
index 5d5fae9..cf8dca5 100644
--- a/source/libvpx/vp9/common/vp9_alloccommon.h
+++ b/source/libvpx/vp9/common/vp9_alloccommon.h
@@ -21,6 +21,7 @@ void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi);
void vp9_create_common(VP9_COMMON *cm);
void vp9_remove_common(VP9_COMMON *cm);
+int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height);
int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height);
void vp9_free_frame_buffers(VP9_COMMON *cm);
diff --git a/source/libvpx/vp9/common/vp9_blockd.h b/source/libvpx/vp9/common/vp9_blockd.h
index bac40c5..93f96c8 100644
--- a/source/libvpx/vp9/common/vp9_blockd.h
+++ b/source/libvpx/vp9/common/vp9_blockd.h
@@ -26,8 +26,9 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_treecoder.h"
-#define BLOCK_SIZE_GROUPS 4
+#define BLOCK_SIZE_GROUPS 4
#define MBSKIP_CONTEXTS 3
+#define INTER_MODE_CONTEXTS 7
/* Segment Feature Masks */
#define MAX_MV_REF_CANDIDATES 2
@@ -37,8 +38,9 @@
#define REF_CONTEXTS 5
typedef enum {
- PLANE_TYPE_Y_WITH_DC,
- PLANE_TYPE_UV,
+ PLANE_TYPE_Y = 0,
+ PLANE_TYPE_UV = 1,
+ PLANE_TYPES
} PLANE_TYPE;
typedef char ENTROPY_CONTEXT;
@@ -74,10 +76,6 @@ typedef enum {
MB_MODE_COUNT
} MB_PREDICTION_MODE;
-static INLINE int is_intra_mode(MB_PREDICTION_MODE mode) {
- return mode <= TM_PRED;
-}
-
static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
return mode >= NEARESTMV && mode <= NEWMV;
}
@@ -86,9 +84,8 @@ static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) {
#define INTER_MODES (1 + NEWMV - NEARESTMV)
-static INLINE int inter_mode_offset(MB_PREDICTION_MODE mode) {
- return (mode - NEARESTMV);
-}
+#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
+
/* For keyframes, intra block modes are predicted by the (already decoded)
modes for the Y blocks to the left and above us; for interframes, there
@@ -158,6 +155,34 @@ static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
return mbmi->ref_frame[1] > INTRA_FRAME;
}
+static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *left_mi, int b) {
+ if (b == 0 || b == 2) {
+ if (!left_mi || is_inter_block(&left_mi->mbmi))
+ return DC_PRED;
+
+ return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
+ : left_mi->mbmi.mode;
+ } else {
+ assert(b == 1 || b == 3);
+ return cur_mi->bmi[b - 1].as_mode;
+ }
+}
+
+static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi,
+ const MODE_INFO *above_mi, int b) {
+ if (b == 0 || b == 1) {
+ if (!above_mi || is_inter_block(&above_mi->mbmi))
+ return DC_PRED;
+
+ return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode
+ : above_mi->mbmi.mode;
+ } else {
+ assert(b == 2 || b == 3);
+ return cur_mi->bmi[b - 2].as_mode;
+ }
+}
+
enum mv_precision {
MV_PRECISION_Q3,
MV_PRECISION_Q4
@@ -175,9 +200,7 @@ struct buf_2d {
};
struct macroblockd_plane {
- DECLARE_ALIGNED(16, int16_t, qcoeff[64 * 64]);
- DECLARE_ALIGNED(16, int16_t, dqcoeff[64 * 64]);
- DECLARE_ALIGNED(16, uint16_t, eobs[256]);
+ int16_t *dqcoeff;
PLANE_TYPE plane_type;
int subsampling_x;
int subsampling_y;
@@ -212,6 +235,9 @@ typedef struct macroblockd {
int mb_to_top_edge;
int mb_to_bottom_edge;
+ /* pointers to reference frames */
+ const YV12_BUFFER_CONFIG *ref_buf[2];
+
int lossless;
/* Inverse transform function pointers. */
void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
@@ -220,13 +246,6 @@ typedef struct macroblockd {
int corrupted;
- unsigned char sb_index; // index of 32x32 block inside the 64x64 block
- unsigned char mb_index; // index of 16x16 block inside the 32x32 block
- unsigned char b_index; // index of 8x8 block inside the 16x16 block
- unsigned char ab_index; // index of 4x4 block inside the 8x8 block
-
- int q_index;
-
/* Y,U,V,(A) */
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
@@ -250,45 +269,53 @@ static INLINE TX_TYPE get_tx_type_4x4(PLANE_TYPE plane_type,
const MODE_INFO *const mi = xd->mi_8x8[0];
const MB_MODE_INFO *const mbmi = &mi->mbmi;
- if (plane_type != PLANE_TYPE_Y_WITH_DC ||
- xd->lossless ||
- is_inter_block(mbmi))
+ if (plane_type != PLANE_TYPE_Y || xd->lossless || is_inter_block(mbmi))
return DCT_DCT;
- return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ?
- mi->bmi[ib].as_mode : mbmi->mode];
+ return mode2txfm_map[mbmi->sb_type < BLOCK_8X8 ? mi->bmi[ib].as_mode
+ : mbmi->mode];
}
static INLINE TX_TYPE get_tx_type_8x8(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
- return plane_type == PLANE_TYPE_Y_WITH_DC ?
- mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT;
+ return plane_type == PLANE_TYPE_Y ? mode2txfm_map[xd->mi_8x8[0]->mbmi.mode]
+ : DCT_DCT;
}
static INLINE TX_TYPE get_tx_type_16x16(PLANE_TYPE plane_type,
const MACROBLOCKD *xd) {
- return plane_type == PLANE_TYPE_Y_WITH_DC ?
- mode2txfm_map[xd->mi_8x8[0]->mbmi.mode] : DCT_DCT;
+ return plane_type == PLANE_TYPE_Y ? mode2txfm_map[xd->mi_8x8[0]->mbmi.mode]
+ : DCT_DCT;
}
static void setup_block_dptrs(MACROBLOCKD *xd, int ss_x, int ss_y) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC;
+ xd->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y;
xd->plane[i].subsampling_x = i ? ss_x : 0;
xd->plane[i].subsampling_y = i ? ss_y : 0;
}
#if CONFIG_ALPHA
// TODO(jkoleszar): Using the Y w/h for now
+ xd->plane[3].plane_type = PLANE_TYPE_Y;
xd->plane[3].subsampling_x = 0;
xd->plane[3].subsampling_y = 0;
#endif
}
+static TX_SIZE get_uv_tx_size_impl(TX_SIZE y_tx_size, BLOCK_SIZE bsize) {
+ if (bsize < BLOCK_8X8) {
+ return TX_4X4;
+ } else {
+ // TODO(dkovalev): Assuming YUV420 (ss_x == 1, ss_y == 1)
+ const BLOCK_SIZE plane_bsize = ss_size_lookup[bsize][1][1];
+ return MIN(y_tx_size, max_txsize_lookup[plane_bsize]);
+ }
+}
-static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
- return MIN(mbmi->tx_size, max_uv_txsize_lookup[mbmi->sb_type]);
+static TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi) {
+ return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type);
}
static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
@@ -298,16 +325,6 @@ static BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
return bs;
}
-static INLINE int plane_block_width(BLOCK_SIZE bsize,
- const struct macroblockd_plane* plane) {
- return 4 << (b_width_log2(bsize) - plane->subsampling_x);
-}
-
-static INLINE int plane_block_height(BLOCK_SIZE bsize,
- const struct macroblockd_plane* plane) {
- return 4 << (b_height_log2(bsize) - plane->subsampling_y);
-}
-
typedef void (*foreach_transformed_block_visitor)(int plane, int block,
BLOCK_SIZE plane_bsize,
TX_SIZE tx_size,
@@ -381,35 +398,6 @@ static INLINE void foreach_transformed_block_uv(
foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
}
-static int raster_block_offset(BLOCK_SIZE plane_bsize,
- int raster_block, int stride) {
- const int bw = b_width_log2(plane_bsize);
- const int y = 4 * (raster_block >> bw);
- const int x = 4 * (raster_block & ((1 << bw) - 1));
- return y * stride + x;
-}
-static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
- int raster_block, int16_t *base) {
- const int stride = 4 << b_width_log2(plane_bsize);
- return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize,
- int raster_block, uint8_t *base,
- int stride) {
- return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-
-static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, int block) {
- const int bwl = b_width_log2(plane_bsize);
- const int tx_cols_log2 = bwl - tx_size;
- const int tx_cols = 1 << tx_cols_log2;
- const int raster_mb = block >> (tx_size << 1);
- const int x = (raster_mb & (tx_cols - 1)) << tx_size;
- const int y = (raster_mb >> tx_cols_log2) << tx_size;
- return x + (y << bwl);
-}
-
static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, int block,
int *x, int *y) {
@@ -421,22 +409,19 @@ static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
*y = (raster_mb >> tx_cols_log2) << tx_size;
}
-static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
- int plane, int block, TX_SIZE tx_size) {
+static void extend_for_intra(MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
+ int plane, int aoff, int loff) {
struct macroblockd_plane *const pd = &xd->plane[plane];
uint8_t *const buf = pd->dst.buf;
const int stride = pd->dst.stride;
-
- int x, y;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
- x = x * 4 - 1;
- y = y * 4 - 1;
+ const int x = aoff * 4 - 1;
+ const int y = loff * 4 - 1;
// Copy a pixel into the umv if we are in a situation where the block size
// extends into the UMV.
// TODO(JBB): Should be able to do the full extend in place so we don't have
// to do this multiple times.
if (xd->mb_to_right_edge < 0) {
- const int bw = 4 << b_width_log2(plane_bsize);
+ const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int umv_border_start = bw + (xd->mb_to_right_edge >>
(3 + pd->subsampling_x));
@@ -447,7 +432,7 @@ static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
if (xd->mb_to_bottom_edge < 0) {
if (xd->left_available || x >= 0) {
- const int bh = 4 << b_height_log2(plane_bsize);
+ const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
const int umv_border_start =
bh + (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y));
@@ -461,57 +446,46 @@ static void extend_for_intra(MACROBLOCKD* const xd, BLOCK_SIZE plane_bsize,
}
}
}
-static void set_contexts_on_border(MACROBLOCKD *xd,
- struct macroblockd_plane *pd,
- BLOCK_SIZE plane_bsize,
- int tx_size_in_blocks, int has_eob,
- int aoff, int loff,
- ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) {
- int mi_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
- int mi_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
- int above_contexts = tx_size_in_blocks;
- int left_contexts = tx_size_in_blocks;
- int pt;
-
- // xd->mb_to_right_edge is in units of pixels * 8. This converts
- // it to 4x4 block sizes.
- if (xd->mb_to_right_edge < 0)
- mi_blocks_wide += (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-
- if (xd->mb_to_bottom_edge < 0)
- mi_blocks_high += (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
-
- // this code attempts to avoid copying into contexts that are outside
- // our border. Any blocks that do are set to 0...
- if (above_contexts + aoff > mi_blocks_wide)
- above_contexts = mi_blocks_wide - aoff;
-
- if (left_contexts + loff > mi_blocks_high)
- left_contexts = mi_blocks_high - loff;
-
- for (pt = 0; pt < above_contexts; pt++)
- A[pt] = has_eob;
- for (pt = above_contexts; pt < tx_size_in_blocks; pt++)
- A[pt] = 0;
- for (pt = 0; pt < left_contexts; pt++)
- L[pt] = has_eob;
- for (pt = left_contexts; pt < tx_size_in_blocks; pt++)
- L[pt] = 0;
-}
-static void set_contexts(MACROBLOCKD *xd, struct macroblockd_plane *pd,
+static void set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
int has_eob, int aoff, int loff) {
- ENTROPY_CONTEXT *const A = pd->above_context + aoff;
- ENTROPY_CONTEXT *const L = pd->left_context + loff;
+ ENTROPY_CONTEXT *const a = pd->above_context + aoff;
+ ENTROPY_CONTEXT *const l = pd->left_context + loff;
const int tx_size_in_blocks = 1 << tx_size;
- if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) {
- set_contexts_on_border(xd, pd, plane_bsize, tx_size_in_blocks, has_eob,
- aoff, loff, A, L);
+ // above
+ if (has_eob && xd->mb_to_right_edge < 0) {
+ int i;
+ const int blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize] +
+ (xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ int above_contexts = tx_size_in_blocks;
+ if (above_contexts + aoff > blocks_wide)
+ above_contexts = blocks_wide - aoff;
+
+ for (i = 0; i < above_contexts; ++i)
+ a[i] = has_eob;
+ for (i = above_contexts; i < tx_size_in_blocks; ++i)
+ a[i] = 0;
+ } else {
+ vpx_memset(a, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ }
+
+ // left
+ if (has_eob && xd->mb_to_bottom_edge < 0) {
+ int i;
+ const int blocks_high = num_4x4_blocks_high_lookup[plane_bsize] +
+ (xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+ int left_contexts = tx_size_in_blocks;
+ if (left_contexts + loff > blocks_high)
+ left_contexts = blocks_high - loff;
+
+ for (i = 0; i < left_contexts; ++i)
+ l[i] = has_eob;
+ for (i = left_contexts; i < tx_size_in_blocks; ++i)
+ l[i] = 0;
} else {
- vpx_memset(A, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
- vpx_memset(L, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
+ vpx_memset(l, has_eob, sizeof(ENTROPY_CONTEXT) * tx_size_in_blocks);
}
}
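
The rewritten set_contexts() above only clamps the number of above/left entropy contexts when the block actually crosses the right or bottom frame edge; in the common interior case it falls back to two memsets. Per the removed comment, mb_to_right_edge is stored in units of pixels * 8, so shifting right by 5 (plus the plane's subsampling shift) converts it to 4x4-block columns. The stand-alone check below walks through that conversion with made-up numbers; it assumes, as the source does, an arithmetic right shift of negative values.

/* Sketch: converting the (negative) distance past the frame edge into a
 * count of 4x4 columns, as done in set_contexts() above.  The inputs are
 * invented for illustration. */
#include <stdio.h>

int main(void) {
  const int mb_to_right_edge = -16 * 8;  /* block extends 16 pixels past the edge */
  const int subsampling_x = 0;           /* luma plane */
  /* >> 3 gives pixels, >> 2 more gives 4x4 columns; chroma adds its shift. */
  const int missing_4x4_cols = -(mb_to_right_edge >> (5 + subsampling_x));
  printf("%d 4x4 columns fall outside the frame\n", missing_4x4_cols);  /* prints 4 */
  return 0;
}
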
diff --git a/source/libvpx/vp9/common/vp9_common_data.c b/source/libvpx/vp9/common/vp9_common_data.c
index f858900..886c0af 100644
--- a/source/libvpx/vp9/common/vp9_common_data.c
+++ b/source/libvpx/vp9/common/vp9_common_data.c
@@ -108,12 +108,6 @@ const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
TX_16X16, TX_16X16, TX_16X16,
TX_32X32, TX_32X32, TX_32X32, TX_32X32
};
-const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES] = {
- TX_4X4, TX_4X4, TX_4X4,
- TX_4X4, TX_4X4, TX_4X4,
- TX_8X8, TX_8X8, TX_8X8,
- TX_16X16, TX_16X16, TX_16X16, TX_32X32
-};
const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
TX_4X4, // ONLY_4X4
@@ -123,8 +117,6 @@ const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
TX_32X32, // TX_MODE_SELECT
};
-
-
const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
// ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
// ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
@@ -143,4 +135,24 @@ const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
{{BLOCK_64X64, BLOCK_64X32}, {BLOCK_32X64, BLOCK_32X32}},
};
-
+// Generates a 4-bit field in which each bit set to 1 represents
+// a block-size partition: 1111 means we split the 64x64, 32x32, 16x16
+// and 8x8 levels; 1000 means we only split the 64x64 into 32x32.
+const struct {
+ PARTITION_CONTEXT above;
+ PARTITION_CONTEXT left;
+} partition_context_lookup[BLOCK_SIZES]= {
+ {15, 15}, // 4X4 - {0b1111, 0b1111}
+ {15, 14}, // 4X8 - {0b1111, 0b1110}
+ {14, 15}, // 8X4 - {0b1110, 0b1111}
+ {14, 14}, // 8X8 - {0b1110, 0b1110}
+ {14, 12}, // 8X16 - {0b1110, 0b1100}
+ {12, 14}, // 16X8 - {0b1100, 0b1110}
+ {12, 12}, // 16X16 - {0b1100, 0b1100}
+ {12, 8 }, // 16X32 - {0b1100, 0b1000}
+ {8, 12}, // 32X16 - {0b1000, 0b1100}
+ {8, 8 }, // 32X32 - {0b1000, 0b1000}
+ {8, 0 }, // 32X64 - {0b1000, 0b0000}
+ {0, 8 }, // 64X32 - {0b0000, 0b1000}
+ {0, 0 }, // 64X64 - {0b0000, 0b0000}
+};
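
The partition_context_lookup[] values above follow a simple pattern: above is 16 minus the block width in 4-pel units and left is 16 minus the block height, which is exactly what packs the "split at this level" bits described in the comment. The small check below verifies a few entries; the expected numbers are copied from the table, everything else is illustrative.

/* Sketch: verify the 16 - (dimension in 4-pel units) pattern behind
 * partition_context_lookup[] for 4X4, 8X16, 32X32 and 64X64. */
#include <assert.h>

int main(void) {
  const int w[]     = {1, 2, 8, 16};   /* width in 4-pel units */
  const int h[]     = {1, 4, 8, 16};   /* height in 4-pel units */
  const int above[] = {15, 14, 8, 0};  /* values from the table above */
  const int left[]  = {15, 12, 8, 0};
  for (int i = 0; i < 4; ++i) {
    assert(above[i] == 16 - w[i]);
    assert(left[i]  == 16 - h[i]);
  }
  return 0;
}
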
diff --git a/source/libvpx/vp9/common/vp9_common_data.h b/source/libvpx/vp9/common/vp9_common_data.h
index c1f6405..b6fc70a 100644
--- a/source/libvpx/vp9/common/vp9_common_data.h
+++ b/source/libvpx/vp9/common/vp9_common_data.h
@@ -26,7 +26,6 @@ extern const int num_pels_log2_lookup[BLOCK_SIZES];
extern const PARTITION_TYPE partition_lookup[][BLOCK_SIZES];
extern const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES];
extern const TX_SIZE max_txsize_lookup[BLOCK_SIZES];
-extern const TX_SIZE max_uv_txsize_lookup[BLOCK_SIZES];
extern const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES];
extern const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2];
diff --git a/source/libvpx/vp9/common/vp9_convolve.c b/source/libvpx/vp9/common/vp9_convolve.c
index a2d864c..6edf7ea 100644
--- a/source/libvpx/vp9/common/vp9_convolve.c
+++ b/source/libvpx/vp9/common/vp9_convolve.c
@@ -18,40 +18,21 @@
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
-static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x0, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int taps) {
- int x, y, k;
-
- /* NOTE: This assumes that the filter table is 256-byte aligned. */
- /* TODO(agrange) Modify to make independent of table alignment. */
- const int16_t *const filter_x_base =
- (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
-
- /* Adjust base pointer address for this source line */
- src -= taps / 2 - 1;
-
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const subpel_kernel *x_filters,
+ int x0_q4, int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
- /* Initial phase offset */
- int x_q4 = (int)(filter_x0 - filter_x_base) / taps;
-
+ int x_q4 = x0_q4;
for (x = 0; x < w; ++x) {
- /* Per-pixel src offset */
- const int src_x = x_q4 >> SUBPEL_BITS;
- int sum = 0;
-
- /* Pointer to filter to use */
- const int16_t *const filter_x = filter_x_base +
- (x_q4 & SUBPEL_MASK) * taps;
-
- for (k = 0; k < taps; ++k)
- sum += src[src_x + k] * filter_x[k];
-
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-
- /* Move to the next source pixel */
x_q4 += x_step_q4;
}
src += src_stride;
@@ -59,41 +40,22 @@ static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x0, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int taps) {
- int x, y, k;
-
- /* NOTE: This assumes that the filter table is 256-byte aligned. */
- /* TODO(agrange) Modify to make independent of table alignment. */
- const int16_t *const filter_x_base =
- (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
-
- /* Adjust base pointer address for this source line */
- src -= taps / 2 - 1;
-
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const subpel_kernel *x_filters,
+ int x0_q4, int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
for (y = 0; y < h; ++y) {
- /* Initial phase offset */
- int x_q4 = (int)(filter_x0 - filter_x_base) / taps;
-
+ int x_q4 = x0_q4;
for (x = 0; x < w; ++x) {
- /* Per-pixel src offset */
- const int src_x = x_q4 >> SUBPEL_BITS;
- int sum = 0;
-
- /* Pointer to filter to use */
- const int16_t *const filter_x = filter_x_base +
- (x_q4 & SUBPEL_MASK) * taps;
-
- for (k = 0; k < taps; ++k)
- sum += src[src_x + k] * filter_x[k];
-
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
dst[x] = ROUND_POWER_OF_TWO(dst[x] +
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
-
- /* Move to the next source pixel */
+ clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
x_q4 += x_step_q4;
}
src += src_stride;
@@ -101,41 +63,22 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y0, int y_step_q4,
- int w, int h, int taps) {
- int x, y, k;
-
- /* NOTE: This assumes that the filter table is 256-byte aligned. */
- /* TODO(agrange) Modify to make independent of table alignment. */
- const int16_t *const filter_y_base =
- (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
-
- /* Adjust base pointer address for this source column */
- src -= src_stride * (taps / 2 - 1);
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const subpel_kernel *y_filters,
+ int y0_q4, int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
for (x = 0; x < w; ++x) {
- /* Initial phase offset */
- int y_q4 = (int)(filter_y0 - filter_y_base) / taps;
-
+ int y_q4 = y0_q4;
for (y = 0; y < h; ++y) {
- /* Per-pixel src offset */
- const int src_y = y_q4 >> SUBPEL_BITS;
- int sum = 0;
-
- /* Pointer to filter to use */
- const int16_t *const filter_y = filter_y_base +
- (y_q4 & SUBPEL_MASK) * taps;
-
- for (k = 0; k < taps; ++k)
- sum += src[(src_y + k) * src_stride] * filter_y[k];
-
- dst[y * dst_stride] =
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-
- /* Move to the next source pixel */
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
+ dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
y_q4 += y_step_q4;
}
++src;
@@ -143,41 +86,23 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y0, int y_step_q4,
- int w, int h, int taps) {
- int x, y, k;
-
- /* NOTE: This assumes that the filter table is 256-byte aligned. */
- /* TODO(agrange) Modify to make independent of table alignment. */
- const int16_t *const filter_y_base =
- (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
-
- /* Adjust base pointer address for this source column */
- src -= src_stride * (taps / 2 - 1);
+static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const subpel_kernel *y_filters,
+ int y0_q4, int y_step_q4, int w, int h) {
+ int x, y;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
for (x = 0; x < w; ++x) {
- /* Initial phase offset */
- int y_q4 = (int)(filter_y0 - filter_y_base) / taps;
-
+ int y_q4 = y0_q4;
for (y = 0; y < h; ++y) {
- /* Per-pixel src offset */
- const int src_y = y_q4 >> SUBPEL_BITS;
- int sum = 0;
-
- /* Pointer to filter to use */
- const int16_t *const filter_y = filter_y_base +
- (y_q4 & SUBPEL_MASK) * taps;
-
- for (k = 0; k < taps; ++k)
- sum += src[(src_y + k) * src_stride] * filter_y[k];
-
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k)
+ sum += src_y[k * src_stride] * y_filter[k];
dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
-
- /* Move to the next source pixel */
+ clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
y_q4 += y_step_q4;
}
++src;
@@ -185,33 +110,42 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
}
}
-static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int taps) {
- /* Fixed size intermediate buffer places limits on parameters.
- * Maximum intermediate_height is 324, for y_step_q4 == 80,
- * h == 64, taps == 8.
- * y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
- */
+static void convolve(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const subpel_kernel *const x_filters,
+ int x0_q4, int x_step_q4,
+ const subpel_kernel *const y_filters,
+ int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Fixed size intermediate buffer places limits on parameters.
+ // Maximum intermediate_height is 324, for y_step_q4 == 80,
+ // h == 64, taps == 8.
+ // y_step_q4 of 80 allows for 1/10 scale for 5 layer svc
uint8_t temp[64 * 324];
- int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + taps;
+ int intermediate_height = (((h - 1) * y_step_q4 + 15) >> 4) + SUBPEL_TAPS;
assert(w <= 64);
assert(h <= 64);
- assert(taps <= 8);
assert(y_step_q4 <= 80);
assert(x_step_q4 <= 80);
if (intermediate_height < h)
intermediate_height = h;
- convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64,
- filter_x, x_step_q4, filter_y, y_step_q4, w,
- intermediate_height, taps);
- convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h, taps);
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+ x_filters, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+ y_filters, y0_q4, y_step_q4, w, h);
+}
+
+static const subpel_kernel *get_filter_base(const int16_t *filter) {
+ // NOTE: This assumes that the filter table is 256-byte aligned.
+ // TODO(agrange) Modify to make independent of table alignment.
+ return (const subpel_kernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static int get_filter_offset(const int16_t *f, const subpel_kernel *base) {
+ return (const subpel_kernel *)(intptr_t)f - base;
}
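
The two helpers above let the unchanged vp9_convolve8_* prototypes keep passing a bare int16_t pointer into the kernel table: since the table is 256-byte aligned and 16 kernels of 8 int16_t taps occupy exactly 256 bytes, masking the low 8 bits of the address recovers the table base, and the distance from that base measured in whole kernels recovers the initial phase (x0_q4 or y0_q4). A minimal sketch follows; the subpel_kernel typedef, the table and the GCC-style alignment attribute are assumptions of the example, not part of the patch.

/* Sketch only; the typedef and table below are assumptions of this example. */
#include <assert.h>
#include <stdint.h>

typedef int16_t subpel_kernel[8];

int main(void) {
  static subpel_kernel table[16] __attribute__((aligned(256)));
  const int16_t *filter = table[5];  /* what a caller would pass as filter_x */

  const subpel_kernel *base =
      (const subpel_kernel *)(((intptr_t)filter) & ~(intptr_t)0xFF);
  const int offset = (int)((const subpel_kernel *)(intptr_t)filter - base);

  assert(base == (const subpel_kernel *)table);
  assert(offset == 5);  /* recovered initial phase, i.e. x0_q4 */
  return 0;
}
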
void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -219,8 +153,11 @@ void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- convolve_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
+ const subpel_kernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ convolve_horiz(src, src_stride, dst, dst_stride, filters_x,
+ x0_q4, x_step_q4, w, h);
}
void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -228,8 +165,11 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- convolve_avg_horiz_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
+ const subpel_kernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x,
+ x0_q4, x_step_q4, w, h);
}
void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -237,8 +177,10 @@ void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- convolve_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
+ const subpel_kernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+ convolve_vert(src, src_stride, dst, dst_stride, filters_y,
+ y0_q4, y_step_q4, w, h);
}
void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -246,8 +188,10 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- convolve_avg_vert_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
+ const subpel_kernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+ convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y,
+ y0_q4, y_step_q4, w, h);
}
void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -255,8 +199,15 @@ void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
- convolve_c(src, src_stride, dst, dst_stride,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h, 8);
+ const subpel_kernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ const subpel_kernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ convolve(src, src_stride, dst, dst_stride,
+ filters_x, x0_q4, x_step_q4,
+ filters_y, y0_q4, y_step_q4, w, h);
}
void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -269,9 +220,9 @@ void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
assert(w <= 64);
assert(h <= 64);
- vp9_convolve8(src, src_stride, temp, 64,
- filter_x, x_step_q4, filter_y, y_step_q4, w, h);
- vp9_convolve_avg(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+ vp9_convolve8_c(src, src_stride, temp, 64,
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h);
+ vp9_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
}
void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
diff --git a/source/libvpx/vp9/common/vp9_default_coef_probs.h b/source/libvpx/vp9/common/vp9_default_coef_probs.h
deleted file mode 100644
index 3b512be..0000000
--- a/source/libvpx/vp9/common/vp9_default_coef_probs.h
+++ /dev/null
@@ -1,699 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
-*/
-#ifndef VP9_COMMON_DEFAULT_COEF_PROBS_H_
-#define VP9_COMMON_DEFAULT_COEF_PROBS_H_
-
-/*Generated file, included by vp9_entropy.c*/
-static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = {
- { /* block Type 0 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 195, 29, 183 },
- { 84, 49, 136 },
- { 8, 42, 71 }
- }, { /* Coeff Band 1 */
- { 31, 107, 169 },
- { 35, 99, 159 },
- { 17, 82, 140 },
- { 8, 66, 114 },
- { 2, 44, 76 },
- { 1, 19, 32 }
- }, { /* Coeff Band 2 */
- { 40, 132, 201 },
- { 29, 114, 187 },
- { 13, 91, 157 },
- { 7, 75, 127 },
- { 3, 58, 95 },
- { 1, 28, 47 }
- }, { /* Coeff Band 3 */
- { 69, 142, 221 },
- { 42, 122, 201 },
- { 15, 91, 159 },
- { 6, 67, 121 },
- { 1, 42, 77 },
- { 1, 17, 31 }
- }, { /* Coeff Band 4 */
- { 102, 148, 228 },
- { 67, 117, 204 },
- { 17, 82, 154 },
- { 6, 59, 114 },
- { 2, 39, 75 },
- { 1, 15, 29 }
- }, { /* Coeff Band 5 */
- { 156, 57, 233 },
- { 119, 57, 212 },
- { 58, 48, 163 },
- { 29, 40, 124 },
- { 12, 30, 81 },
- { 3, 12, 31 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 191, 107, 226 },
- { 124, 117, 204 },
- { 25, 99, 155 }
- }, { /* Coeff Band 1 */
- { 29, 148, 210 },
- { 37, 126, 194 },
- { 8, 93, 157 },
- { 2, 68, 118 },
- { 1, 39, 69 },
- { 1, 17, 33 }
- }, { /* Coeff Band 2 */
- { 41, 151, 213 },
- { 27, 123, 193 },
- { 3, 82, 144 },
- { 1, 58, 105 },
- { 1, 32, 60 },
- { 1, 13, 26 }
- }, { /* Coeff Band 3 */
- { 59, 159, 220 },
- { 23, 126, 198 },
- { 4, 88, 151 },
- { 1, 66, 114 },
- { 1, 38, 71 },
- { 1, 18, 34 }
- }, { /* Coeff Band 4 */
- { 114, 136, 232 },
- { 51, 114, 207 },
- { 11, 83, 155 },
- { 3, 56, 105 },
- { 1, 33, 65 },
- { 1, 17, 34 }
- }, { /* Coeff Band 5 */
- { 149, 65, 234 },
- { 121, 57, 215 },
- { 61, 49, 166 },
- { 28, 36, 114 },
- { 12, 25, 76 },
- { 3, 16, 42 }
- }
- }
- }, { /* block Type 1 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 214, 49, 220 },
- { 132, 63, 188 },
- { 42, 65, 137 }
- }, { /* Coeff Band 1 */
- { 85, 137, 221 },
- { 104, 131, 216 },
- { 49, 111, 192 },
- { 21, 87, 155 },
- { 2, 49, 87 },
- { 1, 16, 28 }
- }, { /* Coeff Band 2 */
- { 89, 163, 230 },
- { 90, 137, 220 },
- { 29, 100, 183 },
- { 10, 70, 135 },
- { 2, 42, 81 },
- { 1, 17, 33 }
- }, { /* Coeff Band 3 */
- { 108, 167, 237 },
- { 55, 133, 222 },
- { 15, 97, 179 },
- { 4, 72, 135 },
- { 1, 45, 85 },
- { 1, 19, 38 }
- }, { /* Coeff Band 4 */
- { 124, 146, 240 },
- { 66, 124, 224 },
- { 17, 88, 175 },
- { 4, 58, 122 },
- { 1, 36, 75 },
- { 1, 18, 37 }
- }, { /* Coeff Band 5 */
- { 141, 79, 241 },
- { 126, 70, 227 },
- { 66, 58, 182 },
- { 30, 44, 136 },
- { 12, 34, 96 },
- { 2, 20, 47 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 229, 99, 249 },
- { 143, 111, 235 },
- { 46, 109, 192 }
- }, { /* Coeff Band 1 */
- { 82, 158, 236 },
- { 94, 146, 224 },
- { 25, 117, 191 },
- { 9, 87, 149 },
- { 3, 56, 99 },
- { 1, 33, 57 }
- }, { /* Coeff Band 2 */
- { 83, 167, 237 },
- { 68, 145, 222 },
- { 10, 103, 177 },
- { 2, 72, 131 },
- { 1, 41, 79 },
- { 1, 20, 39 }
- }, { /* Coeff Band 3 */
- { 99, 167, 239 },
- { 47, 141, 224 },
- { 10, 104, 178 },
- { 2, 73, 133 },
- { 1, 44, 85 },
- { 1, 22, 47 }
- }, { /* Coeff Band 4 */
- { 127, 145, 243 },
- { 71, 129, 228 },
- { 17, 93, 177 },
- { 3, 61, 124 },
- { 1, 41, 84 },
- { 1, 21, 52 }
- }, { /* Coeff Band 5 */
- { 157, 78, 244 },
- { 140, 72, 231 },
- { 69, 58, 184 },
- { 31, 44, 137 },
- { 14, 38, 105 },
- { 8, 23, 61 }
- }
- }
- }
-};
-static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = {
- { /* block Type 0 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 125, 34, 187 },
- { 52, 41, 133 },
- { 6, 31, 56 }
- }, { /* Coeff Band 1 */
- { 37, 109, 153 },
- { 51, 102, 147 },
- { 23, 87, 128 },
- { 8, 67, 101 },
- { 1, 41, 63 },
- { 1, 19, 29 }
- }, { /* Coeff Band 2 */
- { 31, 154, 185 },
- { 17, 127, 175 },
- { 6, 96, 145 },
- { 2, 73, 114 },
- { 1, 51, 82 },
- { 1, 28, 45 }
- }, { /* Coeff Band 3 */
- { 23, 163, 200 },
- { 10, 131, 185 },
- { 2, 93, 148 },
- { 1, 67, 111 },
- { 1, 41, 69 },
- { 1, 14, 24 }
- }, { /* Coeff Band 4 */
- { 29, 176, 217 },
- { 12, 145, 201 },
- { 3, 101, 156 },
- { 1, 69, 111 },
- { 1, 39, 63 },
- { 1, 14, 23 }
- }, { /* Coeff Band 5 */
- { 57, 192, 233 },
- { 25, 154, 215 },
- { 6, 109, 167 },
- { 3, 78, 118 },
- { 1, 48, 69 },
- { 1, 21, 29 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 202, 105, 245 },
- { 108, 106, 216 },
- { 18, 90, 144 }
- }, { /* Coeff Band 1 */
- { 33, 172, 219 },
- { 64, 149, 206 },
- { 14, 117, 177 },
- { 5, 90, 141 },
- { 2, 61, 95 },
- { 1, 37, 57 }
- }, { /* Coeff Band 2 */
- { 33, 179, 220 },
- { 11, 140, 198 },
- { 1, 89, 148 },
- { 1, 60, 104 },
- { 1, 33, 57 },
- { 1, 12, 21 }
- }, { /* Coeff Band 3 */
- { 30, 181, 221 },
- { 8, 141, 198 },
- { 1, 87, 145 },
- { 1, 58, 100 },
- { 1, 31, 55 },
- { 1, 12, 20 }
- }, { /* Coeff Band 4 */
- { 32, 186, 224 },
- { 7, 142, 198 },
- { 1, 86, 143 },
- { 1, 58, 100 },
- { 1, 31, 55 },
- { 1, 12, 22 }
- }, { /* Coeff Band 5 */
- { 57, 192, 227 },
- { 20, 143, 204 },
- { 3, 96, 154 },
- { 1, 68, 112 },
- { 1, 42, 69 },
- { 1, 19, 32 }
- }
- }
- }, { /* block Type 1 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 212, 35, 215 },
- { 113, 47, 169 },
- { 29, 48, 105 }
- }, { /* Coeff Band 1 */
- { 74, 129, 203 },
- { 106, 120, 203 },
- { 49, 107, 178 },
- { 19, 84, 144 },
- { 4, 50, 84 },
- { 1, 15, 25 }
- }, { /* Coeff Band 2 */
- { 71, 172, 217 },
- { 44, 141, 209 },
- { 15, 102, 173 },
- { 6, 76, 133 },
- { 2, 51, 89 },
- { 1, 24, 42 }
- }, { /* Coeff Band 3 */
- { 64, 185, 231 },
- { 31, 148, 216 },
- { 8, 103, 175 },
- { 3, 74, 131 },
- { 1, 46, 81 },
- { 1, 18, 30 }
- }, { /* Coeff Band 4 */
- { 65, 196, 235 },
- { 25, 157, 221 },
- { 5, 105, 174 },
- { 1, 67, 120 },
- { 1, 38, 69 },
- { 1, 15, 30 }
- }, { /* Coeff Band 5 */
- { 65, 204, 238 },
- { 30, 156, 224 },
- { 7, 107, 177 },
- { 2, 70, 124 },
- { 1, 42, 73 },
- { 1, 18, 34 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 225, 86, 251 },
- { 144, 104, 235 },
- { 42, 99, 181 }
- }, { /* Coeff Band 1 */
- { 85, 175, 239 },
- { 112, 165, 229 },
- { 29, 136, 200 },
- { 12, 103, 162 },
- { 6, 77, 123 },
- { 2, 53, 84 }
- }, { /* Coeff Band 2 */
- { 75, 183, 239 },
- { 30, 155, 221 },
- { 3, 106, 171 },
- { 1, 74, 128 },
- { 1, 44, 76 },
- { 1, 17, 28 }
- }, { /* Coeff Band 3 */
- { 73, 185, 240 },
- { 27, 159, 222 },
- { 2, 107, 172 },
- { 1, 75, 127 },
- { 1, 42, 73 },
- { 1, 17, 29 }
- }, { /* Coeff Band 4 */
- { 62, 190, 238 },
- { 21, 159, 222 },
- { 2, 107, 172 },
- { 1, 72, 122 },
- { 1, 40, 71 },
- { 1, 18, 32 }
- }, { /* Coeff Band 5 */
- { 61, 199, 240 },
- { 27, 161, 226 },
- { 4, 113, 180 },
- { 1, 76, 129 },
- { 1, 46, 80 },
- { 1, 23, 41 }
- }
- }
- }
-};
-static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = {
- { /* block Type 0 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 7, 27, 153 },
- { 5, 30, 95 },
- { 1, 16, 30 }
- }, { /* Coeff Band 1 */
- { 50, 75, 127 },
- { 57, 75, 124 },
- { 27, 67, 108 },
- { 10, 54, 86 },
- { 1, 33, 52 },
- { 1, 12, 18 }
- }, { /* Coeff Band 2 */
- { 43, 125, 151 },
- { 26, 108, 148 },
- { 7, 83, 122 },
- { 2, 59, 89 },
- { 1, 38, 60 },
- { 1, 17, 27 }
- }, { /* Coeff Band 3 */
- { 23, 144, 163 },
- { 13, 112, 154 },
- { 2, 75, 117 },
- { 1, 50, 81 },
- { 1, 31, 51 },
- { 1, 14, 23 }
- }, { /* Coeff Band 4 */
- { 18, 162, 185 },
- { 6, 123, 171 },
- { 1, 78, 125 },
- { 1, 51, 86 },
- { 1, 31, 54 },
- { 1, 14, 23 }
- }, { /* Coeff Band 5 */
- { 15, 199, 227 },
- { 3, 150, 204 },
- { 1, 91, 146 },
- { 1, 55, 95 },
- { 1, 30, 53 },
- { 1, 11, 20 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 19, 55, 240 },
- { 19, 59, 196 },
- { 3, 52, 105 }
- }, { /* Coeff Band 1 */
- { 41, 166, 207 },
- { 104, 153, 199 },
- { 31, 123, 181 },
- { 14, 101, 152 },
- { 5, 72, 106 },
- { 1, 36, 52 }
- }, { /* Coeff Band 2 */
- { 35, 176, 211 },
- { 12, 131, 190 },
- { 2, 88, 144 },
- { 1, 60, 101 },
- { 1, 36, 60 },
- { 1, 16, 28 }
- }, { /* Coeff Band 3 */
- { 28, 183, 213 },
- { 8, 134, 191 },
- { 1, 86, 142 },
- { 1, 56, 96 },
- { 1, 30, 53 },
- { 1, 12, 20 }
- }, { /* Coeff Band 4 */
- { 20, 190, 215 },
- { 4, 135, 192 },
- { 1, 84, 139 },
- { 1, 53, 91 },
- { 1, 28, 49 },
- { 1, 11, 20 }
- }, { /* Coeff Band 5 */
- { 13, 196, 216 },
- { 2, 137, 192 },
- { 1, 86, 143 },
- { 1, 57, 99 },
- { 1, 32, 56 },
- { 1, 13, 24 }
- }
- }
- }, { /* block Type 1 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 211, 29, 217 },
- { 96, 47, 156 },
- { 22, 43, 87 }
- }, { /* Coeff Band 1 */
- { 78, 120, 193 },
- { 111, 116, 186 },
- { 46, 102, 164 },
- { 15, 80, 128 },
- { 2, 49, 76 },
- { 1, 18, 28 }
- }, { /* Coeff Band 2 */
- { 71, 161, 203 },
- { 42, 132, 192 },
- { 10, 98, 150 },
- { 3, 69, 109 },
- { 1, 44, 70 },
- { 1, 18, 29 }
- }, { /* Coeff Band 3 */
- { 57, 186, 211 },
- { 30, 140, 196 },
- { 4, 93, 146 },
- { 1, 62, 102 },
- { 1, 38, 65 },
- { 1, 16, 27 }
- }, { /* Coeff Band 4 */
- { 47, 199, 217 },
- { 14, 145, 196 },
- { 1, 88, 142 },
- { 1, 57, 98 },
- { 1, 36, 62 },
- { 1, 15, 26 }
- }, { /* Coeff Band 5 */
- { 26, 219, 229 },
- { 5, 155, 207 },
- { 1, 94, 151 },
- { 1, 60, 104 },
- { 1, 36, 62 },
- { 1, 16, 28 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 233, 29, 248 },
- { 146, 47, 220 },
- { 43, 52, 140 }
- }, { /* Coeff Band 1 */
- { 100, 163, 232 },
- { 179, 161, 222 },
- { 63, 142, 204 },
- { 37, 113, 174 },
- { 26, 89, 137 },
- { 18, 68, 97 }
- }, { /* Coeff Band 2 */
- { 85, 181, 230 },
- { 32, 146, 209 },
- { 7, 100, 164 },
- { 3, 71, 121 },
- { 1, 45, 77 },
- { 1, 18, 30 }
- }, { /* Coeff Band 3 */
- { 65, 187, 230 },
- { 20, 148, 207 },
- { 2, 97, 159 },
- { 1, 68, 116 },
- { 1, 40, 70 },
- { 1, 14, 29 }
- }, { /* Coeff Band 4 */
- { 40, 194, 227 },
- { 8, 147, 204 },
- { 1, 94, 155 },
- { 1, 65, 112 },
- { 1, 39, 66 },
- { 1, 14, 26 }
- }, { /* Coeff Band 5 */
- { 16, 208, 228 },
- { 3, 151, 207 },
- { 1, 98, 160 },
- { 1, 67, 117 },
- { 1, 41, 74 },
- { 1, 17, 31 }
- }
- }
- }
-};
-static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = {
- { /* block Type 0 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 17, 38, 140 },
- { 7, 34, 80 },
- { 1, 17, 29 }
- }, { /* Coeff Band 1 */
- { 37, 75, 128 },
- { 41, 76, 128 },
- { 26, 66, 116 },
- { 12, 52, 94 },
- { 2, 32, 55 },
- { 1, 10, 16 }
- }, { /* Coeff Band 2 */
- { 50, 127, 154 },
- { 37, 109, 152 },
- { 16, 82, 121 },
- { 5, 59, 85 },
- { 1, 35, 54 },
- { 1, 13, 20 }
- }, { /* Coeff Band 3 */
- { 40, 142, 167 },
- { 17, 110, 157 },
- { 2, 71, 112 },
- { 1, 44, 72 },
- { 1, 27, 45 },
- { 1, 11, 17 }
- }, { /* Coeff Band 4 */
- { 30, 175, 188 },
- { 9, 124, 169 },
- { 1, 74, 116 },
- { 1, 48, 78 },
- { 1, 30, 49 },
- { 1, 11, 18 }
- }, { /* Coeff Band 5 */
- { 10, 222, 223 },
- { 2, 150, 194 },
- { 1, 83, 128 },
- { 1, 48, 79 },
- { 1, 27, 45 },
- { 1, 11, 17 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 36, 41, 235 },
- { 29, 36, 193 },
- { 10, 27, 111 }
- }, { /* Coeff Band 1 */
- { 85, 165, 222 },
- { 177, 162, 215 },
- { 110, 135, 195 },
- { 57, 113, 168 },
- { 23, 83, 120 },
- { 10, 49, 61 }
- }, { /* Coeff Band 2 */
- { 85, 190, 223 },
- { 36, 139, 200 },
- { 5, 90, 146 },
- { 1, 60, 103 },
- { 1, 38, 65 },
- { 1, 18, 30 }
- }, { /* Coeff Band 3 */
- { 72, 202, 223 },
- { 23, 141, 199 },
- { 2, 86, 140 },
- { 1, 56, 97 },
- { 1, 36, 61 },
- { 1, 16, 27 }
- }, { /* Coeff Band 4 */
- { 55, 218, 225 },
- { 13, 145, 200 },
- { 1, 86, 141 },
- { 1, 57, 99 },
- { 1, 35, 61 },
- { 1, 13, 22 }
- }, { /* Coeff Band 5 */
- { 15, 235, 212 },
- { 1, 132, 184 },
- { 1, 84, 139 },
- { 1, 57, 97 },
- { 1, 34, 56 },
- { 1, 14, 23 }
- }
- }
- }, { /* block Type 1 */
- { /* Intra */
- { /* Coeff Band 0 */
- { 181, 21, 201 },
- { 61, 37, 123 },
- { 10, 38, 71 }
- }, { /* Coeff Band 1 */
- { 47, 106, 172 },
- { 95, 104, 173 },
- { 42, 93, 159 },
- { 18, 77, 131 },
- { 4, 50, 81 },
- { 1, 17, 23 }
- }, { /* Coeff Band 2 */
- { 62, 147, 199 },
- { 44, 130, 189 },
- { 28, 102, 154 },
- { 18, 75, 115 },
- { 2, 44, 65 },
- { 1, 12, 19 }
- }, { /* Coeff Band 3 */
- { 55, 153, 210 },
- { 24, 130, 194 },
- { 3, 93, 146 },
- { 1, 61, 97 },
- { 1, 31, 50 },
- { 1, 10, 16 }
- }, { /* Coeff Band 4 */
- { 49, 186, 223 },
- { 17, 148, 204 },
- { 1, 96, 142 },
- { 1, 53, 83 },
- { 1, 26, 44 },
- { 1, 11, 17 }
- }, { /* Coeff Band 5 */
- { 13, 217, 212 },
- { 2, 136, 180 },
- { 1, 78, 124 },
- { 1, 50, 83 },
- { 1, 29, 49 },
- { 1, 14, 23 }
- }
- }, { /* Inter */
- { /* Coeff Band 0 */
- { 197, 13, 247 },
- { 82, 17, 222 },
- { 25, 17, 162 }
- }, { /* Coeff Band 1 */
- { 126, 186, 247 },
- { 234, 191, 243 },
- { 176, 177, 234 },
- { 104, 158, 220 },
- { 66, 128, 186 },
- { 55, 90, 137 }
- }, { /* Coeff Band 2 */
- { 111, 197, 242 },
- { 46, 158, 219 },
- { 9, 104, 171 },
- { 2, 65, 125 },
- { 1, 44, 80 },
- { 1, 17, 91 }
- }, { /* Coeff Band 3 */
- { 104, 208, 245 },
- { 39, 168, 224 },
- { 3, 109, 162 },
- { 1, 79, 124 },
- { 1, 50, 102 },
- { 1, 43, 102 }
- }, { /* Coeff Band 4 */
- { 84, 220, 246 },
- { 31, 177, 231 },
- { 2, 115, 180 },
- { 1, 79, 134 },
- { 1, 55, 77 },
- { 1, 60, 79 }
- }, { /* Coeff Band 5 */
- { 43, 243, 240 },
- { 8, 180, 217 },
- { 1, 115, 166 },
- { 1, 84, 121 },
- { 1, 51, 67 },
- { 1, 16, 6 }
- }
- }
- }
-};
-
-#endif // VP9_COMMON_DEFAULT_COEF_PROBS_H_
diff --git a/source/libvpx/vp9/common/vp9_entropy.c b/source/libvpx/vp9/common/vp9_entropy.c
index 2640ac7..adab33f 100644
--- a/source/libvpx/vp9/common/vp9_entropy.c
+++ b/source/libvpx/vp9/common/vp9_entropy.c
@@ -15,7 +15,6 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx/vpx_integer.h"
-#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
@@ -36,57 +35,86 @@ DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
-DECLARE_ALIGNED(16, const uint8_t,
- vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]) = {
+DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]) = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 5
+ 4, 4, 4, 4, 4, 5,
+ // beyond MAXBAND_INDEX+1 all values are filled as 5
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
};
-DECLARE_ALIGNED(16, const uint8_t,
- vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]) = {
+DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]) = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
- 5, 5, 5, 5, 5, 5
};
-DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
+DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]) = {
0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
};
-
-
-/* Array indices are identical to previously-existing CONTEXT_NODE indices */
-
-const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
- -DCT_EOB_TOKEN, 2, /* 0 = EOB */
- -ZERO_TOKEN, 4, /* 1 = ZERO */
- -ONE_TOKEN, 6, /* 2 = ONE */
- 8, 12, /* 3 = LOW_VAL */
- -TWO_TOKEN, 10, /* 4 = TWO */
- -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */
- 14, 16, /* 6 = HIGH_LOW */
- -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */
- 18, 20, /* 8 = CAT_THREEFOUR */
- -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */
- -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */
-};
-
-struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
-/* Trees for extra bits. Probabilities are constant and
- do not depend on previously encoded bits */
-
-static const vp9_prob Pcat1[] = { 159};
-static const vp9_prob Pcat2[] = { 165, 145};
-static const vp9_prob Pcat3[] = { 173, 148, 140};
-static const vp9_prob Pcat4[] = { 176, 155, 140, 135};
-static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130};
-static const vp9_prob Pcat6[] = {
- 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
-};
-
-const vp9_tree_index vp9_coefmodel_tree[6] = {
- -DCT_EOB_MODEL_TOKEN, 2, /* 0 = EOB */
- -ZERO_TOKEN, 4, /* 1 = ZERO */
+const vp9_tree_index vp9_coefmodel_tree[TREE_SIZE(UNCONSTRAINED_NODES + 1)] = {
+ -EOB_MODEL_TOKEN, 2,
+ -ZERO_TOKEN, 4,
-ONE_TOKEN, -TWO_TOKEN,
};
@@ -99,198 +127,617 @@ const vp9_tree_index vp9_coefmodel_tree[6] = {
// the probabilities for the rest of the nodes.
// beta = 8
-static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
+
+// Every odd line in this table can be generated from the even lines
+// by averaging :
+// vp9_pareto8_full[l][node] = (vp9_pareto8_full[l-1][node] +
+// vp9_pareto8_full[l+1][node] ) >> 1;
+const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES] = {
{ 3, 86, 128, 6, 86, 23, 88, 29},
+ { 6, 86, 128, 11, 87, 42, 91, 52},
{ 9, 86, 129, 17, 88, 61, 94, 76},
+ { 12, 86, 129, 22, 88, 77, 97, 93},
{ 15, 87, 129, 28, 89, 93, 100, 110},
+ { 17, 87, 129, 33, 90, 105, 103, 123},
{ 20, 88, 130, 38, 91, 118, 106, 136},
+ { 23, 88, 130, 43, 91, 128, 108, 146},
{ 26, 89, 131, 48, 92, 139, 111, 156},
+ { 28, 89, 131, 53, 93, 147, 114, 163},
{ 31, 90, 131, 58, 94, 156, 117, 171},
+ { 34, 90, 131, 62, 94, 163, 119, 177},
{ 37, 90, 132, 66, 95, 171, 122, 184},
+ { 39, 90, 132, 70, 96, 177, 124, 189},
{ 42, 91, 132, 75, 97, 183, 127, 194},
+ { 44, 91, 132, 79, 97, 188, 129, 198},
{ 47, 92, 133, 83, 98, 193, 132, 202},
+ { 49, 92, 133, 86, 99, 197, 134, 205},
{ 52, 93, 133, 90, 100, 201, 137, 208},
+ { 54, 93, 133, 94, 100, 204, 139, 211},
{ 57, 94, 134, 98, 101, 208, 142, 214},
+ { 59, 94, 134, 101, 102, 211, 144, 216},
{ 62, 94, 135, 105, 103, 214, 146, 218},
+ { 64, 94, 135, 108, 103, 216, 148, 220},
{ 66, 95, 135, 111, 104, 219, 151, 222},
+ { 68, 95, 135, 114, 105, 221, 153, 223},
{ 71, 96, 136, 117, 106, 224, 155, 225},
+ { 73, 96, 136, 120, 106, 225, 157, 226},
{ 76, 97, 136, 123, 107, 227, 159, 228},
+ { 78, 97, 136, 126, 108, 229, 160, 229},
{ 80, 98, 137, 129, 109, 231, 162, 231},
+ { 82, 98, 137, 131, 109, 232, 164, 232},
{ 84, 98, 138, 134, 110, 234, 166, 233},
+ { 86, 98, 138, 137, 111, 235, 168, 234},
{ 89, 99, 138, 140, 112, 236, 170, 235},
+ { 91, 99, 138, 142, 112, 237, 171, 235},
{ 93, 100, 139, 145, 113, 238, 173, 236},
+ { 95, 100, 139, 147, 114, 239, 174, 237},
{ 97, 101, 140, 149, 115, 240, 176, 238},
+ { 99, 101, 140, 151, 115, 241, 177, 238},
{101, 102, 140, 154, 116, 242, 179, 239},
+ {103, 102, 140, 156, 117, 242, 180, 239},
{105, 103, 141, 158, 118, 243, 182, 240},
+ {107, 103, 141, 160, 118, 243, 183, 240},
{109, 104, 141, 162, 119, 244, 185, 241},
+ {111, 104, 141, 164, 119, 244, 186, 241},
{113, 104, 142, 166, 120, 245, 187, 242},
+ {114, 104, 142, 168, 121, 245, 188, 242},
{116, 105, 143, 170, 122, 246, 190, 243},
+ {118, 105, 143, 171, 122, 246, 191, 243},
{120, 106, 143, 173, 123, 247, 192, 244},
+ {121, 106, 143, 175, 124, 247, 193, 244},
{123, 107, 144, 177, 125, 248, 195, 244},
+ {125, 107, 144, 178, 125, 248, 196, 244},
{127, 108, 145, 180, 126, 249, 197, 245},
+ {128, 108, 145, 181, 127, 249, 198, 245},
{130, 109, 145, 183, 128, 249, 199, 245},
+ {132, 109, 145, 184, 128, 249, 200, 245},
{134, 110, 146, 186, 129, 250, 201, 246},
+ {135, 110, 146, 187, 130, 250, 202, 246},
{137, 111, 147, 189, 131, 251, 203, 246},
+ {138, 111, 147, 190, 131, 251, 204, 246},
{140, 112, 147, 192, 132, 251, 205, 247},
+ {141, 112, 147, 193, 132, 251, 206, 247},
{143, 113, 148, 194, 133, 251, 207, 247},
+ {144, 113, 148, 195, 134, 251, 207, 247},
{146, 114, 149, 197, 135, 252, 208, 248},
+ {147, 114, 149, 198, 135, 252, 209, 248},
{149, 115, 149, 199, 136, 252, 210, 248},
+ {150, 115, 149, 200, 137, 252, 210, 248},
{152, 115, 150, 201, 138, 252, 211, 248},
+ {153, 115, 150, 202, 138, 252, 212, 248},
{155, 116, 151, 204, 139, 253, 213, 249},
+ {156, 116, 151, 205, 139, 253, 213, 249},
{158, 117, 151, 206, 140, 253, 214, 249},
+ {159, 117, 151, 207, 141, 253, 215, 249},
{161, 118, 152, 208, 142, 253, 216, 249},
+ {162, 118, 152, 209, 142, 253, 216, 249},
{163, 119, 153, 210, 143, 253, 217, 249},
+ {164, 119, 153, 211, 143, 253, 217, 249},
{166, 120, 153, 212, 144, 254, 218, 250},
+ {167, 120, 153, 212, 145, 254, 219, 250},
{168, 121, 154, 213, 146, 254, 220, 250},
+ {169, 121, 154, 214, 146, 254, 220, 250},
{171, 122, 155, 215, 147, 254, 221, 250},
+ {172, 122, 155, 216, 147, 254, 221, 250},
{173, 123, 155, 217, 148, 254, 222, 250},
+ {174, 123, 155, 217, 149, 254, 222, 250},
{176, 124, 156, 218, 150, 254, 223, 250},
+ {177, 124, 156, 219, 150, 254, 223, 250},
{178, 125, 157, 220, 151, 254, 224, 251},
+ {179, 125, 157, 220, 151, 254, 224, 251},
{180, 126, 157, 221, 152, 254, 225, 251},
+ {181, 126, 157, 221, 152, 254, 225, 251},
{183, 127, 158, 222, 153, 254, 226, 251},
+ {184, 127, 158, 223, 154, 254, 226, 251},
{185, 128, 159, 224, 155, 255, 227, 251},
+ {186, 128, 159, 224, 155, 255, 227, 251},
{187, 129, 160, 225, 156, 255, 228, 251},
+ {188, 130, 160, 225, 156, 255, 228, 251},
{189, 131, 160, 226, 157, 255, 228, 251},
+ {190, 131, 160, 226, 158, 255, 228, 251},
{191, 132, 161, 227, 159, 255, 229, 251},
+ {192, 132, 161, 227, 159, 255, 229, 251},
{193, 133, 162, 228, 160, 255, 230, 252},
+ {194, 133, 162, 229, 160, 255, 230, 252},
{195, 134, 163, 230, 161, 255, 231, 252},
+ {196, 134, 163, 230, 161, 255, 231, 252},
{197, 135, 163, 231, 162, 255, 231, 252},
+ {198, 135, 163, 231, 162, 255, 231, 252},
{199, 136, 164, 232, 163, 255, 232, 252},
+ {200, 136, 164, 232, 164, 255, 232, 252},
+ {201, 137, 165, 233, 165, 255, 233, 252},
{201, 137, 165, 233, 165, 255, 233, 252},
{202, 138, 166, 233, 166, 255, 233, 252},
+ {203, 138, 166, 233, 166, 255, 233, 252},
{204, 139, 166, 234, 167, 255, 234, 252},
+ {205, 139, 166, 234, 167, 255, 234, 252},
+ {206, 140, 167, 235, 168, 255, 235, 252},
{206, 140, 167, 235, 168, 255, 235, 252},
{207, 141, 168, 236, 169, 255, 235, 252},
+ {208, 141, 168, 236, 170, 255, 235, 252},
{209, 142, 169, 237, 171, 255, 236, 252},
+ {209, 143, 169, 237, 171, 255, 236, 252},
{210, 144, 169, 237, 172, 255, 236, 252},
+ {211, 144, 169, 237, 172, 255, 236, 252},
{212, 145, 170, 238, 173, 255, 237, 252},
+ {213, 145, 170, 238, 173, 255, 237, 252},
{214, 146, 171, 239, 174, 255, 237, 253},
+ {214, 146, 171, 239, 174, 255, 237, 253},
+ {215, 147, 172, 240, 175, 255, 238, 253},
{215, 147, 172, 240, 175, 255, 238, 253},
{216, 148, 173, 240, 176, 255, 238, 253},
+ {217, 148, 173, 240, 176, 255, 238, 253},
{218, 149, 173, 241, 177, 255, 239, 253},
+ {218, 149, 173, 241, 178, 255, 239, 253},
{219, 150, 174, 241, 179, 255, 239, 253},
+ {219, 151, 174, 241, 179, 255, 239, 253},
{220, 152, 175, 242, 180, 255, 240, 253},
+ {221, 152, 175, 242, 180, 255, 240, 253},
{222, 153, 176, 242, 181, 255, 240, 253},
+ {222, 153, 176, 242, 181, 255, 240, 253},
+ {223, 154, 177, 243, 182, 255, 240, 253},
{223, 154, 177, 243, 182, 255, 240, 253},
{224, 155, 178, 244, 183, 255, 241, 253},
+ {224, 155, 178, 244, 183, 255, 241, 253},
{225, 156, 178, 244, 184, 255, 241, 253},
+ {225, 157, 178, 244, 184, 255, 241, 253},
{226, 158, 179, 244, 185, 255, 242, 253},
+ {227, 158, 179, 244, 185, 255, 242, 253},
+ {228, 159, 180, 245, 186, 255, 242, 253},
{228, 159, 180, 245, 186, 255, 242, 253},
{229, 160, 181, 245, 187, 255, 242, 253},
+ {229, 160, 181, 245, 187, 255, 242, 253},
{230, 161, 182, 246, 188, 255, 243, 253},
+ {230, 162, 182, 246, 188, 255, 243, 253},
+ {231, 163, 183, 246, 189, 255, 243, 253},
{231, 163, 183, 246, 189, 255, 243, 253},
{232, 164, 184, 247, 190, 255, 243, 253},
+ {232, 164, 184, 247, 190, 255, 243, 253},
+ {233, 165, 185, 247, 191, 255, 244, 253},
{233, 165, 185, 247, 191, 255, 244, 253},
{234, 166, 185, 247, 192, 255, 244, 253},
+ {234, 167, 185, 247, 192, 255, 244, 253},
{235, 168, 186, 248, 193, 255, 244, 253},
+ {235, 168, 186, 248, 193, 255, 244, 253},
+ {236, 169, 187, 248, 194, 255, 244, 253},
{236, 169, 187, 248, 194, 255, 244, 253},
{236, 170, 188, 248, 195, 255, 245, 253},
+ {236, 170, 188, 248, 195, 255, 245, 253},
{237, 171, 189, 249, 196, 255, 245, 254},
+ {237, 172, 189, 249, 196, 255, 245, 254},
+ {238, 173, 190, 249, 197, 255, 245, 254},
{238, 173, 190, 249, 197, 255, 245, 254},
{239, 174, 191, 249, 198, 255, 245, 254},
+ {239, 174, 191, 249, 198, 255, 245, 254},
{240, 175, 192, 249, 199, 255, 246, 254},
+ {240, 176, 192, 249, 199, 255, 246, 254},
+ {240, 177, 193, 250, 200, 255, 246, 254},
{240, 177, 193, 250, 200, 255, 246, 254},
{241, 178, 194, 250, 201, 255, 246, 254},
+ {241, 178, 194, 250, 201, 255, 246, 254},
{242, 179, 195, 250, 202, 255, 246, 254},
+ {242, 180, 195, 250, 202, 255, 246, 254},
+ {242, 181, 196, 250, 203, 255, 247, 254},
{242, 181, 196, 250, 203, 255, 247, 254},
{243, 182, 197, 251, 204, 255, 247, 254},
+ {243, 183, 197, 251, 204, 255, 247, 254},
+ {244, 184, 198, 251, 205, 255, 247, 254},
{244, 184, 198, 251, 205, 255, 247, 254},
{244, 185, 199, 251, 206, 255, 247, 254},
+ {244, 185, 199, 251, 206, 255, 247, 254},
{245, 186, 200, 251, 207, 255, 247, 254},
+ {245, 187, 200, 251, 207, 255, 247, 254},
+ {246, 188, 201, 252, 207, 255, 248, 254},
{246, 188, 201, 252, 207, 255, 248, 254},
{246, 189, 202, 252, 208, 255, 248, 254},
+ {246, 190, 202, 252, 208, 255, 248, 254},
+ {247, 191, 203, 252, 209, 255, 248, 254},
{247, 191, 203, 252, 209, 255, 248, 254},
{247, 192, 204, 252, 210, 255, 248, 254},
+ {247, 193, 204, 252, 210, 255, 248, 254},
+ {248, 194, 205, 252, 211, 255, 248, 254},
{248, 194, 205, 252, 211, 255, 248, 254},
{248, 195, 206, 252, 212, 255, 249, 254},
+ {248, 196, 206, 252, 212, 255, 249, 254},
+ {249, 197, 207, 253, 213, 255, 249, 254},
{249, 197, 207, 253, 213, 255, 249, 254},
{249, 198, 208, 253, 214, 255, 249, 254},
+ {249, 199, 209, 253, 214, 255, 249, 254},
+ {250, 200, 210, 253, 215, 255, 249, 254},
{250, 200, 210, 253, 215, 255, 249, 254},
{250, 201, 211, 253, 215, 255, 249, 254},
+ {250, 202, 211, 253, 215, 255, 249, 254},
+ {250, 203, 212, 253, 216, 255, 249, 254},
{250, 203, 212, 253, 216, 255, 249, 254},
{251, 204, 213, 253, 217, 255, 250, 254},
+ {251, 205, 213, 253, 217, 255, 250, 254},
{251, 206, 214, 254, 218, 255, 250, 254},
+ {251, 206, 215, 254, 218, 255, 250, 254},
{252, 207, 216, 254, 219, 255, 250, 254},
+ {252, 208, 216, 254, 219, 255, 250, 254},
{252, 209, 217, 254, 220, 255, 250, 254},
+ {252, 210, 217, 254, 220, 255, 250, 254},
{252, 211, 218, 254, 221, 255, 250, 254},
+ {252, 212, 218, 254, 221, 255, 250, 254},
{253, 213, 219, 254, 222, 255, 250, 254},
+ {253, 213, 220, 254, 222, 255, 250, 254},
{253, 214, 221, 254, 223, 255, 250, 254},
+ {253, 215, 221, 254, 223, 255, 250, 254},
{253, 216, 222, 254, 224, 255, 251, 254},
+ {253, 217, 223, 254, 224, 255, 251, 254},
{253, 218, 224, 254, 225, 255, 251, 254},
+ {253, 219, 224, 254, 225, 255, 251, 254},
{254, 220, 225, 254, 225, 255, 251, 254},
+ {254, 221, 226, 254, 225, 255, 251, 254},
{254, 222, 227, 255, 226, 255, 251, 254},
+ {254, 223, 227, 255, 226, 255, 251, 254},
{254, 224, 228, 255, 227, 255, 251, 254},
+ {254, 225, 229, 255, 227, 255, 251, 254},
{254, 226, 230, 255, 228, 255, 251, 254},
+ {254, 227, 230, 255, 229, 255, 251, 254},
{255, 228, 231, 255, 230, 255, 251, 254},
+ {255, 229, 232, 255, 230, 255, 251, 254},
{255, 230, 233, 255, 231, 255, 252, 254},
+ {255, 231, 234, 255, 231, 255, 252, 254},
{255, 232, 235, 255, 232, 255, 252, 254},
+ {255, 233, 236, 255, 232, 255, 252, 254},
{255, 235, 237, 255, 233, 255, 252, 254},
+ {255, 236, 238, 255, 234, 255, 252, 254},
{255, 238, 240, 255, 235, 255, 252, 255},
+ {255, 239, 241, 255, 235, 255, 252, 254},
{255, 241, 243, 255, 236, 255, 252, 254},
- {255, 246, 247, 255, 239, 255, 253, 255}
+ {255, 243, 245, 255, 237, 255, 252, 254},
+ {255, 246, 247, 255, 239, 255, 253, 255},
+ {255, 246, 247, 255, 239, 255, 253, 255},
};
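
As a spot check of the averaging rule stated in the comment above the table, the second row { 6, 86, 128, 11, 87, 42, 91, 52 } is the element-wise average of its two neighbours, for example (3 + 9) >> 1 == 6 and (23 + 61) >> 1 == 42. The self-contained snippet below repeats that check with the first three rows copied verbatim; it is an illustration only and not part of the patch.

#include <assert.h>

int main(void) {
  static const int row0[8] = { 3, 86, 128,  6, 86, 23, 88, 29 };
  static const int row1[8] = { 6, 86, 128, 11, 87, 42, 91, 52 };
  static const int row2[8] = { 9, 86, 129, 17, 88, 61, 94, 76 };
  int n;
  for (n = 0; n < 8; ++n)
    assert(row1[n] == ((row0[n] + row2[n]) >> 1));
  return 0;
}
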
-static void extend_model_to_full_distribution(vp9_prob p,
- vp9_prob *tree_probs) {
- const int l = (p - 1) / 2;
- const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8;
- if (p & 1) {
- vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
- model[l], MODEL_NODES * sizeof(vp9_prob));
- } else {
- // interpolate
- int i;
- for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
- tree_probs[i] = (model[l][i - UNCONSTRAINED_NODES] +
- model[l + 1][i - UNCONSTRAINED_NODES]) >> 1;
+static const vp9_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 195, 29, 183 }, { 84, 49, 136 }, { 8, 42, 71 }
+ }, { // Band 1
+ { 31, 107, 169 }, { 35, 99, 159 }, { 17, 82, 140 },
+ { 8, 66, 114 }, { 2, 44, 76 }, { 1, 19, 32 }
+ }, { // Band 2
+ { 40, 132, 201 }, { 29, 114, 187 }, { 13, 91, 157 },
+ { 7, 75, 127 }, { 3, 58, 95 }, { 1, 28, 47 }
+ }, { // Band 3
+ { 69, 142, 221 }, { 42, 122, 201 }, { 15, 91, 159 },
+ { 6, 67, 121 }, { 1, 42, 77 }, { 1, 17, 31 }
+ }, { // Band 4
+ { 102, 148, 228 }, { 67, 117, 204 }, { 17, 82, 154 },
+ { 6, 59, 114 }, { 2, 39, 75 }, { 1, 15, 29 }
+ }, { // Band 5
+ { 156, 57, 233 }, { 119, 57, 212 }, { 58, 48, 163 },
+ { 29, 40, 124 }, { 12, 30, 81 }, { 3, 12, 31 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 191, 107, 226 }, { 124, 117, 204 }, { 25, 99, 155 }
+ }, { // Band 1
+ { 29, 148, 210 }, { 37, 126, 194 }, { 8, 93, 157 },
+ { 2, 68, 118 }, { 1, 39, 69 }, { 1, 17, 33 }
+ }, { // Band 2
+ { 41, 151, 213 }, { 27, 123, 193 }, { 3, 82, 144 },
+ { 1, 58, 105 }, { 1, 32, 60 }, { 1, 13, 26 }
+ }, { // Band 3
+ { 59, 159, 220 }, { 23, 126, 198 }, { 4, 88, 151 },
+ { 1, 66, 114 }, { 1, 38, 71 }, { 1, 18, 34 }
+ }, { // Band 4
+ { 114, 136, 232 }, { 51, 114, 207 }, { 11, 83, 155 },
+ { 3, 56, 105 }, { 1, 33, 65 }, { 1, 17, 34 }
+ }, { // Band 5
+ { 149, 65, 234 }, { 121, 57, 215 }, { 61, 49, 166 },
+ { 28, 36, 114 }, { 12, 25, 76 }, { 3, 16, 42 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 214, 49, 220 }, { 132, 63, 188 }, { 42, 65, 137 }
+ }, { // Band 1
+ { 85, 137, 221 }, { 104, 131, 216 }, { 49, 111, 192 },
+ { 21, 87, 155 }, { 2, 49, 87 }, { 1, 16, 28 }
+ }, { // Band 2
+ { 89, 163, 230 }, { 90, 137, 220 }, { 29, 100, 183 },
+ { 10, 70, 135 }, { 2, 42, 81 }, { 1, 17, 33 }
+ }, { // Band 3
+ { 108, 167, 237 }, { 55, 133, 222 }, { 15, 97, 179 },
+ { 4, 72, 135 }, { 1, 45, 85 }, { 1, 19, 38 }
+ }, { // Band 4
+ { 124, 146, 240 }, { 66, 124, 224 }, { 17, 88, 175 },
+ { 4, 58, 122 }, { 1, 36, 75 }, { 1, 18, 37 }
+ }, { // Band 5
+ { 141, 79, 241 }, { 126, 70, 227 }, { 66, 58, 182 },
+ { 30, 44, 136 }, { 12, 34, 96 }, { 2, 20, 47 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 229, 99, 249 }, { 143, 111, 235 }, { 46, 109, 192 }
+ }, { // Band 1
+ { 82, 158, 236 }, { 94, 146, 224 }, { 25, 117, 191 },
+ { 9, 87, 149 }, { 3, 56, 99 }, { 1, 33, 57 }
+ }, { // Band 2
+ { 83, 167, 237 }, { 68, 145, 222 }, { 10, 103, 177 },
+ { 2, 72, 131 }, { 1, 41, 79 }, { 1, 20, 39 }
+ }, { // Band 3
+ { 99, 167, 239 }, { 47, 141, 224 }, { 10, 104, 178 },
+ { 2, 73, 133 }, { 1, 44, 85 }, { 1, 22, 47 }
+ }, { // Band 4
+ { 127, 145, 243 }, { 71, 129, 228 }, { 17, 93, 177 },
+ { 3, 61, 124 }, { 1, 41, 84 }, { 1, 21, 52 }
+ }, { // Band 5
+ { 157, 78, 244 }, { 140, 72, 231 }, { 69, 58, 184 },
+ { 31, 44, 137 }, { 14, 38, 105 }, { 8, 23, 61 }
+ }
+ }
}
-}
-
-void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
- if (full != model)
- vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
- extend_model_to_full_distribution(model[PIVOT_NODE], full);
-}
+};
-static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
+static const vp9_coeff_probs_model default_coef_probs_8x8[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 125, 34, 187 }, { 52, 41, 133 }, { 6, 31, 56 }
+ }, { // Band 1
+ { 37, 109, 153 }, { 51, 102, 147 }, { 23, 87, 128 },
+ { 8, 67, 101 }, { 1, 41, 63 }, { 1, 19, 29 }
+ }, { // Band 2
+ { 31, 154, 185 }, { 17, 127, 175 }, { 6, 96, 145 },
+ { 2, 73, 114 }, { 1, 51, 82 }, { 1, 28, 45 }
+ }, { // Band 3
+ { 23, 163, 200 }, { 10, 131, 185 }, { 2, 93, 148 },
+ { 1, 67, 111 }, { 1, 41, 69 }, { 1, 14, 24 }
+ }, { // Band 4
+ { 29, 176, 217 }, { 12, 145, 201 }, { 3, 101, 156 },
+ { 1, 69, 111 }, { 1, 39, 63 }, { 1, 14, 23 }
+ }, { // Band 5
+ { 57, 192, 233 }, { 25, 154, 215 }, { 6, 109, 167 },
+ { 3, 78, 118 }, { 1, 48, 69 }, { 1, 21, 29 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 202, 105, 245 }, { 108, 106, 216 }, { 18, 90, 144 }
+ }, { // Band 1
+ { 33, 172, 219 }, { 64, 149, 206 }, { 14, 117, 177 },
+ { 5, 90, 141 }, { 2, 61, 95 }, { 1, 37, 57 }
+ }, { // Band 2
+ { 33, 179, 220 }, { 11, 140, 198 }, { 1, 89, 148 },
+ { 1, 60, 104 }, { 1, 33, 57 }, { 1, 12, 21 }
+ }, { // Band 3
+ { 30, 181, 221 }, { 8, 141, 198 }, { 1, 87, 145 },
+ { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 20 }
+ }, { // Band 4
+ { 32, 186, 224 }, { 7, 142, 198 }, { 1, 86, 143 },
+ { 1, 58, 100 }, { 1, 31, 55 }, { 1, 12, 22 }
+ }, { // Band 5
+ { 57, 192, 227 }, { 20, 143, 204 }, { 3, 96, 154 },
+ { 1, 68, 112 }, { 1, 42, 69 }, { 1, 19, 32 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 212, 35, 215 }, { 113, 47, 169 }, { 29, 48, 105 }
+ }, { // Band 1
+ { 74, 129, 203 }, { 106, 120, 203 }, { 49, 107, 178 },
+ { 19, 84, 144 }, { 4, 50, 84 }, { 1, 15, 25 }
+ }, { // Band 2
+ { 71, 172, 217 }, { 44, 141, 209 }, { 15, 102, 173 },
+ { 6, 76, 133 }, { 2, 51, 89 }, { 1, 24, 42 }
+ }, { // Band 3
+ { 64, 185, 231 }, { 31, 148, 216 }, { 8, 103, 175 },
+ { 3, 74, 131 }, { 1, 46, 81 }, { 1, 18, 30 }
+ }, { // Band 4
+ { 65, 196, 235 }, { 25, 157, 221 }, { 5, 105, 174 },
+ { 1, 67, 120 }, { 1, 38, 69 }, { 1, 15, 30 }
+ }, { // Band 5
+ { 65, 204, 238 }, { 30, 156, 224 }, { 7, 107, 177 },
+ { 2, 70, 124 }, { 1, 42, 73 }, { 1, 18, 34 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 225, 86, 251 }, { 144, 104, 235 }, { 42, 99, 181 }
+ }, { // Band 1
+ { 85, 175, 239 }, { 112, 165, 229 }, { 29, 136, 200 },
+ { 12, 103, 162 }, { 6, 77, 123 }, { 2, 53, 84 }
+ }, { // Band 2
+ { 75, 183, 239 }, { 30, 155, 221 }, { 3, 106, 171 },
+ { 1, 74, 128 }, { 1, 44, 76 }, { 1, 17, 28 }
+ }, { // Band 3
+ { 73, 185, 240 }, { 27, 159, 222 }, { 2, 107, 172 },
+ { 1, 75, 127 }, { 1, 42, 73 }, { 1, 17, 29 }
+ }, { // Band 4
+ { 62, 190, 238 }, { 21, 159, 222 }, { 2, 107, 172 },
+ { 1, 72, 122 }, { 1, 40, 71 }, { 1, 18, 32 }
+ }, { // Band 5
+ { 61, 199, 240 }, { 27, 161, 226 }, { 4, 113, 180 },
+ { 1, 76, 129 }, { 1, 46, 80 }, { 1, 23, 41 }
+ }
+ }
+ }
+};
-static void init_bit_tree(vp9_tree_index *p, int n) {
- int i = 0;
+static const vp9_coeff_probs_model default_coef_probs_16x16[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 7, 27, 153 }, { 5, 30, 95 }, { 1, 16, 30 }
+ }, { // Band 1
+ { 50, 75, 127 }, { 57, 75, 124 }, { 27, 67, 108 },
+ { 10, 54, 86 }, { 1, 33, 52 }, { 1, 12, 18 }
+ }, { // Band 2
+ { 43, 125, 151 }, { 26, 108, 148 }, { 7, 83, 122 },
+ { 2, 59, 89 }, { 1, 38, 60 }, { 1, 17, 27 }
+ }, { // Band 3
+ { 23, 144, 163 }, { 13, 112, 154 }, { 2, 75, 117 },
+ { 1, 50, 81 }, { 1, 31, 51 }, { 1, 14, 23 }
+ }, { // Band 4
+ { 18, 162, 185 }, { 6, 123, 171 }, { 1, 78, 125 },
+ { 1, 51, 86 }, { 1, 31, 54 }, { 1, 14, 23 }
+ }, { // Band 5
+ { 15, 199, 227 }, { 3, 150, 204 }, { 1, 91, 146 },
+ { 1, 55, 95 }, { 1, 30, 53 }, { 1, 11, 20 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 19, 55, 240 }, { 19, 59, 196 }, { 3, 52, 105 }
+ }, { // Band 1
+ { 41, 166, 207 }, { 104, 153, 199 }, { 31, 123, 181 },
+ { 14, 101, 152 }, { 5, 72, 106 }, { 1, 36, 52 }
+ }, { // Band 2
+ { 35, 176, 211 }, { 12, 131, 190 }, { 2, 88, 144 },
+ { 1, 60, 101 }, { 1, 36, 60 }, { 1, 16, 28 }
+ }, { // Band 3
+ { 28, 183, 213 }, { 8, 134, 191 }, { 1, 86, 142 },
+ { 1, 56, 96 }, { 1, 30, 53 }, { 1, 12, 20 }
+ }, { // Band 4
+ { 20, 190, 215 }, { 4, 135, 192 }, { 1, 84, 139 },
+ { 1, 53, 91 }, { 1, 28, 49 }, { 1, 11, 20 }
+ }, { // Band 5
+ { 13, 196, 216 }, { 2, 137, 192 }, { 1, 86, 143 },
+ { 1, 57, 99 }, { 1, 32, 56 }, { 1, 13, 24 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 211, 29, 217 }, { 96, 47, 156 }, { 22, 43, 87 }
+ }, { // Band 1
+ { 78, 120, 193 }, { 111, 116, 186 }, { 46, 102, 164 },
+ { 15, 80, 128 }, { 2, 49, 76 }, { 1, 18, 28 }
+ }, { // Band 2
+ { 71, 161, 203 }, { 42, 132, 192 }, { 10, 98, 150 },
+ { 3, 69, 109 }, { 1, 44, 70 }, { 1, 18, 29 }
+ }, { // Band 3
+ { 57, 186, 211 }, { 30, 140, 196 }, { 4, 93, 146 },
+ { 1, 62, 102 }, { 1, 38, 65 }, { 1, 16, 27 }
+ }, { // Band 4
+ { 47, 199, 217 }, { 14, 145, 196 }, { 1, 88, 142 },
+ { 1, 57, 98 }, { 1, 36, 62 }, { 1, 15, 26 }
+ }, { // Band 5
+ { 26, 219, 229 }, { 5, 155, 207 }, { 1, 94, 151 },
+ { 1, 60, 104 }, { 1, 36, 62 }, { 1, 16, 28 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 233, 29, 248 }, { 146, 47, 220 }, { 43, 52, 140 }
+ }, { // Band 1
+ { 100, 163, 232 }, { 179, 161, 222 }, { 63, 142, 204 },
+ { 37, 113, 174 }, { 26, 89, 137 }, { 18, 68, 97 }
+ }, { // Band 2
+ { 85, 181, 230 }, { 32, 146, 209 }, { 7, 100, 164 },
+ { 3, 71, 121 }, { 1, 45, 77 }, { 1, 18, 30 }
+ }, { // Band 3
+ { 65, 187, 230 }, { 20, 148, 207 }, { 2, 97, 159 },
+ { 1, 68, 116 }, { 1, 40, 70 }, { 1, 14, 29 }
+ }, { // Band 4
+ { 40, 194, 227 }, { 8, 147, 204 }, { 1, 94, 155 },
+ { 1, 65, 112 }, { 1, 39, 66 }, { 1, 14, 26 }
+ }, { // Band 5
+ { 16, 208, 228 }, { 3, 151, 207 }, { 1, 98, 160 },
+ { 1, 67, 117 }, { 1, 41, 74 }, { 1, 17, 31 }
+ }
+ }
+ }
+};
- while (++i < n) {
- p[0] = p[1] = i << 1;
- p += 2;
+static const vp9_coeff_probs_model default_coef_probs_32x32[PLANE_TYPES] = {
+ { // Y plane
+ { // Intra
+ { // Band 0
+ { 17, 38, 140 }, { 7, 34, 80 }, { 1, 17, 29 }
+ }, { // Band 1
+ { 37, 75, 128 }, { 41, 76, 128 }, { 26, 66, 116 },
+ { 12, 52, 94 }, { 2, 32, 55 }, { 1, 10, 16 }
+ }, { // Band 2
+ { 50, 127, 154 }, { 37, 109, 152 }, { 16, 82, 121 },
+ { 5, 59, 85 }, { 1, 35, 54 }, { 1, 13, 20 }
+ }, { // Band 3
+ { 40, 142, 167 }, { 17, 110, 157 }, { 2, 71, 112 },
+ { 1, 44, 72 }, { 1, 27, 45 }, { 1, 11, 17 }
+ }, { // Band 4
+ { 30, 175, 188 }, { 9, 124, 169 }, { 1, 74, 116 },
+ { 1, 48, 78 }, { 1, 30, 49 }, { 1, 11, 18 }
+ }, { // Band 5
+ { 10, 222, 223 }, { 2, 150, 194 }, { 1, 83, 128 },
+ { 1, 48, 79 }, { 1, 27, 45 }, { 1, 11, 17 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 36, 41, 235 }, { 29, 36, 193 }, { 10, 27, 111 }
+ }, { // Band 1
+ { 85, 165, 222 }, { 177, 162, 215 }, { 110, 135, 195 },
+ { 57, 113, 168 }, { 23, 83, 120 }, { 10, 49, 61 }
+ }, { // Band 2
+ { 85, 190, 223 }, { 36, 139, 200 }, { 5, 90, 146 },
+ { 1, 60, 103 }, { 1, 38, 65 }, { 1, 18, 30 }
+ }, { // Band 3
+ { 72, 202, 223 }, { 23, 141, 199 }, { 2, 86, 140 },
+ { 1, 56, 97 }, { 1, 36, 61 }, { 1, 16, 27 }
+ }, { // Band 4
+ { 55, 218, 225 }, { 13, 145, 200 }, { 1, 86, 141 },
+ { 1, 57, 99 }, { 1, 35, 61 }, { 1, 13, 22 }
+ }, { // Band 5
+ { 15, 235, 212 }, { 1, 132, 184 }, { 1, 84, 139 },
+ { 1, 57, 97 }, { 1, 34, 56 }, { 1, 14, 23 }
+ }
+ }
+ }, { // UV plane
+ { // Intra
+ { // Band 0
+ { 181, 21, 201 }, { 61, 37, 123 }, { 10, 38, 71 }
+ }, { // Band 1
+ { 47, 106, 172 }, { 95, 104, 173 }, { 42, 93, 159 },
+ { 18, 77, 131 }, { 4, 50, 81 }, { 1, 17, 23 }
+ }, { // Band 2
+ { 62, 147, 199 }, { 44, 130, 189 }, { 28, 102, 154 },
+ { 18, 75, 115 }, { 2, 44, 65 }, { 1, 12, 19 }
+ }, { // Band 3
+ { 55, 153, 210 }, { 24, 130, 194 }, { 3, 93, 146 },
+ { 1, 61, 97 }, { 1, 31, 50 }, { 1, 10, 16 }
+ }, { // Band 4
+ { 49, 186, 223 }, { 17, 148, 204 }, { 1, 96, 142 },
+ { 1, 53, 83 }, { 1, 26, 44 }, { 1, 11, 17 }
+ }, { // Band 5
+ { 13, 217, 212 }, { 2, 136, 180 }, { 1, 78, 124 },
+ { 1, 50, 83 }, { 1, 29, 49 }, { 1, 14, 23 }
+ }
+ }, { // Inter
+ { // Band 0
+ { 197, 13, 247 }, { 82, 17, 222 }, { 25, 17, 162 }
+ }, { // Band 1
+ { 126, 186, 247 }, { 234, 191, 243 }, { 176, 177, 234 },
+ { 104, 158, 220 }, { 66, 128, 186 }, { 55, 90, 137 }
+ }, { // Band 2
+ { 111, 197, 242 }, { 46, 158, 219 }, { 9, 104, 171 },
+ { 2, 65, 125 }, { 1, 44, 80 }, { 1, 17, 91 }
+ }, { // Band 3
+ { 104, 208, 245 }, { 39, 168, 224 }, { 3, 109, 162 },
+ { 1, 79, 124 }, { 1, 50, 102 }, { 1, 43, 102 }
+ }, { // Band 4
+ { 84, 220, 246 }, { 31, 177, 231 }, { 2, 115, 180 },
+ { 1, 79, 134 }, { 1, 55, 77 }, { 1, 60, 79 }
+ }, { // Band 5
+ { 43, 243, 240 }, { 8, 180, 217 }, { 1, 115, 166 },
+ { 1, 84, 121 }, { 1, 51, 67 }, { 1, 16, 6 }
+ }
+ }
}
+};
- p[0] = p[1] = 0;
+static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) {
+  vpx_memcpy(probs, vp9_pareto8_full[p == 0 ? 0 : p - 1],
+ MODEL_NODES * sizeof(vp9_prob));
}
-static void init_bit_trees() {
- init_bit_tree(cat1, 1);
- init_bit_tree(cat2, 2);
- init_bit_tree(cat3, 3);
- init_bit_tree(cat4, 4);
- init_bit_tree(cat5, 5);
- init_bit_tree(cat6, 14);
+void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
+ if (full != model)
+ vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+ extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
}
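
vp9_model_to_full_probs() expands the three stored node probabilities into the full ENTROPY_NODES-entry coefficient distribution: the first UNCONSTRAINED_NODES values are copied as-is and the remaining MODEL_NODES values are taken from the vp9_pareto8_full row selected by the pivot node. A usage sketch with made-up model values follows; the include path is only an assumption of the example.

/* Usage sketch, not part of the patch. */
#include "vp9/common/vp9_entropy.h"

void expand_example(void) {
  const vp9_prob model[UNCONSTRAINED_NODES] = { 128, 160, 96 };  /* made up */
  vp9_prob full[ENTROPY_NODES];

  vp9_model_to_full_probs(model, full);
  /* full[0..2] now equal model[0..2]; full[3..10] hold the pareto row
   * selected by the pivot probability model[PIVOT_NODE]. */
}
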
-const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
- { 0, 0, 0, 0},
- { 0, 0, 0, 1},
- { 0, 0, 0, 2},
- { 0, 0, 0, 3},
- { 0, 0, 0, 4},
- { cat1, Pcat1, 1, 5},
- { cat2, Pcat2, 2, 7},
- { cat3, Pcat3, 3, 11},
- { cat4, Pcat4, 4, 19},
- { cat5, Pcat5, 5, 35},
- { cat6, Pcat6, 14, 67},
- { 0, 0, 0, 0}
-};
-
-#include "vp9/common/vp9_default_coef_probs.h"
-
void vp9_default_coef_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.coef_probs[TX_4X4], default_coef_probs_4x4);
vp9_copy(cm->fc.coef_probs[TX_8X8], default_coef_probs_8x8);
@@ -298,13 +745,6 @@ void vp9_default_coef_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.coef_probs[TX_32X32], default_coef_probs_32x32);
}
-void vp9_coef_tree_initialize() {
- init_bit_trees();
- vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
-}
-
-// #define COEF_COUNT_TESTING
-
#define COEF_COUNT_SAT 24
#define COEF_MAX_UPDATE_FACTOR 112
#define COEF_COUNT_SAT_KEY 24
@@ -316,31 +756,30 @@ static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE tx_size,
unsigned int count_sat,
unsigned int update_factor) {
const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
-
- vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[tx_size];
- const vp9_coeff_probs_model *pre_coef_probs = pre_fc->coef_probs[tx_size];
- vp9_coeff_count_model *coef_counts = cm->counts.coef[tx_size];
- unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
+ vp9_coeff_probs_model *const probs = cm->fc.coef_probs[tx_size];
+ const vp9_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size];
+ vp9_coeff_count_model *counts = cm->counts.coef[tx_size];
+ unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
cm->counts.eob_branch[tx_size];
- int t, i, j, k, l;
- unsigned int branch_ct[UNCONSTRAINED_NODES][2];
- vp9_prob coef_probs[UNCONSTRAINED_NODES];
+ int i, j, k, l, m;
- for (i = 0; i < BLOCK_TYPES; ++i)
+ for (i = 0; i < PLANE_TYPES; ++i)
for (j = 0; j < REF_TYPES; ++j)
for (k = 0; k < COEF_BANDS; ++k)
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
- if (l >= 3 && k == 0)
- continue;
- vp9_tree_probs_from_distribution(vp9_coefmodel_tree, coef_probs,
- branch_ct, coef_counts[i][j][k][l],
- 0);
- branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0];
- coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
- for (t = 0; t < UNCONSTRAINED_NODES; ++t)
- dst_coef_probs[i][j][k][l][t] = merge_probs(
- pre_coef_probs[i][j][k][l][t], coef_probs[t],
- branch_ct[t], count_sat, update_factor);
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+ const int n0 = counts[i][j][k][l][ZERO_TOKEN];
+ const int n1 = counts[i][j][k][l][ONE_TOKEN];
+ const int n2 = counts[i][j][k][l][TWO_TOKEN];
+ const int neob = counts[i][j][k][l][EOB_MODEL_TOKEN];
+ const unsigned int branch_ct[UNCONSTRAINED_NODES][2] = {
+ { neob, eob_counts[i][j][k][l] - neob },
+ { n0, n1 + n2 },
+ { n1, n2 }
+ };
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ probs[i][j][k][l][m] = merge_probs(pre_probs[i][j][k][l][m],
+ branch_ct[m],
+ count_sat, update_factor);
}
}
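
With the coefficient model reduced to three binary nodes, the rewritten adapt_coef_probs() above no longer needs vp9_tree_probs_from_distribution(): node 0 splits end-of-block against the separately tracked eob_branch total, node 1 splits zero against non-zero, node 2 splits one against two-or-larger, and each resulting count pair is blended with the saved frame-context probability by merge_probs(). The self-contained snippet below shows the same construction with made-up counts; none of the numbers come from the patch.

/* Illustration only; the counts are invented. */
#include <stdio.h>

int main(void) {
  const unsigned int neob = 40, eob_total = 100;  /* EOB hits / branch visits */
  const unsigned int n0 = 30, n1 = 20, n2 = 10;   /* ZERO / ONE / TWO+ tokens */
  const unsigned int branch_ct[3][2] = {
    { neob, eob_total - neob },  /* node 0: EOB vs. more coefficients */
    { n0,   n1 + n2          },  /* node 1: ZERO vs. non-zero         */
    { n1,   n2               }   /* node 2: ONE vs. TWO or larger     */
  };
  int m;
  for (m = 0; m < 3; ++m)
    printf("node %d: %u vs %u\n", m, branch_ct[m][0], branch_ct[m][1]);
  return 0;
}
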
diff --git a/source/libvpx/vp9/common/vp9_entropy.h b/source/libvpx/vp9/common/vp9_entropy.h
index ec7d09a..65b679a 100644
--- a/source/libvpx/vp9/common/vp9_entropy.h
+++ b/source/libvpx/vp9/common/vp9_entropy.h
@@ -17,56 +17,48 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_scan.h"
#include "vp9/common/vp9_treecoder.h"
+#include "vp9/common/vp9_entropymode.h"
#define DIFF_UPDATE_PROB 252
-/* Coefficient token alphabet */
+// Coefficient token alphabet
+#define ZERO_TOKEN 0 // 0 Extra Bits 0+0
+#define ONE_TOKEN 1 // 1 Extra Bits 0+1
+#define TWO_TOKEN 2 // 2 Extra Bits 0+1
+#define THREE_TOKEN 3 // 3 Extra Bits 0+1
+#define FOUR_TOKEN 4 // 4 Extra Bits 0+1
+#define CATEGORY1_TOKEN 5 // 5-6 Extra Bits 1+1
+#define CATEGORY2_TOKEN 6 // 7-10 Extra Bits 2+1
+#define CATEGORY3_TOKEN 7 // 11-18 Extra Bits 3+1
+#define CATEGORY4_TOKEN 8 // 19-34 Extra Bits 4+1
+#define CATEGORY5_TOKEN 9 // 35-66 Extra Bits 5+1
+#define CATEGORY6_TOKEN 10 // 67+ Extra Bits 14+1
+#define EOB_TOKEN 11 // EOB Extra Bits 0+0
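
The ranges noted in the comments above amount to a simple magnitude-to-token mapping: values 0 through 4 each have their own token, and larger magnitudes fall into the category tokens, so a coefficient of 14, for instance, lies in the 11-18 range and is coded as CATEGORY3_TOKEN plus its extra bits. The helper below restates that mapping for illustration only; it is not part of the patch and assumes nothing beyond the token defines above.

/* Illustration only, derived from the ranges in the comments above. */
int token_for_magnitude(int v) {
  if (v <= FOUR_TOKEN) return v;        /* 0-4: one token per value */
  if (v <= 6)   return CATEGORY1_TOKEN; /* 5-6   */
  if (v <= 10)  return CATEGORY2_TOKEN; /* 7-10  */
  if (v <= 18)  return CATEGORY3_TOKEN; /* 11-18 */
  if (v <= 34)  return CATEGORY4_TOKEN; /* 19-34 */
  if (v <= 66)  return CATEGORY5_TOKEN; /* 35-66 */
  return CATEGORY6_TOKEN;               /* 67 and up */
}
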
-#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */
-#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */
-#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */
-#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */
-#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */
-#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */
-#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */
-#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */
-#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */
-#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */
-#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 14+1 */
-#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */
-#define MAX_ENTROPY_TOKENS 12
-#define ENTROPY_NODES 11
-#define EOSB_TOKEN 127 /* Not signalled, encoder only */
+#define ENTROPY_TOKENS 12
-#define INTER_MODE_CONTEXTS 7
+#define ENTROPY_NODES 11
-extern DECLARE_ALIGNED(16, const uint8_t,
- vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
+extern DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[ENTROPY_TOKENS]);
-extern const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)];
-
-#define DCT_EOB_MODEL_TOKEN 3 /* EOB Extra Bits 0+0 */
+#define EOB_MODEL_TOKEN 3
extern const vp9_tree_index vp9_coefmodel_tree[];
-extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS];
-
typedef struct {
- vp9_tree_index *tree;
+ const vp9_tree_index *tree;
const vp9_prob *prob;
int len;
int base_val;
} vp9_extra_bit;
// indexed by token value
-extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS];
+extern const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS];
#define MAX_PROB 255
#define DCT_MAX_VALUE 16384
/* Coefficients are predicted via a 3-dimensional probability table. */
-/* Outside dimension. 0 = Y with DC, 1 = UV */
-#define BLOCK_TYPES 2
#define REF_TYPES 2 // intra=0, inter=1
/* Middle dimension reflects the coefficient position within the transform. */
@@ -88,13 +80,14 @@ extern const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS];
coefficient band (and since zigzag positions 0, 1, and 2 are in
distinct bands). */
-#define PREV_COEF_CONTEXTS 6
+#define COEFF_CONTEXTS 6
+#define BAND_COEFF_CONTEXTS(band) ((band) == 0 ? 3 : COEFF_CONTEXTS)
// #define ENTROPY_STATS
-typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
- [MAX_ENTROPY_TOKENS];
-typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
+typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
+ [ENTROPY_TOKENS];
+typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS]
[ENTROPY_NODES][2];
#define SUBEXP_PARAM 4 /* Subexponential code parameter */
@@ -102,8 +95,6 @@ typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
struct VP9Common;
void vp9_default_coef_probs(struct VP9Common *cm);
-
-void vp9_coef_tree_initialize();
void vp9_adapt_coef_probs(struct VP9Common *cm);
static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
@@ -120,41 +111,41 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
// This is the index in the scan order beyond which all coefficients for
// 8x8 transform and above are in the top band.
-// For 4x4 blocks the index is less but to keep things common the lookup
-// table for 4x4 is padded out to this index.
+// This macro is currently unused but may be used by certain implementations
#define MAXBAND_INDEX 21
-extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1];
-extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1];
+extern const uint8_t vp9_coefband_trans_8x8plus[1024];
+extern const uint8_t vp9_coefband_trans_4x4[16];
-
-static int get_coef_band(const uint8_t * band_translate, int coef_index) {
- return (coef_index > MAXBAND_INDEX)
- ? (COEF_BANDS-1) : band_translate[coef_index];
+static const uint8_t *get_band_translate(TX_SIZE tx_size) {
+ return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
+ : vp9_coefband_trans_8x8plus;
}
// 128 lists of probabilities are stored for the following ONE node probs:
// 1, 3, 5, 7, ..., 253, 255
// In between probabilities are interpolated linearly
-#define COEFPROB_MODELS 128
+#define COEFF_PROB_MODELS 256
#define UNCONSTRAINED_NODES 3
#define PIVOT_NODE 2 // which node is pivot
+#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
+extern const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
+
typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS]
- [UNCONSTRAINED_NODES];
+ [COEFF_CONTEXTS][UNCONSTRAINED_NODES];
typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS]
- [PREV_COEF_CONTEXTS]
+ [COEFF_CONTEXTS]
[UNCONSTRAINED_NODES + 1];
void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
-static int get_entropy_context(TX_SIZE tx_size,
- ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
+static int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+ const ENTROPY_CONTEXT *l) {
ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
switch (tx_size) {
@@ -163,48 +154,35 @@ static int get_entropy_context(TX_SIZE tx_size,
left_ec = l[0] != 0;
break;
case TX_8X8:
- above_ec = !!*(uint16_t *)a;
- left_ec = !!*(uint16_t *)l;
+ above_ec = !!*(const uint16_t *)a;
+ left_ec = !!*(const uint16_t *)l;
break;
case TX_16X16:
- above_ec = !!*(uint32_t *)a;
- left_ec = !!*(uint32_t *)l;
+ above_ec = !!*(const uint32_t *)a;
+ left_ec = !!*(const uint32_t *)l;
break;
case TX_32X32:
- above_ec = !!*(uint64_t *)a;
- left_ec = !!*(uint64_t *)l;
+ above_ec = !!*(const uint64_t *)a;
+ left_ec = !!*(const uint64_t *)l;
break;
default:
- assert(!"Invalid transform size.");
+ assert(0 && "Invalid transform size.");
}
return combine_entropy_contexts(above_ec, left_ec);
}
-static const uint8_t *get_band_translate(TX_SIZE tx_size) {
- return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
- : vp9_coefband_trans_8x8plus;
-}
-
-static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
- PLANE_TYPE type, int block_idx,
- const int16_t **scan, const int16_t **scan_nb) {
- switch (tx_size) {
- case TX_4X4:
- get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb);
- break;
- case TX_8X8:
- get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb);
- break;
- case TX_16X16:
- get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb);
- break;
- case TX_32X32:
- *scan = vp9_default_scan_32x32;
- *scan_nb = vp9_default_scan_32x32_neighbors;
- break;
- default:
- assert(!"Invalid transform size.");
+static const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+ PLANE_TYPE type, int block_idx) {
+ const MODE_INFO *const mi = xd->mi_8x8[0];
+ const MB_MODE_INFO *const mbmi = &mi->mbmi;
+
+ if (is_inter_block(mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
+ return &vp9_default_scan_orders[tx_size];
+ } else {
+ const MB_PREDICTION_MODE mode =
+ mbmi->sb_type < BLOCK_8X8 ? mi->bmi[block_idx].as_mode : mbmi->mode;
+ return &vp9_scan_orders[tx_size][mode2txfm_map[mode]];
}
}
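
Note: in get_entropy_context() above, the casts to const uint16_t/uint32_t/
uint64_t read 2, 4 or 8 one-byte ENTROPY_CONTEXT entries at once and reduce
them to a single "any nonzero" flag for the 8x8, 16x16 and 32x32 transforms.
A portable sketch of the same reduction (assuming ENTROPY_CONTEXT is a
one-byte type, as in libvpx; any_nonzero() is illustrative, not a libvpx
function):

    #include <stdint.h>

    typedef int8_t ENTROPY_CONTEXT;

    static int any_nonzero(const ENTROPY_CONTEXT *ctx, int n) {
      int i, ec = 0;
      for (i = 0; i < n; ++i)
        ec |= ctx[i] != 0;
      return ec;  /* equals !!*(const uint16_t *)ctx for n == 2, and so on */
    }

    /* above_ec = any_nonzero(a, 1 << tx_size);
     * left_ec  = any_nonzero(l, 1 << tx_size);   with TX_4X4 == 0, ... */
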
diff --git a/source/libvpx/vp9/common/vp9_entropymode.c b/source/libvpx/vp9/common/vp9_entropymode.c
index 21c91d6..83281b2 100644
--- a/source/libvpx/vp9/common/vp9_entropymode.c
+++ b/source/libvpx/vp9/common/vp9_entropymode.c
@@ -161,51 +161,52 @@ static const vp9_prob default_if_uv_probs[INTRA_MODES][INTRA_MODES - 1] = {
{ 101, 21, 107, 181, 192, 103, 19, 67, 125 } // y = tm
};
-static const vp9_prob default_partition_probs[FRAME_TYPES][PARTITION_CONTEXTS]
+const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1] = {
+ // 8x8 -> 4x4
+ { 158, 97, 94 }, // a/l both not split
+ { 93, 24, 99 }, // a split, l not split
+ { 85, 119, 44 }, // l split, a not split
+ { 62, 59, 67 }, // a/l both split
+ // 16x16 -> 8x8
+ { 149, 53, 53 }, // a/l both not split
+ { 94, 20, 48 }, // a split, l not split
+ { 83, 53, 24 }, // l split, a not split
+ { 52, 18, 18 }, // a/l both split
+ // 32x32 -> 16x16
+ { 150, 40, 39 }, // a/l both not split
+ { 78, 12, 26 }, // a split, l not split
+ { 67, 33, 11 }, // l split, a not split
+ { 24, 7, 5 }, // a/l both split
+ // 64x64 -> 32x32
+ { 174, 35, 49 }, // a/l both not split
+ { 68, 11, 27 }, // a split, l not split
+ { 57, 15, 9 }, // l split, a not split
+ { 12, 3, 3 }, // a/l both split
+};
+
+static const vp9_prob default_partition_probs[PARTITION_CONTEXTS]
[PARTITION_TYPES - 1] = {
- { // frame_type = keyframe
- // 8x8 -> 4x4
- { 158, 97, 94 }, // a/l both not split
- { 93, 24, 99 }, // a split, l not split
- { 85, 119, 44 }, // l split, a not split
- { 62, 59, 67 }, // a/l both split
- // 16x16 -> 8x8
- { 149, 53, 53 }, // a/l both not split
- { 94, 20, 48 }, // a split, l not split
- { 83, 53, 24 }, // l split, a not split
- { 52, 18, 18 }, // a/l both split
- // 32x32 -> 16x16
- { 150, 40, 39 }, // a/l both not split
- { 78, 12, 26 }, // a split, l not split
- { 67, 33, 11 }, // l split, a not split
- { 24, 7, 5 }, // a/l both split
- // 64x64 -> 32x32
- { 174, 35, 49 }, // a/l both not split
- { 68, 11, 27 }, // a split, l not split
- { 57, 15, 9 }, // l split, a not split
- { 12, 3, 3 }, // a/l both split
- }, { // frame_type = interframe
- // 8x8 -> 4x4
- { 199, 122, 141 }, // a/l both not split
- { 147, 63, 159 }, // a split, l not split
- { 148, 133, 118 }, // l split, a not split
- { 121, 104, 114 }, // a/l both split
- // 16x16 -> 8x8
- { 174, 73, 87 }, // a/l both not split
- { 92, 41, 83 }, // a split, l not split
- { 82, 99, 50 }, // l split, a not split
- { 53, 39, 39 }, // a/l both split
- // 32x32 -> 16x16
- { 177, 58, 59 }, // a/l both not split
- { 68, 26, 63 }, // a split, l not split
- { 52, 79, 25 }, // l split, a not split
- { 17, 14, 12 }, // a/l both split
- // 64x64 -> 32x32
- { 222, 34, 30 }, // a/l both not split
- { 72, 16, 44 }, // a split, l not split
- { 58, 32, 12 }, // l split, a not split
- { 10, 7, 6 }, // a/l both split
- }
+ // 8x8 -> 4x4
+ { 199, 122, 141 }, // a/l both not split
+ { 147, 63, 159 }, // a split, l not split
+ { 148, 133, 118 }, // l split, a not split
+ { 121, 104, 114 }, // a/l both split
+ // 16x16 -> 8x8
+ { 174, 73, 87 }, // a/l both not split
+ { 92, 41, 83 }, // a split, l not split
+ { 82, 99, 50 }, // l split, a not split
+ { 53, 39, 39 }, // a/l both split
+ // 32x32 -> 16x16
+ { 177, 58, 59 }, // a/l both not split
+ { 68, 26, 63 }, // a split, l not split
+ { 52, 79, 25 }, // l split, a not split
+ { 17, 14, 12 }, // a/l both split
+ // 64x64 -> 32x32
+ { 222, 34, 30 }, // a/l both not split
+ { 72, 16, 44 }, // a split, l not split
+ { 58, 32, 12 }, // l split, a not split
+ { 10, 7, 6 }, // a/l both split
};
static const vp9_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
@@ -231,21 +232,18 @@ const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
-D63_PRED, 16, /* 7 = D63_NODE */
-D153_PRED, -D207_PRED /* 8 = D153_NODE */
};
-struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
- -ZEROMV, 2,
- -NEARESTMV, 4,
- -NEARMV, -NEWMV
+ -INTER_OFFSET(ZEROMV), 2,
+ -INTER_OFFSET(NEARESTMV), 4,
+ -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
};
-struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
-PARTITION_NONE, 2,
-PARTITION_HORZ, 4,
-PARTITION_VERT, -PARTITION_SPLIT
};
-struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
9, 102, 187, 225
@@ -328,6 +326,7 @@ void vp9_init_mbmode_probs(VP9_COMMON *cm) {
vp9_copy(cm->fc.single_ref_prob, default_single_ref_p);
cm->fc.tx_probs = default_tx_probs;
vp9_copy(cm->fc.mbskip_probs, default_mbskip_probs);
+ vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs);
}
const vp9_tree_index vp9_switchable_interp_tree
@@ -335,43 +334,19 @@ const vp9_tree_index vp9_switchable_interp_tree
-EIGHTTAP, 2,
-EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
};
-struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
-
-void vp9_entropy_mode_init() {
- vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree);
- vp9_tokens_from_tree(vp9_switchable_interp_encodings,
- vp9_switchable_interp_tree);
- vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree);
- vp9_tokens_from_tree_offset(vp9_inter_mode_encodings,
- vp9_inter_mode_tree, NEARESTMV);
-}
#define COUNT_SAT 20
#define MAX_UPDATE_FACTOR 128
-static int update_ct(vp9_prob pre_prob, vp9_prob prob,
- const unsigned int ct[2]) {
- return merge_probs(pre_prob, prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
+static int adapt_prob(vp9_prob pre_prob, const unsigned int ct[2]) {
+ return merge_probs(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
}
-static int update_ct2(vp9_prob pre_prob, const unsigned int ct[2]) {
- return merge_probs2(pre_prob, ct, COUNT_SAT, MAX_UPDATE_FACTOR);
-}
-
-static void update_mode_probs(int n_modes,
- const vp9_tree_index *tree,
- const unsigned int *cnt,
- const vp9_prob *pre_probs, vp9_prob *dst_probs,
- unsigned int tok0_offset) {
-#define MAX_PROBS 32
- vp9_prob probs[MAX_PROBS];
- unsigned int branch_ct[MAX_PROBS][2];
- int t;
-
- assert(n_modes - 1 < MAX_PROBS);
- vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset);
- for (t = 0; t < n_modes - 1; ++t)
- dst_probs[t] = update_ct(pre_probs[t], probs[t], branch_ct[t]);
+static void adapt_probs(const vp9_tree_index *tree,
+ const vp9_prob *pre_probs, const unsigned int *counts,
+ vp9_prob *probs) {
+ tree_merge_probs(tree, pre_probs, counts, COUNT_SAT, MAX_UPDATE_FACTOR,
+ probs);
}
void vp9_adapt_mode_probs(VP9_COMMON *cm) {
@@ -381,46 +356,39 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
const FRAME_COUNTS *counts = &cm->counts;
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
- fc->intra_inter_prob[i] = update_ct2(pre_fc->intra_inter_prob[i],
+ fc->intra_inter_prob[i] = adapt_prob(pre_fc->intra_inter_prob[i],
counts->intra_inter[i]);
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
- fc->comp_inter_prob[i] = update_ct2(pre_fc->comp_inter_prob[i],
+ fc->comp_inter_prob[i] = adapt_prob(pre_fc->comp_inter_prob[i],
counts->comp_inter[i]);
for (i = 0; i < REF_CONTEXTS; i++)
- fc->comp_ref_prob[i] = update_ct2(pre_fc->comp_ref_prob[i],
+ fc->comp_ref_prob[i] = adapt_prob(pre_fc->comp_ref_prob[i],
counts->comp_ref[i]);
for (i = 0; i < REF_CONTEXTS; i++)
for (j = 0; j < 2; j++)
- fc->single_ref_prob[i][j] = update_ct2(pre_fc->single_ref_prob[i][j],
+ fc->single_ref_prob[i][j] = adapt_prob(pre_fc->single_ref_prob[i][j],
counts->single_ref[i][j]);
for (i = 0; i < INTER_MODE_CONTEXTS; i++)
- update_mode_probs(INTER_MODES, vp9_inter_mode_tree,
- counts->inter_mode[i], pre_fc->inter_mode_probs[i],
- fc->inter_mode_probs[i], NEARESTMV);
+ adapt_probs(vp9_inter_mode_tree, pre_fc->inter_mode_probs[i],
+ counts->inter_mode[i], fc->inter_mode_probs[i]);
for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
- update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
- counts->y_mode[i], pre_fc->y_mode_prob[i],
- fc->y_mode_prob[i], 0);
+ adapt_probs(vp9_intra_mode_tree, pre_fc->y_mode_prob[i],
+ counts->y_mode[i], fc->y_mode_prob[i]);
for (i = 0; i < INTRA_MODES; ++i)
- update_mode_probs(INTRA_MODES, vp9_intra_mode_tree,
- counts->uv_mode[i], pre_fc->uv_mode_prob[i],
- fc->uv_mode_prob[i], 0);
+ adapt_probs(vp9_intra_mode_tree, pre_fc->uv_mode_prob[i],
+ counts->uv_mode[i], fc->uv_mode_prob[i]);
for (i = 0; i < PARTITION_CONTEXTS; i++)
- update_mode_probs(PARTITION_TYPES, vp9_partition_tree,
- counts->partition[i],
- pre_fc->partition_prob[INTER_FRAME][i],
- fc->partition_prob[INTER_FRAME][i], 0);
+ adapt_probs(vp9_partition_tree, pre_fc->partition_prob[i],
+ counts->partition[i], fc->partition_prob[i]);
if (cm->mcomp_filter_type == SWITCHABLE) {
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
- update_mode_probs(SWITCHABLE_FILTERS, vp9_switchable_interp_tree,
- counts->switchable_interp[i],
- pre_fc->switchable_interp_prob[i],
- fc->switchable_interp_prob[i], 0);
+ adapt_probs(vp9_switchable_interp_tree, pre_fc->switchable_interp_prob[i],
+ counts->switchable_interp[i], fc->switchable_interp_prob[i]);
}
if (cm->tx_mode == TX_MODE_SELECT) {
@@ -432,23 +400,23 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) {
for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
for (j = 0; j < TX_SIZES - 3; ++j)
- fc->tx_probs.p8x8[i][j] = update_ct2(pre_fc->tx_probs.p8x8[i][j],
+ fc->tx_probs.p8x8[i][j] = adapt_prob(pre_fc->tx_probs.p8x8[i][j],
branch_ct_8x8p[j]);
tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
for (j = 0; j < TX_SIZES - 2; ++j)
- fc->tx_probs.p16x16[i][j] = update_ct2(pre_fc->tx_probs.p16x16[i][j],
+ fc->tx_probs.p16x16[i][j] = adapt_prob(pre_fc->tx_probs.p16x16[i][j],
branch_ct_16x16p[j]);
tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
for (j = 0; j < TX_SIZES - 1; ++j)
- fc->tx_probs.p32x32[i][j] = update_ct2(pre_fc->tx_probs.p32x32[i][j],
+ fc->tx_probs.p32x32[i][j] = adapt_prob(pre_fc->tx_probs.p32x32[i][j],
branch_ct_32x32p[j]);
}
}
for (i = 0; i < MBSKIP_CONTEXTS; ++i)
- fc->mbskip_probs[i] = update_ct2(pre_fc->mbskip_probs[i],
+ fc->mbskip_probs[i] = adapt_prob(pre_fc->mbskip_probs[i],
counts->mbskip[i]);
}
@@ -487,12 +455,11 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
vp9_default_coef_probs(cm);
vp9_init_mbmode_probs(cm);
vp9_init_mv_probs(cm);
- vp9_copy(cm->fc.inter_mode_probs, default_inter_mode_probs);
if (cm->frame_type == KEY_FRAME ||
cm->error_resilient_mode || cm->reset_frame_context == 3) {
// Reset all frame contexts.
- for (i = 0; i < NUM_FRAME_CONTEXTS; ++i)
+ for (i = 0; i < FRAME_CONTEXTS; ++i)
cm->frame_contexts[i] = cm->fc;
} else if (cm->reset_frame_context == 2) {
// Reset only the frame context specified in the frame header.
@@ -504,9 +471,6 @@ void vp9_setup_past_independence(VP9_COMMON *cm) {
vpx_memset(cm->mip, 0,
cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO));
- vp9_update_mode_info_border(cm, cm->mip);
- vp9_update_mode_info_border(cm, cm->prev_mip);
-
vp9_zero(cm->ref_frame_sign_bias);
cm->frame_context_idx = 0;
diff --git a/source/libvpx/vp9/common/vp9_entropymode.h b/source/libvpx/vp9/common/vp9_entropymode.h
index ea96555..df58bea 100644
--- a/source/libvpx/vp9/common/vp9_entropymode.h
+++ b/source/libvpx/vp9/common/vp9_entropymode.h
@@ -37,21 +37,13 @@ struct tx_counts {
extern const vp9_prob vp9_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
extern const vp9_prob vp9_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
[INTRA_MODES - 1];
-
+extern const vp9_prob vp9_kf_partition_probs[PARTITION_CONTEXTS]
+ [PARTITION_TYPES - 1];
extern const vp9_tree_index vp9_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
-extern struct vp9_token vp9_intra_mode_encodings[INTRA_MODES];
-
extern const vp9_tree_index vp9_inter_mode_tree[TREE_SIZE(INTER_MODES)];
-extern struct vp9_token vp9_inter_mode_encodings[INTER_MODES];
-
extern const vp9_tree_index vp9_partition_tree[TREE_SIZE(PARTITION_TYPES)];
-extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES];
-
extern const vp9_tree_index vp9_switchable_interp_tree
[TREE_SIZE(SWITCHABLE_FILTERS)];
-extern struct vp9_token vp9_switchable_interp_encodings[SWITCHABLE_FILTERS];
-
-void vp9_entropy_mode_init();
void vp9_setup_past_independence(struct VP9Common *cm);
diff --git a/source/libvpx/vp9/common/vp9_entropymv.c b/source/libvpx/vp9/common/vp9_entropymv.c
index f70b571..60ae79f 100644
--- a/source/libvpx/vp9/common/vp9_entropymv.c
+++ b/source/libvpx/vp9/common/vp9_entropymv.c
@@ -23,7 +23,6 @@ const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)] = {
-MV_JOINT_HNZVZ, 4,
-MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ
};
-struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
-MV_CLASS_0, 2,
@@ -37,19 +36,16 @@ const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
-MV_CLASS_7, -MV_CLASS_8,
-MV_CLASS_9, -MV_CLASS_10,
};
-struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
-0, -1,
};
-struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
-const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)] = {
+const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = {
-0, 2,
-1, 4,
-2, -3
};
-struct vp9_token vp9_mv_fp_encodings[4];
static const nmv_context default_nmv_context = {
{32, 64, 96},
@@ -191,71 +187,50 @@ void vp9_inc_mv(const MV *mv, nmv_context_counts *counts) {
}
static vp9_prob adapt_prob(vp9_prob prep, const unsigned int ct[2]) {
- return merge_probs2(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
+ return merge_probs(prep, ct, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR);
}
-static unsigned int adapt_probs(unsigned int i,
- vp9_tree tree,
- vp9_prob this_probs[],
- const vp9_prob last_probs[],
- const unsigned int num_events[]) {
- const unsigned int left = tree[i] <= 0
- ? num_events[-tree[i]]
- : adapt_probs(tree[i], tree, this_probs, last_probs, num_events);
-
- const unsigned int right = tree[i + 1] <= 0
- ? num_events[-tree[i + 1]]
- : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events);
- const unsigned int ct[2] = { left, right };
- this_probs[i >> 1] = adapt_prob(last_probs[i >> 1], ct);
- return left + right;
+static void adapt_probs(const vp9_tree_index *tree, const vp9_prob *pre_probs,
+ const unsigned int *counts, vp9_prob *probs) {
+ tree_merge_probs(tree, pre_probs, counts, MV_COUNT_SAT, MV_MAX_UPDATE_FACTOR,
+ probs);
}
-
void vp9_adapt_mv_probs(VP9_COMMON *cm, int allow_hp) {
int i, j;
- const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
-
- nmv_context *ctx = &cm->fc.nmvc;
- const nmv_context *pre_ctx = &pre_fc->nmvc;
- const nmv_context_counts *cts = &cm->counts.mv;
+ nmv_context *fc = &cm->fc.nmvc;
+ const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
+ const nmv_context_counts *counts = &cm->counts.mv;
- adapt_probs(0, vp9_mv_joint_tree, ctx->joints, pre_ctx->joints, cts->joints);
+ adapt_probs(vp9_mv_joint_tree, pre_fc->joints, counts->joints, fc->joints);
for (i = 0; i < 2; ++i) {
- ctx->comps[i].sign = adapt_prob(pre_ctx->comps[i].sign, cts->comps[i].sign);
- adapt_probs(0, vp9_mv_class_tree, ctx->comps[i].classes,
- pre_ctx->comps[i].classes, cts->comps[i].classes);
- adapt_probs(0, vp9_mv_class0_tree, ctx->comps[i].class0,
- pre_ctx->comps[i].class0, cts->comps[i].class0);
+ nmv_component *comp = &fc->comps[i];
+ const nmv_component *pre_comp = &pre_fc->comps[i];
+ const nmv_component_counts *c = &counts->comps[i];
+
+ comp->sign = adapt_prob(pre_comp->sign, c->sign);
+ adapt_probs(vp9_mv_class_tree, pre_comp->classes, c->classes,
+ comp->classes);
+ adapt_probs(vp9_mv_class0_tree, pre_comp->class0, c->class0, comp->class0);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- ctx->comps[i].bits[j] = adapt_prob(pre_ctx->comps[i].bits[j],
- cts->comps[i].bits[j]);
+ comp->bits[j] = adapt_prob(pre_comp->bits[j], c->bits[j]);
for (j = 0; j < CLASS0_SIZE; ++j)
- adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].class0_fp[j],
- pre_ctx->comps[i].class0_fp[j], cts->comps[i].class0_fp[j]);
+ adapt_probs(vp9_mv_fp_tree, pre_comp->class0_fp[j], c->class0_fp[j],
+ comp->class0_fp[j]);
- adapt_probs(0, vp9_mv_fp_tree, ctx->comps[i].fp, pre_ctx->comps[i].fp,
- cts->comps[i].fp);
+ adapt_probs(vp9_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
if (allow_hp) {
- ctx->comps[i].class0_hp = adapt_prob(pre_ctx->comps[i].class0_hp,
- cts->comps[i].class0_hp);
- ctx->comps[i].hp = adapt_prob(pre_ctx->comps[i].hp, cts->comps[i].hp);
+ comp->class0_hp = adapt_prob(pre_comp->class0_hp, c->class0_hp);
+ comp->hp = adapt_prob(pre_comp->hp, c->hp);
}
}
}
-void vp9_entropy_mv_init() {
- vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree);
- vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree);
- vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree);
- vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree);
-}
-
void vp9_init_mv_probs(VP9_COMMON *cm) {
cm->fc.nmvc = default_nmv_context;
}
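
Note: the recursive adapt_probs() deleted above walked a vp9_tree_index tree,
summed leaf counts into per-node branch counts, and merged each node with the
previous frame's probability.  The new tree_merge_probs() call is expected to
do the same generically; the sketch below mirrors the deleted code and is not
the libvpx helper itself.

    #include <stdint.h>

    typedef int16_t vp9_tree_index;

    /* Returns the number of events under node i and fills branch_ct[i >> 1]
     * with the {left, right} split, as the removed adapt_probs() did.  Each
     * entry can then be merged with MV_COUNT_SAT / MV_MAX_UPDATE_FACTOR as in
     * adapt_prob() above. */
    static unsigned int accumulate_branch_counts(const vp9_tree_index *tree,
                                                 int i,
                                                 const unsigned int *leaf_counts,
                                                 unsigned int (*branch_ct)[2]) {
      const vp9_tree_index l = tree[i];
      const vp9_tree_index r = tree[i + 1];
      const unsigned int left =
          l <= 0 ? leaf_counts[-l]
                 : accumulate_branch_counts(tree, l, leaf_counts, branch_ct);
      const unsigned int right =
          r <= 0 ? leaf_counts[-r]
                 : accumulate_branch_counts(tree, r, leaf_counts, branch_ct);
      branch_ct[i >> 1][0] = left;
      branch_ct[i >> 1][1] = right;
      return left + right;
    }
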
diff --git a/source/libvpx/vp9/common/vp9_entropymv.h b/source/libvpx/vp9/common/vp9_entropymv.h
index d843f5b..3175a1e 100644
--- a/source/libvpx/vp9/common/vp9_entropymv.h
+++ b/source/libvpx/vp9/common/vp9_entropymv.h
@@ -18,7 +18,6 @@
struct VP9Common;
-void vp9_entropy_mv_init();
void vp9_init_mv_probs(struct VP9Common *cm);
void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
@@ -62,6 +61,7 @@ typedef enum {
#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
#define CLASS0_SIZE (1 << CLASS0_BITS)
#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
+#define MV_FP_SIZE 4
#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2)
#define MV_MAX ((1 << MV_MAX_BITS) - 1)
@@ -71,25 +71,18 @@ typedef enum {
#define MV_UPP ((1 << MV_IN_USE_BITS) - 1)
#define MV_LOW (-(1 << MV_IN_USE_BITS))
-extern const vp9_tree_index vp9_mv_joint_tree[TREE_SIZE(MV_JOINTS)];
-extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS];
-
-extern const vp9_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)];
-extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES];
-
-extern const vp9_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)];
-extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE];
-
-extern const vp9_tree_index vp9_mv_fp_tree[TREE_SIZE(4)];
-extern struct vp9_token vp9_mv_fp_encodings[4];
+extern const vp9_tree_index vp9_mv_joint_tree[];
+extern const vp9_tree_index vp9_mv_class_tree[];
+extern const vp9_tree_index vp9_mv_class0_tree[];
+extern const vp9_tree_index vp9_mv_fp_tree[];
typedef struct {
vp9_prob sign;
vp9_prob classes[MV_CLASSES - 1];
vp9_prob class0[CLASS0_SIZE - 1];
vp9_prob bits[MV_OFFSET_BITS];
- vp9_prob class0_fp[CLASS0_SIZE][4 - 1];
- vp9_prob fp[4 - 1];
+ vp9_prob class0_fp[CLASS0_SIZE][MV_FP_SIZE - 1];
+ vp9_prob fp[MV_FP_SIZE - 1];
vp9_prob class0_hp;
vp9_prob hp;
} nmv_component;
@@ -116,8 +109,8 @@ typedef struct {
unsigned int classes[MV_CLASSES];
unsigned int class0[CLASS0_SIZE];
unsigned int bits[MV_OFFSET_BITS][2];
- unsigned int class0_fp[CLASS0_SIZE][4];
- unsigned int fp[4];
+ unsigned int class0_fp[CLASS0_SIZE][MV_FP_SIZE];
+ unsigned int fp[MV_FP_SIZE];
unsigned int class0_hp[2];
unsigned int hp[2];
} nmv_component_counts;
diff --git a/source/libvpx/vp9/common/vp9_enums.h b/source/libvpx/vp9/common/vp9_enums.h
index 1651b90..34411a3 100644
--- a/source/libvpx/vp9/common/vp9_enums.h
+++ b/source/libvpx/vp9/common/vp9_enums.h
@@ -52,20 +52,22 @@ typedef enum PARTITION_TYPE {
#define PARTITION_PLOFFSET 4 // number of probability models per block size
#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+// block transform size
typedef enum {
- TX_4X4 = 0, // 4x4 dct transform
- TX_8X8 = 1, // 8x8 dct transform
- TX_16X16 = 2, // 16x16 dct transform
- TX_32X32 = 3, // 32x32 dct transform
+ TX_4X4 = 0, // 4x4 transform
+ TX_8X8 = 1, // 8x8 transform
+ TX_16X16 = 2, // 16x16 transform
+ TX_32X32 = 3, // 32x32 transform
TX_SIZES
} TX_SIZE;
+// frame transform mode
typedef enum {
- ONLY_4X4 = 0,
- ALLOW_8X8 = 1,
- ALLOW_16X16 = 2,
- ALLOW_32X32 = 3,
- TX_MODE_SELECT = 4,
+ ONLY_4X4 = 0, // only 4x4 transform used
+ ALLOW_8X8 = 1, // allow block transform size up to 8x8
+ ALLOW_16X16 = 2, // allow block transform size up to 16x16
+ ALLOW_32X32 = 3, // allow block transform size up to 32x32
+ TX_MODE_SELECT = 4, // transform specified for each block
TX_MODES = 5,
} TX_MODE;
@@ -73,7 +75,8 @@ typedef enum {
DCT_DCT = 0, // DCT in both horizontal and vertical
ADST_DCT = 1, // ADST in vertical, DCT in horizontal
DCT_ADST = 2, // DCT in vertical, ADST in horizontal
- ADST_ADST = 3 // ADST in both directions
+ ADST_ADST = 3, // ADST in both directions
+ TX_TYPES = 4
} TX_TYPE;
typedef enum {
diff --git a/source/libvpx/vp9/common/vp9_filter.c b/source/libvpx/vp9/common/vp9_filter.c
index 8f24052..79ace14 100644
--- a/source/libvpx/vp9/common/vp9_filter.c
+++ b/source/libvpx/vp9/common/vp9_filter.c
@@ -97,19 +97,15 @@ DECLARE_ALIGNED(256, const subpel_kernel,
{ 0, -3, 1, 38, 64, 32, -1, -3}
};
+
+static const subpel_kernel* vp9_filter_kernels[4] = {
+ vp9_sub_pel_filters_8,
+ vp9_sub_pel_filters_8lp,
+ vp9_sub_pel_filters_8s,
+ vp9_bilinear_filters
+};
+
const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type) {
- switch (type) {
- case EIGHTTAP:
- return vp9_sub_pel_filters_8;
- case EIGHTTAP_SMOOTH:
- return vp9_sub_pel_filters_8lp;
- case EIGHTTAP_SHARP:
- return vp9_sub_pel_filters_8s;
- case BILINEAR:
- return vp9_bilinear_filters;
- default:
- assert(!"Invalid interpolation type.");
- return NULL;
- }
+ return vp9_filter_kernels[type];
}
diff --git a/source/libvpx/vp9/common/vp9_filter.h b/source/libvpx/vp9/common/vp9_filter.h
index 8652a6e..b1e7e64 100644
--- a/source/libvpx/vp9/common/vp9_filter.h
+++ b/source/libvpx/vp9/common/vp9_filter.h
@@ -39,7 +39,6 @@ struct subpix_fn_table {
const subpel_kernel *vp9_get_filter_kernel(INTERPOLATION_TYPE type);
extern const subpel_kernel vp9_bilinear_filters[SUBPEL_SHIFTS];
-extern const subpel_kernel vp9_sub_pel_filters_6[SUBPEL_SHIFTS];
extern const subpel_kernel vp9_sub_pel_filters_8[SUBPEL_SHIFTS];
extern const subpel_kernel vp9_sub_pel_filters_8s[SUBPEL_SHIFTS];
extern const subpel_kernel vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS];
diff --git a/source/libvpx/vp9/common/vp9_findnearmv.c b/source/libvpx/vp9/common/vp9_findnearmv.c
index b91c501..7cdf2c1 100644
--- a/source/libvpx/vp9/common/vp9_findnearmv.c
+++ b/source/libvpx/vp9/common/vp9_findnearmv.c
@@ -36,50 +36,49 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd, int allow_hp,
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
- int_mv *dst_nearest,
- int_mv *dst_near,
- int block_idx, int ref_idx,
- int mi_row, int mi_col) {
- int_mv dst_list[MAX_MV_REF_CANDIDATES];
+ int block, int ref, int mi_row, int mi_col,
+ int_mv *nearest, int_mv *near) {
int_mv mv_list[MAX_MV_REF_CANDIDATES];
MODE_INFO *const mi = xd->mi_8x8[0];
+ b_mode_info *bmi = mi->bmi;
+ int n;
- assert(ref_idx == 0 || ref_idx == 1);
- assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier
+ assert(MAX_MV_REF_CANDIDATES == 2);
- vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi,
- mi->mbmi.ref_frame[ref_idx],
- mv_list, block_idx, mi_row, mi_col);
+ vp9_find_mv_refs_idx(cm, xd, tile, mi, xd->last_mi, mi->mbmi.ref_frame[ref],
+ mv_list, block, mi_row, mi_col);
- dst_list[1].as_int = 0;
- if (block_idx == 0) {
- vpx_memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv));
- } else if (block_idx == 1 || block_idx == 2) {
- int dst = 0, n;
- b_mode_info *bmi = mi->bmi;
+ near->as_int = 0;
+ switch (block) {
+ case 0:
+ nearest->as_int = mv_list[0].as_int;
+ near->as_int = mv_list[1].as_int;
+ break;
+ case 1:
+ case 2:
+ nearest->as_int = bmi[0].as_mv[ref].as_int;
+ for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n)
+ if (nearest->as_int != mv_list[n].as_int) {
+ near->as_int = mv_list[n].as_int;
+ break;
+ }
+ break;
+ case 3: {
+ int_mv candidates[2 + MAX_MV_REF_CANDIDATES];
+ candidates[0] = bmi[1].as_mv[ref];
+ candidates[1] = bmi[0].as_mv[ref];
+ candidates[2] = mv_list[0];
+ candidates[3] = mv_list[1];
- dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int;
- for (n = 0; dst < MAX_MV_REF_CANDIDATES &&
- n < MAX_MV_REF_CANDIDATES; n++)
- if (mv_list[n].as_int != dst_list[0].as_int)
- dst_list[dst++].as_int = mv_list[n].as_int;
- } else {
- int dst = 0, n;
- b_mode_info *bmi = mi->bmi;
-
- assert(block_idx == 3);
- dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int;
- if (dst_list[0].as_int != bmi[1].as_mv[ref_idx].as_int)
- dst_list[dst++].as_int = bmi[1].as_mv[ref_idx].as_int;
- if (dst < MAX_MV_REF_CANDIDATES &&
- dst_list[0].as_int != bmi[0].as_mv[ref_idx].as_int)
- dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int;
- for (n = 0; dst < MAX_MV_REF_CANDIDATES &&
- n < MAX_MV_REF_CANDIDATES; n++)
- if (mv_list[n].as_int != dst_list[0].as_int)
- dst_list[dst++].as_int = mv_list[n].as_int;
+ nearest->as_int = bmi[2].as_mv[ref].as_int;
+ for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n)
+ if (nearest->as_int != candidates[n].as_int) {
+ near->as_int = candidates[n].as_int;
+ break;
+ }
+ break;
+ }
+ default:
+ assert("Invalid block index.");
}
-
- dst_nearest->as_int = dst_list[0].as_int;
- dst_near->as_int = dst_list[1].as_int;
}
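
Note: for sub-8x8 blocks 1-3 the rewritten function takes the already-coded
sub-block MV as "nearest" and scans an ordered candidate list for the first MV
that differs from it to use as "near" (near stays 0 if none differs).  A
minimal sketch of that selection step, with int_mv reduced to a plain 32-bit
value for illustration:

    #include <stdint.h>

    /* First candidate differing from `nearest`, or 0 when none differs. */
    static uint32_t pick_near_mv(uint32_t nearest, const uint32_t *candidates,
                                 int num_candidates) {
      int n;
      for (n = 0; n < num_candidates; ++n)
        if (candidates[n] != nearest)
          return candidates[n];
      return 0;
    }
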
diff --git a/source/libvpx/vp9/common/vp9_findnearmv.h b/source/libvpx/vp9/common/vp9_findnearmv.h
index 2362caa..5028af7 100644
--- a/source/libvpx/vp9/common/vp9_findnearmv.h
+++ b/source/libvpx/vp9/common/vp9_findnearmv.h
@@ -36,37 +36,7 @@ static void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
const TileInfo *const tile,
- int_mv *dst_nearest,
- int_mv *dst_near,
- int block_idx, int ref_idx,
- int mi_row, int mi_col);
-
-static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mi,
- const MODE_INFO *left_mi, int b) {
- if (b == 0 || b == 2) {
- if (!left_mi || is_inter_block(&left_mi->mbmi))
- return DC_PRED;
-
- return left_mi->mbmi.sb_type < BLOCK_8X8 ? left_mi->bmi[b + 1].as_mode
- : left_mi->mbmi.mode;
- } else {
- assert(b == 1 || b == 3);
- return cur_mi->bmi[b - 1].as_mode;
- }
-}
-
-static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mi,
- const MODE_INFO *above_mi, int b) {
- if (b == 0 || b == 1) {
- if (!above_mi || is_inter_block(&above_mi->mbmi))
- return DC_PRED;
-
- return above_mi->mbmi.sb_type < BLOCK_8X8 ? above_mi->bmi[b + 2].as_mode
- : above_mi->mbmi.mode;
- } else {
- assert(b == 2 || b == 3);
- return cur_mi->bmi[b - 2].as_mode;
- }
-}
+ int block, int ref, int mi_row, int mi_col,
+ int_mv *nearest, int_mv *near);
#endif // VP9_COMMON_VP9_FINDNEARMV_H_
diff --git a/source/libvpx/vp9/common/vp9_idct.c b/source/libvpx/vp9/common/vp9_idct.c
index ea8683e..533f7f3 100644
--- a/source/libvpx/vp9/common/vp9_idct.c
+++ b/source/libvpx/vp9/common/vp9_idct.c
@@ -835,7 +835,8 @@ void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
ht.cols(temp_in, temp_out);
for (j = 0; j < 16; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * stride + i]); }
+ + dest[j * stride + i]);
+ }
}
void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
@@ -1276,7 +1277,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
idct32_1d(temp_in, temp_out);
for (j = 0; j < 32; ++j)
dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
- + dest[j * stride + i]);
+ + dest[j * stride + i]);
}
}
@@ -1344,43 +1345,37 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
// coefficients. Use eobs to decide what to do.
// TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
// Combine that with code here.
- if (eob) {
- if (eob == 1)
- // DC only DCT coefficient
- vp9_idct8x8_1_add(input, dest, stride);
- else if (eob <= 10)
- vp9_idct8x8_10_add(input, dest, stride);
- else
- vp9_idct8x8_64_add(input, dest, stride);
- }
+ if (eob == 1)
+ // DC only DCT coefficient
+ vp9_idct8x8_1_add(input, dest, stride);
+ else if (eob <= 10)
+ vp9_idct8x8_10_add(input, dest, stride);
+ else
+ vp9_idct8x8_64_add(input, dest, stride);
}
void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
int eob) {
/* The calculation can be simplified if there are not many non-zero dct
* coefficients. Use eobs to separate different cases. */
- if (eob) {
- if (eob == 1)
- /* DC only DCT coefficient. */
- vp9_idct16x16_1_add(input, dest, stride);
- else if (eob <= 10)
- vp9_idct16x16_10_add(input, dest, stride);
- else
- vp9_idct16x16_256_add(input, dest, stride);
- }
+ if (eob == 1)
+ /* DC only DCT coefficient. */
+ vp9_idct16x16_1_add(input, dest, stride);
+ else if (eob <= 10)
+ vp9_idct16x16_10_add(input, dest, stride);
+ else
+ vp9_idct16x16_256_add(input, dest, stride);
}
void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
int eob) {
- if (eob) {
- if (eob == 1)
- vp9_idct32x32_1_add(input, dest, stride);
- else if (eob <= 34)
- // non-zero coeff only in upper-left 8x8
- vp9_idct32x32_34_add(input, dest, stride);
- else
- vp9_idct32x32_1024_add(input, dest, stride);
- }
+ if (eob == 1)
+ vp9_idct32x32_1_add(input, dest, stride);
+ else if (eob <= 34)
+ // non-zero coeff only in upper-left 8x8
+ vp9_idct32x32_34_add(input, dest, stride);
+ else
+ vp9_idct32x32_1024_add(input, dest, stride);
}
// iht
@@ -1397,9 +1392,7 @@ void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
if (tx_type == DCT_DCT) {
vp9_idct8x8_add(input, dest, stride, eob);
} else {
- if (eob > 0) {
- vp9_iht8x8_64_add(input, dest, stride, tx_type);
- }
+ vp9_iht8x8_64_add(input, dest, stride, tx_type);
}
}
@@ -1408,8 +1401,6 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
if (tx_type == DCT_DCT) {
vp9_idct16x16_add(input, dest, stride, eob);
} else {
- if (eob > 0) {
- vp9_iht16x16_256_add(input, dest, stride, tx_type);
- }
+ vp9_iht16x16_256_add(input, dest, stride, tx_type);
}
}
diff --git a/source/libvpx/vp9/common/vp9_idct.h b/source/libvpx/vp9/common/vp9_idct.h
index 2b3f35f..183c50a 100644
--- a/source/libvpx/vp9/common/vp9_idct.h
+++ b/source/libvpx/vp9/common/vp9_idct.h
@@ -77,8 +77,7 @@ static const int sinpi_4_9 = 15212;
static INLINE int dct_const_round_shift(int input) {
int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- assert(INT16_MIN <= rv && rv <= INT16_MAX);
- return rv;
+ return (int16_t)rv;
}
typedef void (*transform_1d)(const int16_t*, int16_t*);
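
Note: the new dct_const_round_shift() truncates to int16_t instead of
asserting that the rounded value fits.  With DCT_CONST_BITS assumed to be 14,
as elsewhere in vp9_idct.h, the operation is a rounded right shift; a small
self-contained sketch (macro names suffixed to avoid clashing with the real
headers):

    #include <stdint.h>

    #define DCT_CONST_BITS_SKETCH 14
    #define ROUND_POWER_OF_TWO_SKETCH(value, n) \
        (((value) + (1 << ((n) - 1))) >> (n))

    static int16_t dct_const_round_shift_sketch(int input) {
      const int rv = ROUND_POWER_OF_TWO_SKETCH(input, DCT_CONST_BITS_SKETCH);
      /* e.g. input = 2 * 11585 (about 2 * cos(pi/4) in Q14):
       * (23170 + 8192) >> 14 == 1 */
      return (int16_t)rv;
    }
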
diff --git a/source/libvpx/vp9/common/vp9_loopfilter.c b/source/libvpx/vp9/common/vp9_loopfilter.c
index 218e12e..40d8ffd 100644
--- a/source/libvpx/vp9/common/vp9_loopfilter.c
+++ b/source/libvpx/vp9/common/vp9_loopfilter.c
@@ -32,6 +32,8 @@ typedef struct {
uint16_t left_uv[TX_SIZES];
uint16_t above_uv[TX_SIZES];
uint16_t int_4x4_uv;
+ uint8_t lfl_y[64];
+ uint8_t lfl_uv[16];
} LOOP_FILTER_MASK;
// 64 bit masks for left transform size. Each 1 represents a position where
@@ -281,10 +283,10 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
// n_shift is a multiplier for lf_deltas
// the multiplier is 1 when filter_lvl is between 0 and 31;
// 2 when filter_lvl is between 32 and 63
- const int n_shift = default_filt_lvl >> 5;
+ const int scale = 1 << (default_filt_lvl >> 5);
loop_filter_info_n *const lfi = &cm->lf_info;
struct loopfilter *const lf = &cm->lf;
- struct segmentation *const seg = &cm->seg;
+ const struct segmentation *const seg = &cm->seg;
// update limits if sharpness has changed
if (lf->last_sharpness_level != lf->sharpness_level) {
@@ -293,9 +295,7 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
}
for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
- int lvl_seg = default_filt_lvl, ref, mode, intra_lvl;
-
- // Set the baseline filter values for each segment
+ int lvl_seg = default_filt_lvl;
if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
lvl_seg = seg->abs_delta == SEGMENT_ABSDATA
@@ -307,77 +307,118 @@ void vp9_loop_filter_frame_init(VP9_COMMON *cm, int default_filt_lvl) {
// we could get rid of this if we assume that deltas are set to
// zero when not in use; encoder always uses deltas
vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
- continue;
- }
-
- intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * (1 << n_shift);
- lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
-
- for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref)
- for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
- const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * (1 << n_shift)
- + lf->mode_deltas[mode] * (1 << n_shift);
- lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+ } else {
+ int ref, mode;
+ const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
+ lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
+
+ for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) {
+ for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+ const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale
+ + lf->mode_deltas[mode] * scale;
+ lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+ }
}
+ }
}
}
-static int build_lfi(const loop_filter_info_n *lfi_n,
- const MB_MODE_INFO *mbmi,
- const loop_filter_thresh **lfi) {
- const int seg = mbmi->segment_id;
- const int ref = mbmi->ref_frame[0];
- const int mode = lfi_n->mode_lf_lut[mbmi->mode];
- const int filter_level = lfi_n->lvl[seg][ref][mode];
-
- if (filter_level > 0) {
- *lfi = &lfi_n->lfthr[filter_level];
- return 1;
- } else {
- return 0;
- }
-}
-
-static void filter_selectively_vert(uint8_t *s, int pitch,
- unsigned int mask_16x16,
- unsigned int mask_8x8,
- unsigned int mask_4x4,
- unsigned int mask_4x4_int,
- const loop_filter_thresh **p_lfi) {
+static void filter_selectively_vert_row2(PLANE_TYPE plane_type,
+ uint8_t *s, int pitch,
+ unsigned int mask_16x16_l,
+ unsigned int mask_8x8_l,
+ unsigned int mask_4x4_l,
+ unsigned int mask_4x4_int_l,
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl) {
+ const int mask_shift = plane_type ? 4 : 8;
+ const int mask_cutoff = plane_type ? 0xf : 0xff;
+ const int lfl_forward = plane_type ? 4 : 8;
+
+ unsigned int mask_16x16_0 = mask_16x16_l & mask_cutoff;
+ unsigned int mask_8x8_0 = mask_8x8_l & mask_cutoff;
+ unsigned int mask_4x4_0 = mask_4x4_l & mask_cutoff;
+ unsigned int mask_4x4_int_0 = mask_4x4_int_l & mask_cutoff;
+ unsigned int mask_16x16_1 = (mask_16x16_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_8x8_1 = (mask_8x8_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_4x4_1 = (mask_4x4_l >> mask_shift) & mask_cutoff;
+ unsigned int mask_4x4_int_1 = (mask_4x4_int_l >> mask_shift) & mask_cutoff;
unsigned int mask;
- for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
- mask; mask >>= 1) {
- const loop_filter_thresh *lfi = *p_lfi;
+ for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_4x4_int_0 |
+ mask_16x16_1 | mask_8x8_1 | mask_4x4_1 | mask_4x4_int_1;
+ mask; mask >>= 1) {
+ const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl;
+ const loop_filter_thresh *lfi1 = lfi_n->lfthr + *(lfl + lfl_forward);
+ // TODO(yunqingwang): count in loopfilter functions should be removed.
if (mask & 1) {
- if (mask_16x16 & 1) {
- vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr);
- assert(!(mask_8x8 & 1));
- assert(!(mask_4x4 & 1));
- assert(!(mask_4x4_int & 1));
- } else if (mask_8x8 & 1) {
- vp9_mbloop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
- assert(!(mask_16x16 & 1));
- assert(!(mask_4x4 & 1));
- } else if (mask_4x4 & 1) {
- vp9_loop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
- assert(!(mask_16x16 & 1));
- assert(!(mask_8x8 & 1));
+ if ((mask_16x16_0 | mask_16x16_1) & 1) {
+ if ((mask_16x16_0 & mask_16x16_1) & 1) {
+ vp9_mb_lpf_vertical_edge_w_16(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr);
+ } else if (mask_16x16_0 & 1) {
+ vp9_mb_lpf_vertical_edge_w(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr);
+ } else {
+ vp9_mb_lpf_vertical_edge_w(s + 8 *pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr);
+ }
+ }
+
+ if ((mask_8x8_0 | mask_8x8_1) & 1) {
+ if ((mask_8x8_0 & mask_8x8_1) & 1) {
+ vp9_mbloop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr);
+ } else if (mask_8x8_0 & 1) {
+ vp9_mbloop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, 1);
+ } else {
+ vp9_mbloop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, 1);
+ }
+ }
+
+ if ((mask_4x4_0 | mask_4x4_1) & 1) {
+ if ((mask_4x4_0 & mask_4x4_1) & 1) {
+ vp9_loop_filter_vertical_edge_16(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr);
+ } else if (mask_4x4_0 & 1) {
+ vp9_loop_filter_vertical_edge(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, 1);
+ } else {
+ vp9_loop_filter_vertical_edge(s + 8 *pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, 1);
+ }
+ }
+
+ if ((mask_4x4_int_0 | mask_4x4_int_1) & 1) {
+ if ((mask_4x4_int_0 & mask_4x4_int_1) & 1) {
+ vp9_loop_filter_vertical_edge_16(s + 4, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr);
+ } else if (mask_4x4_int_0 & 1) {
+ vp9_loop_filter_vertical_edge(s + 4, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, 1);
+ } else {
+ vp9_loop_filter_vertical_edge(s + 8 *pitch + 4, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, 1);
+ }
}
}
- if (mask_4x4_int & 1)
- vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+
s += 8;
- p_lfi++;
- mask_16x16 >>= 1;
- mask_8x8 >>= 1;
- mask_4x4 >>= 1;
- mask_4x4_int >>= 1;
+ lfl += 1;
+ mask_16x16_0 >>= 1;
+ mask_8x8_0 >>= 1;
+ mask_4x4_0 >>= 1;
+ mask_4x4_int_0 >>= 1;
+ mask_16x16_1 >>= 1;
+ mask_8x8_1 >>= 1;
+ mask_4x4_1 >>= 1;
+ mask_4x4_int_1 >>= 1;
}
}
@@ -386,49 +427,98 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
unsigned int mask_8x8,
unsigned int mask_4x4,
unsigned int mask_4x4_int,
- int only_4x4_1,
- const loop_filter_thresh **p_lfi) {
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl) {
unsigned int mask;
int count;
for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
mask; mask >>= count) {
- const loop_filter_thresh *lfi = *p_lfi;
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
count = 1;
if (mask & 1) {
- if (!only_4x4_1) {
- if (mask_16x16 & 1) {
- if ((mask_16x16 & 3) == 3) {
- vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 2);
- count = 2;
+ if (mask_16x16 & 1) {
+ if ((mask_16x16 & 3) == 3) {
+ vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
+ count = 2;
+ } else {
+ vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+ }
+ } else if (mask_8x8 & 1) {
+ if ((mask_8x8 & 3) == 3) {
+ // Next block's thresholds
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+ vp9_mbloop_filter_horizontal_edge_16(s, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+
+ if ((mask_4x4_int & 3) == 3) {
+ vp9_loop_filter_horizontal_edge_16(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr);
} else {
- vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ if (mask_4x4_int & 1)
+ vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+ else if (mask_4x4_int & 2)
+ vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr, 1);
}
- assert(!(mask_8x8 & 1));
- assert(!(mask_4x4 & 1));
- assert(!(mask_4x4_int & 1));
- } else if (mask_8x8 & 1) {
+ count = 2;
+ } else {
vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
lfi->hev_thr, 1);
- assert(!(mask_16x16 & 1));
- assert(!(mask_4x4 & 1));
- } else if (mask_4x4 & 1) {
- vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
- assert(!(mask_16x16 & 1));
- assert(!(mask_8x8 & 1));
+
+ if (mask_4x4_int & 1)
+ vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
}
- }
+ } else if (mask_4x4 & 1) {
+ if ((mask_4x4 & 3) == 3) {
+ // Next block's thresholds
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
+
+ vp9_loop_filter_horizontal_edge_16(s, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ if ((mask_4x4_int & 3) == 3) {
+ vp9_loop_filter_horizontal_edge_16(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ } else {
+ if (mask_4x4_int & 1)
+ vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+ else if (mask_4x4_int & 2)
+ vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
+ lfin->mblim, lfin->lim,
+ lfin->hev_thr, 1);
+ }
+ count = 2;
+ } else {
+ vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
- if (mask_4x4_int & 1)
+ if (mask_4x4_int & 1)
+ vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
+ lfi->lim, lfi->hev_thr, 1);
+ }
+ } else if (mask_4x4_int & 1) {
vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
lfi->lim, lfi->hev_thr, 1);
+ }
}
s += 8 * count;
- p_lfi += count;
+ lfl += count;
mask_16x16 >>= count;
mask_8x8 >>= count;
mask_4x4 >>= count;
@@ -461,10 +551,20 @@ static void build_masks(const loop_filter_info_n *const lfi_n,
uint16_t *left_uv = &lfm->left_uv[tx_size_uv];
uint16_t *above_uv = &lfm->above_uv[tx_size_uv];
uint16_t *int_4x4_uv = &lfm->int_4x4_uv;
+ int i;
+ int w = num_8x8_blocks_wide_lookup[block_size];
+ int h = num_8x8_blocks_high_lookup[block_size];
// If filter level is 0 we don't loop filter.
- if (!filter_level)
+ if (!filter_level) {
return;
+ } else {
+ int index = shift_y;
+ for (i = 0; i < h; i++) {
+ vpx_memset(&lfm->lfl_y[index], filter_level, w);
+ index += 8;
+ }
+ }
// These set 1 in the current block size for the block size edges.
// For instance if the block size is 32x16, we'll set :
@@ -530,9 +630,19 @@ static void build_y_mask(const loop_filter_info_n *const lfi_n,
uint64_t *left_y = &lfm->left_y[tx_size_y];
uint64_t *above_y = &lfm->above_y[tx_size_y];
uint64_t *int_4x4_y = &lfm->int_4x4_y;
+ int i;
+ int w = num_8x8_blocks_wide_lookup[block_size];
+ int h = num_8x8_blocks_high_lookup[block_size];
- if (!filter_level)
+ if (!filter_level) {
return;
+ } else {
+ int index = shift_y;
+ for (i = 0; i < h; i++) {
+ vpx_memset(&lfm->lfl_y[index], filter_level, w);
+ index += 8;
+ }
+ }
*above_y |= above_prediction_mask[block_size] << shift_y;
*left_y |= left_prediction_mask[block_size] << shift_y;
@@ -784,8 +894,74 @@ static void setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col,
lfm->left_uv[i] &= 0xeeee;
}
}
+
+ // Assert if we try to apply 2 different loop filters at the same position.
+ assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
+ assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
+ assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
+ assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
+ assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8]));
+ assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
+ assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
+ assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
+ assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
+ assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
+ assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
+ assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
+ assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
+ assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
+ assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
+ assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
}
+
#if CONFIG_NON420
+static uint8_t build_lfi(const loop_filter_info_n *lfi_n,
+ const MB_MODE_INFO *mbmi) {
+ const int seg = mbmi->segment_id;
+ const int ref = mbmi->ref_frame[0];
+ const int mode = lfi_n->mode_lf_lut[mbmi->mode];
+ const int filter_level = lfi_n->lvl[seg][ref][mode];
+
+ return filter_level;
+}
+
+static void filter_selectively_vert(uint8_t *s, int pitch,
+ unsigned int mask_16x16,
+ unsigned int mask_8x8,
+ unsigned int mask_4x4,
+ unsigned int mask_4x4_int,
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl) {
+ unsigned int mask;
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4 | mask_4x4_int;
+ mask; mask >>= 1) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr);
+ } else if (mask_8x8 & 1) {
+ vp9_mbloop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+ } else if (mask_4x4 & 1) {
+ vp9_loop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+ }
+ }
+ if (mask_4x4_int & 1)
+ vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
+ s += 8;
+ lfl += 1;
+ mask_16x16 >>= 1;
+ mask_8x8 >>= 1;
+ mask_4x4 >>= 1;
+ mask_4x4_int >>= 1;
+ }
+}
+
static void filter_block_plane_non420(VP9_COMMON *cm,
struct macroblockd_plane *plane,
MODE_INFO **mi_8x8,
@@ -801,7 +977,7 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
+ uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE];
int r, c;
for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
@@ -830,7 +1006,8 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
// Filter level can vary per MI
- if (!build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]))
+ if (!(lfl[(r << 3) + (c >> ss_x)] =
+ build_lfi(&cm->lf_info, &mi[0].mbmi)))
continue;
// Build masks based on the transform size of each block
@@ -887,7 +1064,8 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
mask_16x16_c & border_mask,
mask_8x8_c & border_mask,
mask_4x4_c & border_mask,
- mask_4x4_int[r], lfi[r]);
+ mask_4x4_int[r],
+ &cm->lf_info, &lfl[r << 3]);
dst->buf += 8 * dst->stride;
mi_8x8 += row_step_stride;
}
@@ -898,11 +1076,26 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
+
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16[r];
+ mask_8x8_r = mask_8x8[r];
+ mask_4x4_r = mask_4x4[r];
+ }
+
filter_selectively_horiz(dst->buf, dst->stride,
- mask_16x16[r],
- mask_8x8[r],
- mask_4x4[r],
- mask_4x4_int_r, mi_row + r == 0, lfi[r]);
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
+ mask_4x4_int_r,
+ &cm->lf_info, &lfl[r << 3]);
dst->buf += 8 * dst->stride;
}
}
@@ -910,81 +1103,154 @@ static void filter_block_plane_non420(VP9_COMMON *cm,
static void filter_block_plane(VP9_COMMON *const cm,
struct macroblockd_plane *const plane,
- MODE_INFO **mi_8x8,
- int mi_row, int mi_col,
+ int mi_row,
LOOP_FILTER_MASK *lfm) {
- const int ss_x = plane->subsampling_x;
- const int ss_y = plane->subsampling_y;
- const int row_step = 1 << ss_x;
- const int col_step = 1 << ss_y;
- const int row_step_stride = cm->mode_info_stride * row_step;
struct buf_2d *const dst = &plane->dst;
uint8_t* const dst0 = dst->buf;
- unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
- const loop_filter_thresh *lfi[MI_BLOCK_SIZE][MI_BLOCK_SIZE];
int r, c;
- int row_shift = 3 - ss_x;
- int row_mask = 0xff >> (ss_x << 2);
-#define MASK_ROW(value) ((value >> (r_sampled << row_shift)) & row_mask)
+ if (!plane->plane_type) {
+ uint64_t mask_16x16 = lfm->left_y[TX_16X16];
+ uint64_t mask_8x8 = lfm->left_y[TX_8X8];
+ uint64_t mask_4x4 = lfm->left_y[TX_4X4];
+ uint64_t mask_4x4_int = lfm->int_4x4_y;
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
- int r_sampled = r >> ss_x;
+ // Vertical pass: do 2 rows at one time
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+ unsigned int mask_16x16_l = mask_16x16 & 0xffff;
+ unsigned int mask_8x8_l = mask_8x8 & 0xffff;
+ unsigned int mask_4x4_l = mask_4x4 & 0xffff;
+ unsigned int mask_4x4_int_l = mask_4x4_int & 0xffff;
- // Determine the vertical edges that need filtering
- for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
- const MODE_INFO *mi = mi_8x8[c];
-
- build_lfi(&cm->lf_info, &mi[0].mbmi, &lfi[r][c >> ss_x]);
- }
- if (!plane->plane_type) {
- mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_y);
// Disable filtering on the leftmost column
- filter_selectively_vert(dst->buf, dst->stride,
- MASK_ROW(lfm->left_y[TX_16X16]),
- MASK_ROW(lfm->left_y[TX_8X8]),
- MASK_ROW(lfm->left_y[TX_4X4]),
- MASK_ROW(lfm->int_4x4_y),
- lfi[r]);
- } else {
- mask_4x4_int[r] = MASK_ROW(lfm->int_4x4_uv);
- // Disable filtering on the leftmost column
- filter_selectively_vert(dst->buf, dst->stride,
- MASK_ROW(lfm->left_uv[TX_16X16]),
- MASK_ROW(lfm->left_uv[TX_8X8]),
- MASK_ROW(lfm->left_uv[TX_4X4]),
- MASK_ROW(lfm->int_4x4_uv),
- lfi[r]);
+ filter_selectively_vert_row2(plane->plane_type,
+ dst->buf, dst->stride,
+ mask_16x16_l,
+ mask_8x8_l,
+ mask_4x4_l,
+ mask_4x4_int_l,
+ &cm->lf_info, &lfm->lfl_y[r << 3]);
+
+ dst->buf += 16 * dst->stride;
+ mask_16x16 >>= 16;
+ mask_8x8 >>= 16;
+ mask_4x4 >>= 16;
+ mask_4x4_int >>= 16;
}
- dst->buf += 8 * dst->stride;
- mi_8x8 += row_step_stride;
- }
- // Now do horizontal pass
- dst->buf = dst0;
- for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
- const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
- const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
- int r_sampled = r >> ss_x;
+ // Horizontal pass
+ dst->buf = dst0;
+ mask_16x16 = lfm->above_y[TX_16X16];
+ mask_8x8 = lfm->above_y[TX_8X8];
+ mask_4x4 = lfm->above_y[TX_4X4];
+ mask_4x4_int = lfm->int_4x4_y;
+
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
+
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16 & 0xff;
+ mask_8x8_r = mask_8x8 & 0xff;
+ mask_4x4_r = mask_4x4 & 0xff;
+ }
- if (!plane->plane_type) {
filter_selectively_horiz(dst->buf, dst->stride,
- MASK_ROW(lfm->above_y[TX_16X16]),
- MASK_ROW(lfm->above_y[TX_8X8]),
- MASK_ROW(lfm->above_y[TX_4X4]),
- MASK_ROW(lfm->int_4x4_y),
- mi_row + r == 0, lfi[r]);
- } else {
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
+ mask_4x4_int & 0xff,
+ &cm->lf_info, &lfm->lfl_y[r << 3]);
+
+ dst->buf += 8 * dst->stride;
+ mask_16x16 >>= 8;
+ mask_8x8 >>= 8;
+ mask_4x4 >>= 8;
+ mask_4x4_int >>= 8;
+ }
+ } else {
+ uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
+ uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
+ uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
+ uint16_t mask_4x4_int = lfm->int_4x4_uv;
+
+ // Vertical pass: do 2 rows at one time
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
+ if (plane->plane_type == 1) {
+ for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
+ lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
+ lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) +
+ (c << 1)];
+ }
+ }
+
+ {
+ unsigned int mask_16x16_l = mask_16x16 & 0xff;
+ unsigned int mask_8x8_l = mask_8x8 & 0xff;
+ unsigned int mask_4x4_l = mask_4x4 & 0xff;
+ unsigned int mask_4x4_int_l = mask_4x4_int & 0xff;
+
+ // Disable filtering on the leftmost column
+ filter_selectively_vert_row2(plane->plane_type,
+ dst->buf, dst->stride,
+ mask_16x16_l,
+ mask_8x8_l,
+ mask_4x4_l,
+ mask_4x4_int_l,
+ &cm->lf_info, &lfm->lfl_uv[r << 1]);
+
+ dst->buf += 16 * dst->stride;
+ mask_16x16 >>= 8;
+ mask_8x8 >>= 8;
+ mask_4x4 >>= 8;
+ mask_4x4_int >>= 8;
+ }
+ }
+
+ // Horizontal pass
+ dst->buf = dst0;
+ mask_16x16 = lfm->above_uv[TX_16X16];
+ mask_8x8 = lfm->above_uv[TX_8X8];
+ mask_4x4 = lfm->above_uv[TX_4X4];
+ mask_4x4_int = lfm->int_4x4_uv;
+
+ for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+ const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
+ const unsigned int mask_4x4_int_r = skip_border_4x4_r ?
+ 0 : (mask_4x4_int & 0xf);
+ unsigned int mask_16x16_r;
+ unsigned int mask_8x8_r;
+ unsigned int mask_4x4_r;
+
+ if (mi_row + r == 0) {
+ mask_16x16_r = 0;
+ mask_8x8_r = 0;
+ mask_4x4_r = 0;
+ } else {
+ mask_16x16_r = mask_16x16 & 0xf;
+ mask_8x8_r = mask_8x8 & 0xf;
+ mask_4x4_r = mask_4x4 & 0xf;
+ }
+
filter_selectively_horiz(dst->buf, dst->stride,
- MASK_ROW(lfm->above_uv[TX_16X16]),
- MASK_ROW(lfm->above_uv[TX_8X8]),
- MASK_ROW(lfm->above_uv[TX_4X4]),
+ mask_16x16_r,
+ mask_8x8_r,
+ mask_4x4_r,
mask_4x4_int_r,
- mi_row + r == 0, lfi[r]);
+ &cm->lf_info, &lfm->lfl_uv[r << 1]);
+
+ dst->buf += 8 * dst->stride;
+ mask_16x16 >>= 4;
+ mask_8x8 >>= 4;
+ mask_4x4 >>= 4;
+ mask_4x4_int >>= 4;
}
- dst->buf += 8 * dst->stride;
}
-#undef MASK_ROW
}
void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
@@ -1017,8 +1283,7 @@ void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
#if CONFIG_NON420
if (use_420)
#endif
- filter_block_plane(cm, &xd->plane[plane], mi_8x8 + mi_col, mi_row,
- mi_col, &lfm);
+ filter_block_plane(cm, &xd->plane[plane], mi_row, &lfm);
#if CONFIG_NON420
else
filter_block_plane_non420(cm, &xd->plane[plane], mi_8x8 + mi_col,
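Note on the rewritten filter_block_plane above: instead of rebuilding per-row filter info, it walks the precomputed per-superblock edge masks (lfm), consuming 16 bits (two rows of 8x8 blocks) per vertical-pass iteration for the luma plane and shifting the masks down afterwards. Below is a minimal, self-contained sketch of that mask walk, using hypothetical names (walk_luma_mask, with process_row2 standing in for filter_selectively_vert_row2) and assuming MI_BLOCK_SIZE == 8; it is illustrative only, not part of the patch.

/* Sketch: consume a 64-bit luma edge mask 16 bits (two 8x8 rows) at a time. */
#include <stdint.h>
#include <stdio.h>

static void process_row2(unsigned int mask_l, int row) {
  printf("rows %d-%d: vertical-edge mask 0x%04x\n", row, row + 1, mask_l);
}

static void walk_luma_mask(uint64_t mask_16x16) {
  int r;
  for (r = 0; r < 8; r += 2) {                           /* MI_BLOCK_SIZE == 8 */
    process_row2((unsigned int)(mask_16x16 & 0xffff), r); /* low 16 bits = two rows */
    mask_16x16 >>= 16;                                   /* advance to the next pair */
  }
}

int main(void) {
  walk_luma_mask(0x00ff00ff00ff00ffULL);                 /* arbitrary example mask */
  return 0;
}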
diff --git a/source/libvpx/vp9/common/vp9_loopfilter_filters.c b/source/libvpx/vp9/common/vp9_loopfilter_filters.c
index 2c4bf6c..f2e910f 100644
--- a/source/libvpx/vp9/common/vp9_loopfilter_filters.c
+++ b/source/libvpx/vp9/common/vp9_loopfilter_filters.c
@@ -121,6 +121,17 @@ void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */,
}
}
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_loop_filter_horizontal_edge_c(s, p, blimit0, limit0, thresh0, 1);
+ vp9_loop_filter_horizontal_edge_c(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
@@ -141,6 +152,18 @@ void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
}
}
+void vp9_loop_filter_vertical_edge_16_c(uint8_t *s, int pitch,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_loop_filter_vertical_edge_c(s, pitch, blimit0, limit0, thresh0, 1);
+ vp9_loop_filter_vertical_edge_c(s + 8 * pitch, pitch, blimit1, limit1,
+ thresh1, 1);
+}
+
static INLINE void filter8(int8_t mask, uint8_t hev, uint8_t flat,
uint8_t *op3, uint8_t *op2,
uint8_t *op1, uint8_t *op0,
@@ -185,6 +208,17 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p,
}
}
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_mbloop_filter_horizontal_edge_c(s, p, blimit0, limit0, thresh0, 1);
+ vp9_mbloop_filter_horizontal_edge_c(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
@@ -205,6 +239,18 @@ void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
}
}
+void vp9_mbloop_filter_vertical_edge_16_c(uint8_t *s, int pitch,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ vp9_mbloop_filter_vertical_edge_c(s, pitch, blimit0, limit0, thresh0, 1);
+ vp9_mbloop_filter_vertical_edge_c(s + 8 * pitch, pitch, blimit1, limit1,
+ thresh1, 1);
+}
+
static INLINE void filter16(int8_t mask, uint8_t hev,
uint8_t flat, uint8_t flat2,
uint8_t *op7, uint8_t *op6,
@@ -285,13 +331,14 @@ void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p,
}
}
-void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
+static void mb_lpf_vertical_edge_w(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh,
+ int count) {
int i;
- for (i = 0; i < 8; ++i) {
+ for (i = 0; i < count; ++i) {
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask = filter_mask(*limit, *blimit,
@@ -307,3 +354,17 @@ void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p,
s += p;
}
}
+
+void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_c(uint8_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16);
+}
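The new *_16_c wrappers above cover a 16-pixel edge by applying the existing 8-pixel C filters twice with independent limit/threshold sets. The following usage sketch assumes the prototype shown in the patch and linkage against the libvpx objects; the buffer size, stride, and threshold values are arbitrary placeholders, not codec values.

/* Sketch: filter one 16-pixel-wide horizontal edge with the dual-edge wrapper. */
#include <stdint.h>
#include <string.h>

/* prototype as declared by the patch */
void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int p,
                                          const uint8_t *blimit0,
                                          const uint8_t *limit0,
                                          const uint8_t *thresh0,
                                          const uint8_t *blimit1,
                                          const uint8_t *limit1,
                                          const uint8_t *thresh1);

int main(void) {
  static uint8_t frame[64 * 64];
  const int stride = 64;
  const uint8_t blimit = 60, limit = 10, thresh = 2;   /* placeholder values */
  memset(frame, 128, sizeof(frame));
  /* s points at the first pixel below the edge; the wrapper handles
   * columns 0..7 with the first limit set and columns 8..15 with the second. */
  vp9_loop_filter_horizontal_edge_16_c(frame + 8 * stride, stride,
                                       &blimit, &limit, &thresh,
                                       &blimit, &limit, &thresh);
  return 0;
}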
diff --git a/source/libvpx/vp9/common/vp9_mv.h b/source/libvpx/vp9/common/vp9_mv.h
index 31a79b9..155c3f1 100644
--- a/source/libvpx/vp9/common/vp9_mv.h
+++ b/source/libvpx/vp9/common/vp9_mv.h
@@ -15,7 +15,7 @@
#include "vp9/common/vp9_common.h"
-typedef struct {
+typedef struct mv {
int16_t row;
int16_t col;
} MV;
@@ -25,7 +25,7 @@ typedef union int_mv {
MV as_mv;
} int_mv; /* facilitates faster equality tests and copies */
-typedef struct {
+typedef struct mv32 {
int32_t row;
int32_t col;
} MV32;
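Naming the struct tags above (struct mv, struct mv32) lets other headers forward-declare the types without including vp9_mv.h, which is what the `struct mv;` line added to vp9_rtcd_defs.sh later in this patch relies on. A minimal sketch of the pattern follows; cost_of_mv is a hypothetical function, not part of the patch.

/* Sketch: a named tag allows pointer use before the full definition is seen. */
#include <stdint.h>

struct mv;                                    /* forward declaration only */
int cost_of_mv(const struct mv *m);           /* pointer use needs no full type */

typedef struct mv {                           /* full definition, as in vp9_mv.h */
  int16_t row;
  int16_t col;
} MV;

int cost_of_mv(const struct mv *m) {
  return m->row * m->row + m->col * m->col;   /* arbitrary placeholder metric */
}

int main(void) {
  MV v = { 3, -4 };
  return cost_of_mv(&v) == 25 ? 0 : 1;        /* 9 + 16 == 25 */
}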
diff --git a/source/libvpx/vp9/common/vp9_onyx.h b/source/libvpx/vp9/common/vp9_onyx.h
index acb4724..65a2a5e 100644
--- a/source/libvpx/vp9/common/vp9_onyx.h
+++ b/source/libvpx/vp9/common/vp9_onyx.h
@@ -64,6 +64,13 @@ extern "C"
FRAMEFLAGS_ALTREF = 4,
} FRAMETYPE_FLAGS;
+ typedef enum {
+ NO_AQ = 0,
+ VARIANCE_AQ = 1,
+ COMPLEXITY_AQ = 2,
+ AQ_MODES_COUNT // This should always be the last member of the enum
+ } AQ_MODES;
+
typedef struct {
int version; // 4 versions of bitstream defined:
// 0 - best quality/slowest decode,
@@ -128,6 +135,7 @@ extern "C"
int best_allowed_q;
int cq_level;
int lossless;
+ int aq_mode; // Adaptive Quantization mode
// two pass datarate control
int two_pass_vbrbias; // two pass datarate control tweaks
@@ -185,7 +193,7 @@ extern "C"
int64_t end_time_stamp);
int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags,
- unsigned long *size, unsigned char *dest,
+ size_t *size, uint8_t *dest,
int64_t *time_stamp, int64_t *time_end,
int flush);
@@ -221,8 +229,6 @@ extern "C"
int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
unsigned int height);
- int vp9_switch_layer(VP9_PTR comp, int layer);
-
void vp9_set_svc(VP9_PTR comp, int use_svc);
int vp9_get_quantizer(VP9_PTR c);
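The AQ_MODES enum above ends with an AQ_MODES_COUNT sentinel so callers can range-check the new aq_mode field. A small illustrative sketch follows; aq_mode_is_valid is hypothetical and not part of the patch.

/* Sketch: validate an adaptive-quantization mode against the sentinel. */
typedef enum {
  NO_AQ = 0,
  VARIANCE_AQ = 1,
  COMPLEXITY_AQ = 2,
  AQ_MODES_COUNT  // This should always be the last member of the enum
} AQ_MODES;

static int aq_mode_is_valid(int aq_mode) {
  return aq_mode >= NO_AQ && aq_mode < AQ_MODES_COUNT;
}

int main(void) {
  return aq_mode_is_valid(VARIANCE_AQ) && !aq_mode_is_valid(3) ? 0 : 1;
}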
diff --git a/source/libvpx/vp9/common/vp9_onyxc_int.h b/source/libvpx/vp9/common/vp9_onyxc_int.h
index ba2e9d8..bfb94e4 100644
--- a/source/libvpx/vp9/common/vp9_onyxc_int.h
+++ b/source/libvpx/vp9/common/vp9_onyxc_int.h
@@ -25,24 +25,29 @@
#include "vp9/common/vp9_postproc.h"
#endif
-#define ALLOWED_REFS_PER_FRAME 3
+#define REFS_PER_FRAME 3
-#define NUM_REF_FRAMES_LOG2 3
-#define NUM_REF_FRAMES (1 << NUM_REF_FRAMES_LOG2)
+#define REF_FRAMES_LOG2 3
+#define REF_FRAMES (1 << REF_FRAMES_LOG2)
// 1 scratch frame for the new frame, 3 for scaled references on the encoder
// TODO(jkoleszar): These 3 extra references could probably come from the
// normal reference pool.
-#define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4)
+#define FRAME_BUFFERS (REF_FRAMES + 4)
-#define NUM_FRAME_CONTEXTS_LOG2 2
-#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2)
+#define FRAME_CONTEXTS_LOG2 2
+#define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2)
+
+extern const struct {
+ PARTITION_CONTEXT above;
+ PARTITION_CONTEXT left;
+} partition_context_lookup[BLOCK_SIZES];
typedef struct frame_contexts {
vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
- vp9_prob partition_prob[FRAME_TYPES][PARTITION_CONTEXTS][PARTITION_TYPES - 1];
- vp9_coeff_probs_model coef_probs[TX_SIZES][BLOCK_TYPES];
+ vp9_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+ vp9_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
vp9_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS - 1];
vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
@@ -59,9 +64,9 @@ typedef struct {
unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
- vp9_coeff_count_model coef[TX_SIZES][BLOCK_TYPES];
- unsigned int eob_branch[TX_SIZES][BLOCK_TYPES][REF_TYPES]
- [COEF_BANDS][PREV_COEF_CONTEXTS];
+ vp9_coeff_count_model coef[TX_SIZES][PLANE_TYPES];
+ unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES]
+ [COEF_BANDS][COEFF_CONTEXTS];
unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
[SWITCHABLE_FILTERS];
unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
@@ -76,11 +81,11 @@ typedef struct {
typedef enum {
- SINGLE_PREDICTION_ONLY = 0,
- COMP_PREDICTION_ONLY = 1,
- HYBRID_PREDICTION = 2,
- NB_PREDICTION_TYPES = 3,
-} COMPPREDMODE_TYPE;
+ SINGLE_REFERENCE = 0,
+ COMPOUND_REFERENCE = 1,
+ REFERENCE_MODE_SELECT = 2,
+ REFERENCE_MODES = 3,
+} REFERENCE_MODE;
typedef struct VP9Common {
struct vpx_internal_error_info error;
@@ -108,17 +113,17 @@ typedef struct VP9Common {
YV12_BUFFER_CONFIG *frame_to_show;
- YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS];
- int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; /* reference counts */
- int ref_frame_map[NUM_REF_FRAMES]; /* maps fb_idx to reference slot */
+ YV12_BUFFER_CONFIG *yv12_fb;
+ int *fb_idx_ref_cnt; /* reference counts */
+ int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
// TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
// roll new_fb_idx into it.
- // Each frame can reference ALLOWED_REFS_PER_FRAME buffers
- int active_ref_idx[ALLOWED_REFS_PER_FRAME];
- struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME];
- struct scale_factors_common active_ref_scale_comm[ALLOWED_REFS_PER_FRAME];
+ // Each frame can reference REFS_PER_FRAME buffers
+ int active_ref_idx[REFS_PER_FRAME];
+ struct scale_factors active_ref_scale[REFS_PER_FRAME];
+ struct scale_factors_common active_ref_scale_comm[REFS_PER_FRAME];
int new_fb_idx;
YV12_BUFFER_CONFIG post_proc_buffer;
@@ -190,10 +195,10 @@ typedef struct VP9Common {
int allow_comp_inter_inter;
MV_REFERENCE_FRAME comp_fixed_ref;
MV_REFERENCE_FRAME comp_var_ref[2];
- COMPPREDMODE_TYPE comp_pred_mode;
+ REFERENCE_MODE reference_mode;
FRAME_CONTEXT fc; /* this frame entropy */
- FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS];
+ FRAME_CONTEXT frame_contexts[FRAME_CONTEXTS];
unsigned int frame_context_idx; /* Context to use/update */
FRAME_COUNTS counts;
@@ -208,6 +213,15 @@ typedef struct VP9Common {
int frame_parallel_decoding_mode;
int log2_tile_cols, log2_tile_rows;
+
+ vpx_codec_frame_buffer_t *fb_list; // External frame buffers
+ int fb_count; // Total number of frame buffers
+ vpx_realloc_frame_buffer_cb_fn_t realloc_fb_cb;
+ void *user_priv; // Private data associated with the external frame buffers.
+
+ int fb_lru; // Flag telling if lru is on/off
+ uint32_t *fb_idx_ref_lru; // Frame buffer lru cache
+ uint32_t fb_idx_ref_lru_count;
} VP9_COMMON;
// ref == 0 => LAST_FRAME
@@ -223,18 +237,34 @@ static YV12_BUFFER_CONFIG *get_frame_new_buffer(VP9_COMMON *cm) {
static int get_free_fb(VP9_COMMON *cm) {
int i;
- for (i = 0; i < NUM_YV12_BUFFERS; i++)
- if (cm->fb_idx_ref_cnt[i] == 0)
- break;
+ uint32_t lru_count = cm->fb_idx_ref_lru_count + 1;
+ int free_buffer_idx = cm->fb_count;
+ for (i = 0; i < cm->fb_count; i++) {
+ if (!cm->fb_lru) {
+ if (cm->fb_idx_ref_cnt[i] == 0) {
+ free_buffer_idx = i;
+ break;
+ }
+ } else {
+ if (cm->fb_idx_ref_cnt[i] == 0 && cm->fb_idx_ref_lru[i] < lru_count) {
+ free_buffer_idx = i;
+ lru_count = cm->fb_idx_ref_lru[i];
+ }
+ }
+ }
- assert(i < NUM_YV12_BUFFERS);
- cm->fb_idx_ref_cnt[i] = 1;
- return i;
+ assert(free_buffer_idx < cm->fb_count);
+ cm->fb_idx_ref_cnt[free_buffer_idx] = 1;
+ if (cm->fb_lru)
+ cm->fb_idx_ref_lru[free_buffer_idx] = ++cm->fb_idx_ref_lru_count;
+ return free_buffer_idx;
}
static void ref_cnt_fb(int *buf, int *idx, int new_idx) {
- if (buf[*idx] > 0)
- buf[*idx]--;
+ const int ref_index = *idx;
+
+ if (ref_index >= 0 && buf[ref_index] > 0)
+ buf[ref_index]--;
*idx = new_idx;
@@ -245,6 +275,11 @@ static int mi_cols_aligned_to_sb(int n_mis) {
return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
}
+static INLINE const vp9_prob* get_partition_probs(VP9_COMMON *cm, int ctx) {
+ return cm->frame_type == KEY_FRAME ? vp9_kf_partition_probs[ctx]
+ : cm->fc.partition_prob[ctx];
+}
+
static INLINE void set_skip_context(
MACROBLOCKD *xd,
ENTROPY_CONTEXT *above_context[MAX_MB_PLANE],
@@ -293,52 +328,40 @@ static INLINE int frame_is_intra_only(const VP9_COMMON *const cm) {
static INLINE void update_partition_context(
PARTITION_CONTEXT *above_seg_context,
PARTITION_CONTEXT left_seg_context[8],
- int mi_row, int mi_col,
- BLOCK_SIZE sb_type,
- BLOCK_SIZE sb_size) {
- PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col;
- PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK);
-
- const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
- const int bwl = b_width_log2(sb_type);
- const int bhl = b_height_log2(sb_type);
- const int boffset = b_width_log2(BLOCK_64X64) - bsl;
- const char pcval0 = ~(0xe << boffset);
- const char pcval1 = ~(0xf << boffset);
- const char pcvalue[2] = {pcval0, pcval1};
-
- assert(MAX(bwl, bhl) <= bsl);
+ int mi_row, int mi_col, BLOCK_SIZE subsize, BLOCK_SIZE bsize) {
+ PARTITION_CONTEXT *const above_ctx = above_seg_context + mi_col;
+ PARTITION_CONTEXT *const left_ctx = left_seg_context + (mi_row & MI_MASK);
+
+ // num_4x4_blocks_wide_lookup[bsize] / 2
+ const int bs = num_8x8_blocks_wide_lookup[bsize];
// update the partition context at the end notes. set partition bits
// of block sizes larger than the current one to be one, and partition
// bits of smaller block sizes to be zero.
- vpx_memset(above_ctx, pcvalue[bwl == bsl], bs);
- vpx_memset(left_ctx, pcvalue[bhl == bsl], bs);
+ vpx_memset(above_ctx, partition_context_lookup[subsize].above, bs);
+ vpx_memset(left_ctx, partition_context_lookup[subsize].left, bs);
}
static INLINE int partition_plane_context(
const PARTITION_CONTEXT *above_seg_context,
const PARTITION_CONTEXT left_seg_context[8],
- int mi_row, int mi_col,
- BLOCK_SIZE sb_type) {
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col;
const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK);
- int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
+ const int bsl = mi_width_log2(bsize);
+ const int bs = 1 << bsl;
int above = 0, left = 0, i;
- int boffset = mi_width_log2(BLOCK_64X64) - bsl;
- assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
+ assert(mi_width_log2(bsize) == mi_height_log2(bsize));
assert(bsl >= 0);
- assert(boffset >= 0);
- for (i = 0; i < bs; i++)
- above |= (above_ctx[i] & (1 << boffset));
- for (i = 0; i < bs; i++)
- left |= (left_ctx[i] & (1 << boffset));
-
- above = (above > 0);
- left = (left > 0);
+ for (i = 0; i < bs; i++) {
+ above |= above_ctx[i];
+ left |= left_ctx[i];
+ }
+ above = (above & bs) > 0;
+ left = (left & bs) > 0;
return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
}
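get_free_fb() above now supports an LRU policy for external frame buffers: when fb_lru is set, the unreferenced buffer with the smallest LRU stamp is chosen and then restamped with an incremented counter. The following self-contained sketch reproduces just that selection with assumed names and sample data; the real function operates on the VP9_COMMON fields.

/* Sketch: pick the least-recently-used free buffer and restamp it. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FB_COUNT 4

static int ref_cnt[FB_COUNT]  = { 1, 0, 0, 2 };   /* example reference counts */
static uint32_t lru[FB_COUNT] = { 5, 3, 6, 1 };   /* example LRU stamps */
static uint32_t lru_count = 6;

static int get_free_fb_lru(void) {
  uint32_t best = lru_count + 1;
  int i, idx = FB_COUNT;
  for (i = 0; i < FB_COUNT; i++) {
    if (ref_cnt[i] == 0 && lru[i] < best) {       /* free and least recently used */
      idx = i;
      best = lru[i];
    }
  }
  assert(idx < FB_COUNT);
  ref_cnt[idx] = 1;                               /* claim the buffer */
  lru[idx] = ++lru_count;                         /* restamp as most recently used */
  return idx;
}

int main(void) {
  printf("picked buffer %d\n", get_free_fb_lru());  /* expects buffer 1 */
  return 0;
}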
diff --git a/source/libvpx/vp9/common/vp9_pred_common.c b/source/libvpx/vp9/common/vp9_pred_common.c
index 57ca5c5..40cfc81 100644
--- a/source/libvpx/vp9/common/vp9_pred_common.c
+++ b/source/libvpx/vp9/common/vp9_pred_common.c
@@ -25,50 +25,37 @@ static INLINE const MB_MODE_INFO *get_left_mbmi(const MODE_INFO *const left) {
}
// Returns a context number for the given MB prediction signal
-unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
- const int above_in_image = above_mi != NULL;
- const int left_in_image = left_mi != NULL;
+int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
- // left
- const int left_mv_pred = left_in_image ? is_inter_block(&left_mi->mbmi)
- : 0;
- const int left_interp = left_in_image && left_mv_pred
- ? left_mi->mbmi.interp_filter
- : SWITCHABLE_FILTERS;
-
- // above
- const int above_mv_pred = above_in_image ? is_inter_block(&above_mi->mbmi)
- : 0;
- const int above_interp = above_in_image && above_mv_pred
- ? above_mi->mbmi.interp_filter
- : SWITCHABLE_FILTERS;
+ const MODE_INFO *const left_mi = get_left_mi(xd);
+ const int has_left = left_mi != NULL ? is_inter_block(&left_mi->mbmi) : 0;
+ const int left_type = has_left ? left_mi->mbmi.interp_filter
+ : SWITCHABLE_FILTERS;
- if (left_interp == above_interp)
- return left_interp;
- else if (left_interp == SWITCHABLE_FILTERS &&
- above_interp != SWITCHABLE_FILTERS)
- return above_interp;
- else if (left_interp != SWITCHABLE_FILTERS &&
- above_interp == SWITCHABLE_FILTERS)
- return left_interp;
+ const MODE_INFO *const above_mi = get_above_mi(xd);
+ const int has_above = above_mi != NULL ? is_inter_block(&above_mi->mbmi) : 0;
+ const int above_type = has_above ? above_mi->mbmi.interp_filter
+ : SWITCHABLE_FILTERS;
+ if (left_type == above_type)
+ return left_type;
+ else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
+ return above_type;
+ else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
+ return left_type;
else
return SWITCHABLE_FILTERS;
}
// Returns a context number for the given MB prediction signal
-unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
- const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
- const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
- const int above_in_image = above_mi != NULL;
- const int left_in_image = left_mi != NULL;
- const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
- const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+int vp9_get_intra_inter_context(const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(get_above_mi(xd));
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(get_left_mi(xd));
+ const int has_above = above_mbmi != NULL;
+ const int has_left = left_mbmi != NULL;
+ const int above_intra = has_above ? !is_inter_block(above_mbmi) : 1;
+ const int left_intra = has_left ? !is_inter_block(left_mbmi) : 1;
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
@@ -77,62 +64,60 @@ unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd) {
// 1 - intra/inter, inter/intra
// 2 - intra/--, --/intra
// 3 - intra/intra
- if (above_in_image && left_in_image) // both edges available
+ if (has_above && has_left) // both edges available
return left_intra && above_intra ? 3
: left_intra || above_intra;
- else if (above_in_image || left_in_image) // one edge available
- return 2 * (above_in_image ? above_intra : left_intra);
+ else if (has_above || has_left) // one edge available
+ return 2 * (has_above ? above_intra : left_intra);
else
return 0;
}
-// Returns a context number for the given MB prediction signal
-unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
- const MACROBLOCKD *xd) {
- int pred_context;
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
- const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
- const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
- const int above_in_image = above_mi != NULL;
- const int left_in_image = left_mi != NULL;
+
+int vp9_get_reference_mode_context(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ int ctx;
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(get_above_mi(xd));
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(get_left_mi(xd));
+ const int has_above = above_mbmi != NULL;
+ const int has_left = left_mbmi != NULL;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
- if (above_in_image && left_in_image) { // both edges available
+ if (has_above && has_left) { // both edges available
if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
// neither edge uses comp pred (0/1)
- pred_context = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^
- (left_mbmi->ref_frame[0] == cm->comp_fixed_ref);
+ ctx = (above_mbmi->ref_frame[0] == cm->comp_fixed_ref) ^
+ (left_mbmi->ref_frame[0] == cm->comp_fixed_ref);
else if (!has_second_ref(above_mbmi))
// one of two edges uses comp pred (2/3)
- pred_context = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
- !is_inter_block(above_mbmi));
+ ctx = 2 + (above_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
+ !is_inter_block(above_mbmi));
else if (!has_second_ref(left_mbmi))
// one of two edges uses comp pred (2/3)
- pred_context = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
- !is_inter_block(left_mbmi));
+ ctx = 2 + (left_mbmi->ref_frame[0] == cm->comp_fixed_ref ||
+ !is_inter_block(left_mbmi));
else // both edges use comp pred (4)
- pred_context = 4;
- } else if (above_in_image || left_in_image) { // one edge available
- const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+ ctx = 4;
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!has_second_ref(edge_mbmi))
// edge does not use comp pred (0/1)
- pred_context = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref;
+ ctx = edge_mbmi->ref_frame[0] == cm->comp_fixed_ref;
else
// edge uses comp pred (3)
- pred_context = 3;
+ ctx = 3;
} else { // no edges available (1)
- pred_context = 1;
+ ctx = 1;
}
- assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS);
- return pred_context;
+ assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS);
+ return ctx;
}
// Returns a context number for the given MB prediction signal
-unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
- const MACROBLOCKD *xd) {
+int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
int pred_context;
const MODE_INFO *const above_mi = get_above_mi(xd);
const MODE_INFO *const left_mi = get_left_mi(xd);
@@ -212,21 +197,20 @@ unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
return pred_context;
}
-unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
+
+int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
int pred_context;
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
- const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
- const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
- const int above_in_image = above_mi != NULL;
- const int left_in_image = left_mi != NULL;
- const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
- const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(get_above_mi(xd));
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(get_left_mi(xd));
+ const int has_above = above_mbmi != NULL;
+ const int has_left = left_mbmi != NULL;
+ const int above_intra = has_above ? !is_inter_block(above_mbmi) : 1;
+ const int left_intra = has_left ? !is_inter_block(left_mbmi) : 1;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
- if (above_in_image && left_in_image) { // both edges available
+ if (has_above && has_left) { // both edges available
if (above_intra && left_intra) { // intra/intra
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
@@ -259,8 +243,8 @@ unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME;
}
}
- } else if (above_in_image || left_in_image) { // one edge available
- const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi)) { // intra
pred_context = 2;
} else { // inter
@@ -278,22 +262,20 @@ unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
return pred_context;
}
-unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
+int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
int pred_context;
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
- const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
- const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
- const int above_in_image = above_mi != NULL;
- const int left_in_image = left_mi != NULL;
- const int above_intra = above_in_image ? !is_inter_block(above_mbmi) : 1;
- const int left_intra = left_in_image ? !is_inter_block(left_mbmi) : 1;
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(get_above_mi(xd));
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(get_left_mi(xd));
+ const int has_above = above_mbmi != NULL;
+ const int has_left = left_mbmi != NULL;
+ const int above_intra = has_above ? !is_inter_block(above_mbmi) : 1;
+ const int left_intra = has_left ? !is_inter_block(left_mbmi) : 1;
// Note:
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real macroblocks.
// The prediction flags in these dummy entries are initialised to 0.
- if (above_in_image && left_in_image) { // both edges available
+ if (has_above && has_left) { // both edges available
if (above_intra && left_intra) { // intra/intra
pred_context = 2;
} else if (above_intra || left_intra) { // intra/inter or inter/intra
@@ -347,8 +329,8 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
}
}
- } else if (above_in_image || left_in_image) { // one edge available
- const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+ } else if (has_above || has_left) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
if (!is_inter_block(edge_mbmi) ||
(edge_mbmi->ref_frame[0] == LAST_FRAME && !has_second_ref(edge_mbmi)))
@@ -368,43 +350,30 @@ unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
// The mode info data structure has a one element border above and to the
// left of the entries corresponding to real blocks.
// The prediction flags in these dummy entries are initialized to 0.
-unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd) {
- const MODE_INFO *const above_mi = get_above_mi(xd);
- const MODE_INFO *const left_mi = get_left_mi(xd);
- const MB_MODE_INFO *const above_mbmi = get_above_mbmi(above_mi);
- const MB_MODE_INFO *const left_mbmi = get_left_mbmi(left_mi);
- const int above_in_image = above_mi != NULL;
- const int left_in_image = left_mi != NULL;
+int vp9_get_tx_size_context(const MACROBLOCKD *xd) {
const int max_tx_size = max_txsize_lookup[xd->mi_8x8[0]->mbmi.sb_type];
- int above_context = max_tx_size;
- int left_context = max_tx_size;
-
- if (above_in_image)
- above_context = above_mbmi->skip_coeff ? max_tx_size
- : above_mbmi->tx_size;
-
- if (left_in_image)
- left_context = left_mbmi->skip_coeff ? max_tx_size
- : left_mbmi->tx_size;
-
- if (!left_in_image)
- left_context = above_context;
-
- if (!above_in_image)
- above_context = left_context;
-
- return above_context + left_context > max_tx_size;
-}
-
-void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag) {
- xd->mi_8x8[0]->mbmi.seg_id_predicted = pred_flag;
+ const MB_MODE_INFO *const above_mbmi = get_above_mbmi(get_above_mi(xd));
+ const MB_MODE_INFO *const left_mbmi = get_left_mbmi(get_left_mi(xd));
+ const int has_above = above_mbmi != NULL;
+ const int has_left = left_mbmi != NULL;
+ int above_ctx = (has_above && !above_mbmi->skip_coeff) ? above_mbmi->tx_size
+ : max_tx_size;
+ int left_ctx = (has_left && !left_mbmi->skip_coeff) ? left_mbmi->tx_size
+ : max_tx_size;
+ if (!has_left)
+ left_ctx = above_ctx;
+
+ if (!has_above)
+ above_ctx = left_ctx;
+
+ return (above_ctx + left_ctx) > max_tx_size;
}
int vp9_get_segment_id(VP9_COMMON *cm, const uint8_t *segment_ids,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y, segment_id = INT_MAX;
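The reworked vp9_get_pred_context_switchable_interp() above reduces to a small merging rule over the left and above neighbours' filter types: agree when both match, adopt the only known filter when one neighbour is unavailable or intra, otherwise fall back to the SWITCHABLE_FILTERS context. A standalone sketch of just that rule; SWITCHABLE_FILTERS is taken as 3 here purely for the sake of the example.

/* Sketch: merge two neighbour filter types into one context value. */
#include <stdio.h>

#define SWITCHABLE_FILTERS 3   /* assumed number of switchable filters */

static int interp_filter_context(int left_type, int above_type) {
  if (left_type == above_type)
    return left_type;
  else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
    return above_type;
  else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
    return left_type;
  else
    return SWITCHABLE_FILTERS;
}

int main(void) {
  printf("%d\n", interp_filter_context(0, 0));                   /* both agree: 0 */
  printf("%d\n", interp_filter_context(SWITCHABLE_FILTERS, 2));  /* only above known: 2 */
  printf("%d\n", interp_filter_context(1, 2));                   /* disagreement: 3 */
  return 0;
}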
diff --git a/source/libvpx/vp9/common/vp9_pred_common.h b/source/libvpx/vp9/common/vp9_pred_common.h
index 19032bf..23722ba 100644
--- a/source/libvpx/vp9/common/vp9_pred_common.h
+++ b/source/libvpx/vp9/common/vp9_pred_common.h
@@ -40,50 +40,37 @@ static INLINE vp9_prob vp9_get_pred_prob_seg_id(struct segmentation *seg,
return seg->pred_probs[vp9_get_pred_context_seg_id(xd)];
}
-void vp9_set_pred_flag_seg_id(MACROBLOCKD *xd, uint8_t pred_flag);
-
-static INLINE int vp9_get_pred_context_mbskip(const MACROBLOCKD *xd) {
+static INLINE int vp9_get_skip_context(const MACROBLOCKD *xd) {
const MODE_INFO *const above_mi = get_above_mi(xd);
const MODE_INFO *const left_mi = get_left_mi(xd);
- const int above_skip_coeff = (above_mi != NULL) ?
- above_mi->mbmi.skip_coeff : 0;
- const int left_skip_coeff = (left_mi != NULL) ? left_mi->mbmi.skip_coeff : 0;
-
- return above_skip_coeff + left_skip_coeff;
+ const int above_skip = (above_mi != NULL) ? above_mi->mbmi.skip_coeff : 0;
+ const int left_skip = (left_mi != NULL) ? left_mi->mbmi.skip_coeff : 0;
+ return above_skip + left_skip;
}
-static INLINE vp9_prob vp9_get_pred_prob_mbskip(const VP9_COMMON *cm,
- const MACROBLOCKD *xd) {
- return cm->fc.mbskip_probs[vp9_get_pred_context_mbskip(xd)];
+static INLINE vp9_prob vp9_get_skip_prob(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc.mbskip_probs[vp9_get_skip_context(xd)];
}
-static INLINE unsigned char vp9_get_pred_flag_mbskip(const MACROBLOCKD *xd) {
- return xd->mi_8x8[0]->mbmi.skip_coeff;
-}
+int vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
-unsigned char vp9_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
+int vp9_get_intra_inter_context(const MACROBLOCKD *xd);
-unsigned char vp9_get_pred_context_intra_inter(const MACROBLOCKD *xd);
-
-static INLINE vp9_prob vp9_get_pred_prob_intra_inter(const VP9_COMMON *cm,
- const MACROBLOCKD *xd) {
- const int pred_context = vp9_get_pred_context_intra_inter(xd);
- return cm->fc.intra_inter_prob[pred_context];
+static INLINE vp9_prob vp9_get_intra_inter_prob(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc.intra_inter_prob[vp9_get_intra_inter_context(xd)];
}
-unsigned char vp9_get_pred_context_comp_inter_inter(const VP9_COMMON *cm,
- const MACROBLOCKD *xd);
-
+int vp9_get_reference_mode_context(const VP9_COMMON *cm, const MACROBLOCKD *xd);
-static INLINE
-vp9_prob vp9_get_pred_prob_comp_inter_inter(const VP9_COMMON *cm,
- const MACROBLOCKD *xd) {
- const int pred_context = vp9_get_pred_context_comp_inter_inter(cm, xd);
- return cm->fc.comp_inter_prob[pred_context];
+static INLINE vp9_prob vp9_get_reference_mode_prob(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd) {
+ return cm->fc.comp_inter_prob[vp9_get_reference_mode_context(cm, xd)];
}
-unsigned char vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
- const MACROBLOCKD *xd);
+int vp9_get_pred_context_comp_ref_p(const VP9_COMMON *cm,
+ const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
@@ -91,50 +78,55 @@ static INLINE vp9_prob vp9_get_pred_prob_comp_ref_p(const VP9_COMMON *cm,
return cm->fc.comp_ref_prob[pred_context];
}
-unsigned char vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
+int vp9_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_single_ref_p1(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- const int pred_context = vp9_get_pred_context_single_ref_p1(xd);
- return cm->fc.single_ref_prob[pred_context][0];
+ return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p1(xd)][0];
}
-unsigned char vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
+int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
static INLINE vp9_prob vp9_get_pred_prob_single_ref_p2(const VP9_COMMON *cm,
const MACROBLOCKD *xd) {
- const int pred_context = vp9_get_pred_context_single_ref_p2(xd);
- return cm->fc.single_ref_prob[pred_context][1];
+ return cm->fc.single_ref_prob[vp9_get_pred_context_single_ref_p2(xd)][1];
}
-unsigned char vp9_get_pred_context_tx_size(const MACROBLOCKD *xd);
+int vp9_get_tx_size_context(const MACROBLOCKD *xd);
-static const vp9_prob *get_tx_probs(BLOCK_SIZE bsize, uint8_t context,
+static const vp9_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
const struct tx_probs *tx_probs) {
- if (bsize < BLOCK_16X16)
- return tx_probs->p8x8[context];
- else if (bsize < BLOCK_32X32)
- return tx_probs->p16x16[context];
- else
- return tx_probs->p32x32[context];
+ switch (max_tx_size) {
+ case TX_8X8:
+ return tx_probs->p8x8[ctx];
+ case TX_16X16:
+ return tx_probs->p16x16[ctx];
+ case TX_32X32:
+ return tx_probs->p32x32[ctx];
+ default:
+ assert(0 && "Invalid max_tx_size.");
+ return NULL;
+ }
}
-static const vp9_prob *get_tx_probs2(const MACROBLOCKD *xd,
- const struct tx_probs *tx_probs,
- const MODE_INFO *m) {
- const BLOCK_SIZE bsize = m->mbmi.sb_type;
- const int context = vp9_get_pred_context_tx_size(xd);
- return get_tx_probs(bsize, context, tx_probs);
+static const vp9_prob *get_tx_probs2(TX_SIZE max_tx_size, const MACROBLOCKD *xd,
+ const struct tx_probs *tx_probs) {
+ return get_tx_probs(max_tx_size, vp9_get_tx_size_context(xd), tx_probs);
}
-static unsigned int *get_tx_counts(BLOCK_SIZE bsize, uint8_t context,
+static unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
struct tx_counts *tx_counts) {
- if (bsize < BLOCK_16X16)
- return tx_counts->p8x8[context];
- else if (bsize < BLOCK_32X32)
- return tx_counts->p16x16[context];
- else
- return tx_counts->p32x32[context];
+ switch (max_tx_size) {
+ case TX_8X8:
+ return tx_counts->p8x8[ctx];
+ case TX_16X16:
+ return tx_counts->p16x16[ctx];
+ case TX_32X32:
+ return tx_counts->p32x32[ctx];
+ default:
+ assert(0 && "Invalid max_tx_size.");
+ return NULL;
+ }
}
#endif // VP9_COMMON_VP9_PRED_COMMON_H_
diff --git a/source/libvpx/vp9/common/vp9_reconinter.c b/source/libvpx/vp9/common/vp9_reconinter.c
index 1c96788..b177252 100644
--- a/source/libvpx/vp9/common/vp9_reconinter.c
+++ b/source/libvpx/vp9/common/vp9_reconinter.c
@@ -20,37 +20,56 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
-void vp9_setup_interp_filters(MACROBLOCKD *xd,
- INTERPOLATION_TYPE mcomp_filter_type,
- VP9_COMMON *cm) {
- if (xd->mi_8x8 && xd->mi_8x8[0]) {
- MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-
- set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME,
- mbmi->ref_frame[1] - LAST_FRAME,
- cm->active_ref_scale);
- } else {
- set_scale_factors(xd, -1, -1, cm->active_ref_scale);
- }
+static void build_mc_border(const uint8_t *src, uint8_t *dst, int stride,
+ int x, int y, int b_w, int b_h, int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ const uint8_t *ref_row = src - x - y * stride;
+
+ if (y >= h)
+ ref_row += (h - 1) * stride;
+ else if (y > 0)
+ ref_row += y * stride;
+
+ do {
+ int right = 0, copy;
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w)
+ left = b_w;
+
+ if (x + b_w > w)
+ right = x + b_w - w;
+
+ if (right > b_w)
+ right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left)
+ memset(dst, ref_row[0], left);
+
+ if (copy)
+ memmove(dst + left, ref_row + x + left, copy);
+
+ if (right)
+ memset(dst + left + copy, ref_row[w - 1], right);
- xd->subpix.filter_x = xd->subpix.filter_y =
- vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ?
- EIGHTTAP : mcomp_filter_type);
+ dst += stride;
+ ++y;
- assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
+ if (y > 0 && y < h)
+ ref_row += stride;
+ } while (--b_h);
}
static void inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
- const MV32 *mv,
+ const int subpel_x,
+ const int subpel_y,
const struct scale_factors *scale,
int w, int h, int ref,
const struct subpix_fn_table *subpix,
int xs, int ys) {
- const int subpel_x = mv->col & SUBPEL_MASK;
- const int subpel_y = mv->row & SUBPEL_MASK;
-
- src += (mv->row >> SUBPEL_BITS) * src_stride + (mv->col >> SUBPEL_BITS);
scale->sfc->predict[subpel_x != 0][subpel_y != 0][ref](
src, src_stride, dst, dst_stride,
subpix->filter_x[subpel_x], xs,
@@ -70,9 +89,12 @@ void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
is_q4 ? src_mv->col : src_mv->col * 2 };
const struct scale_factors_common *sfc = scale->sfc;
const MV32 mv = sfc->scale_mv(&mv_q4, scale);
+ const int subpel_x = mv.col & SUBPEL_MASK;
+ const int subpel_y = mv.row & SUBPEL_MASK;
+ src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
- inter_predictor(src, src_stride, dst, dst_stride, &mv, scale,
- w, h, ref, subpix, sfc->x_step_q4, sfc->y_step_q4);
+ inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+ scale, w, h, ref, subpix, sfc->x_step_q4, sfc->y_step_q4);
}
static INLINE int round_mv_comp_q4(int value) {
@@ -117,31 +139,18 @@ MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd, const MV *src_mv,
return clamped_mv;
}
-struct build_inter_predictors_args {
- MACROBLOCKD *xd;
- int x, y;
-};
-
-static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
- int pred_w, int pred_h,
- void *argv) {
- const struct build_inter_predictors_args* const arg = argv;
- MACROBLOCKD *const xd = arg->xd;
+// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
+// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
+// sizes smaller than 16x16 yet.
+static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+ int bw, int bh,
+ int x, int y, int w, int h,
+ int mi_x, int mi_y) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- const int bwl = b_width_log2(bsize) - pd->subsampling_x;
- const int bw = 4 << bwl;
- const int bh = plane_block_height(bsize, pd);
- const int x = 4 * (block & ((1 << bwl) - 1));
- const int y = 4 * (block >> bwl);
const MODE_INFO *mi = xd->mi_8x8[0];
const int is_compound = has_second_ref(&mi->mbmi);
int ref;
- assert(x < bw);
- assert(y < bh);
- assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_w == bw);
- assert(mi->mbmi.sb_type < BLOCK_8X8 || 4 << pred_h == bh);
-
for (ref = 0; ref < 1 + is_compound; ++ref) {
struct scale_factors *const scale = &xd->scale_factor[ref];
struct buf_2d *const pre_buf = &pd->pre[ref];
@@ -168,11 +177,11 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
uint8_t *pre;
MV32 scaled_mv;
- int xs, ys;
+ int xs, ys, subpel_x, subpel_y;
if (vp9_is_scaled(scale->sfc)) {
pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale);
- scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x);
+ scale->sfc->set_scaled_offsets(scale, mi_y + y, mi_x + x);
scaled_mv = scale->sfc->scale_mv(&mv_q4, scale);
xs = scale->sfc->x_step_q4;
ys = scale->sfc->y_step_q4;
@@ -182,35 +191,13 @@ static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
scaled_mv.col = mv_q4.col;
xs = ys = 16;
}
+ subpel_x = scaled_mv.col & SUBPEL_MASK;
+ subpel_y = scaled_mv.row & SUBPEL_MASK;
+ pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride
+ + (scaled_mv.col >> SUBPEL_BITS);
inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
- &scaled_mv, scale,
- 4 << pred_w, 4 << pred_h, ref,
- &xd->subpix, xs, ys);
- }
-}
-
-// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
-// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
-// sizes smaller than 16x16 yet.
-typedef void (*foreach_predicted_block_visitor)(int plane, int block,
- BLOCK_SIZE bsize,
- int pred_w, int pred_h,
- void *arg);
-static INLINE void foreach_predicted_block_in_plane(
- const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
- foreach_predicted_block_visitor visit, void *arg) {
- const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
- const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
-
- if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
- int i = 0, x, y;
- assert(bsize == BLOCK_8X8);
- for (y = 0; y < 1 << bhl; ++y)
- for (x = 0; x < 1 << bwl; ++x)
- visit(plane, i++, bsize, 0, 0, arg);
- } else {
- visit(plane, 0, bsize, bwl, bhl, arg);
+ subpel_x, subpel_y, scale, w, h, ref, &xd->subpix, xs, ys);
}
}
@@ -218,12 +205,27 @@ static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
int mi_row, int mi_col,
int plane_from, int plane_to) {
int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
for (plane = plane_from; plane <= plane_to; ++plane) {
- struct build_inter_predictors_args args = {
- xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
- };
- foreach_predicted_block_in_plane(xd, bsize, plane, build_inter_predictors,
- &args);
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+ &xd->plane[plane]);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+
+ if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y)
+ for (x = 0; x < num_4x4_w; ++x)
+ build_inter_predictors(xd, plane, i++, bw, bh,
+ 4 * x, 4 * y, 4, 4, mi_x, mi_y);
+ } else {
+ build_inter_predictors(xd, plane, 0, bw, bh,
+ 0, 0, bw, bh, mi_x, mi_y);
+ }
}
}
@@ -242,12 +244,160 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
MAX_MB_PLANE - 1);
}
+// TODO(jingning): This function serves as a placeholder for decoder prediction
+// using on demand border extension. It should be moved to /decoder/ directory.
+static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+ int bw, int bh,
+ int x, int y, int w, int h,
+ int mi_x, int mi_y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const MODE_INFO *mi = xd->mi_8x8[0];
+ const int is_compound = has_second_ref(&mi->mbmi);
+ int ref;
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ struct scale_factors *const scale = &xd->scale_factor[ref];
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+
+ // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
+ // same MV (the average of the 4 luma MVs) but we could do something
+ // smarter for non-4:2:0. Just punt for now, pending the changes to get
+ // rid of SPLITMV mode entirely.
+ const MV mv = mi->mbmi.sb_type < BLOCK_8X8
+ ? (plane == 0 ? mi->bmi[block].as_mv[ref].as_mv
+ : mi_mv_pred_q4(mi, ref))
+ : mi->mbmi.mv[ref].as_mv;
+
+ // TODO(jkoleszar): This clamping is done in the incorrect place for the
+ // scaling case. It needs to be done on the scaled MV, not the pre-scaling
+ // MV. Note however that it performs the subsampling aware scaling so
+ // that the result is always q4.
+    // The mv precision is MV_PRECISION_Q4.
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+ pd->subsampling_x,
+ pd->subsampling_y);
+
+ MV32 scaled_mv;
+ int xs, ys, x0, y0, x0_16, y0_16, x1, y1, frame_width,
+ frame_height, subpel_x, subpel_y;
+ uint8_t *ref_frame, *buf_ptr;
+ const YV12_BUFFER_CONFIG *ref_buf = xd->ref_buf[ref];
+
+ // Get reference frame pointer, width and height.
+ if (plane == 0) {
+ frame_width = ref_buf->y_crop_width;
+ frame_height = ref_buf->y_crop_height;
+ ref_frame = ref_buf->y_buffer;
+ } else {
+ frame_width = ref_buf->uv_crop_width;
+ frame_height = ref_buf->uv_crop_height;
+ ref_frame = plane == 1 ? ref_buf->u_buffer : ref_buf->v_buffer;
+ }
+
+ // Get block position in current frame.
+ x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+ y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+ // Precision of x0_16 and y0_16 is 1/16th pixel.
+ x0_16 = x0 << SUBPEL_BITS;
+ y0_16 = y0 << SUBPEL_BITS;
+
+ if (vp9_is_scaled(scale->sfc)) {
+ scale->sfc->set_scaled_offsets(scale, mi_y + y, mi_x + x);
+ scaled_mv = scale->sfc->scale_mv(&mv_q4, scale);
+ xs = scale->sfc->x_step_q4;
+ ys = scale->sfc->y_step_q4;
+ // Get block position in the scaled reference frame.
+ x0 = scale->sfc->scale_value_x(x0, scale->sfc);
+ y0 = scale->sfc->scale_value_y(y0, scale->sfc);
+ x0_16 = scale->sfc->scale_value_x(x0_16, scale->sfc);
+ y0_16 = scale->sfc->scale_value_y(y0_16, scale->sfc);
+ } else {
+ scaled_mv.row = mv_q4.row;
+ scaled_mv.col = mv_q4.col;
+ xs = ys = 16;
+ }
+ subpel_x = scaled_mv.col & SUBPEL_MASK;
+ subpel_y = scaled_mv.row & SUBPEL_MASK;
+
+ // Get reference block top left coordinate.
+ x0 += scaled_mv.col >> SUBPEL_BITS;
+ y0 += scaled_mv.row >> SUBPEL_BITS;
+ x0_16 += scaled_mv.col;
+ y0_16 += scaled_mv.row;
+
+ // Get reference block bottom right coordinate.
+ x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
+    y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
+
+ // Get reference block pointer.
+ buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+
+ // Do border extension if there is motion or
+ // width/height is not a multiple of 8 pixels.
+ if (scaled_mv.col || scaled_mv.row ||
+ (frame_width & 0x7) || (frame_height & 0x7)) {
+
+ if (subpel_x) {
+ x0 -= VP9_INTERP_EXTEND - 1;
+ x1 += VP9_INTERP_EXTEND;
+ }
+
+ if (subpel_y) {
+ y0 -= VP9_INTERP_EXTEND - 1;
+ y1 += VP9_INTERP_EXTEND;
+ }
+
+ // Skip border extension if block is inside the frame.
+ if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width ||
+ y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
+ uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
+ // Extend the border.
+ build_mc_border(buf_ptr1, buf_ptr1, pre_buf->stride, x0, y0, x1 - x0,
+ y1 - y0, frame_width, frame_height);
+ }
+ }
+
+ inter_predictor(buf_ptr, pre_buf->stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, scale, w, h, ref, &xd->subpix, xs, ys);
+ }
+}
+
+void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {
+ int plane;
+ const int mi_x = mi_col * MI_SIZE;
+ const int mi_y = mi_row * MI_SIZE;
+ for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+ &xd->plane[plane]);
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+ const int bw = 4 * num_4x4_w;
+ const int bh = 4 * num_4x4_h;
+
+ if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y)
+ for (x = 0; x < num_4x4_w; ++x)
+ dec_build_inter_predictors(xd, plane, i++, bw, bh,
+ 4 * x, 4 * y, 4, 4, mi_x, mi_y);
+ } else {
+ dec_build_inter_predictors(xd, plane, 0, bw, bh,
+ 0, 0, bw, bh, mi_x, mi_y);
+ }
+ }
+}
+
// TODO(dkovalev): find better place for this function
void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
const int ref = cm->active_ref_idx[i];
struct scale_factors *const sf = &cm->active_ref_scale[i];
struct scale_factors_common *const sfc = &cm->active_ref_scale_comm[i];
- if (ref >= NUM_YV12_BUFFERS) {
+ if (ref >= cm->fb_count) {
vp9_zero(*sf);
vp9_zero(*sfc);
} else {
@@ -255,9 +405,6 @@ void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
vp9_setup_scale_factors_for_frame(sf, sfc,
fb->y_crop_width, fb->y_crop_height,
cm->width, cm->height);
-
- if (vp9_is_scaled(sfc))
- vp9_extend_frame_borders(fb, cm->subsampling_x, cm->subsampling_y);
}
}
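build_mc_border() above replicates edge pixels when a prediction block reaches outside the reference frame: the span left of the frame repeats the first column, the span right of it repeats the last column, and the visible middle is copied as-is. Below is a self-contained sketch of the per-row step; extend_row is a hypothetical helper, and the real function also clamps vertically and iterates over the block height.

/* Sketch: extend one reference row to cover a block that overhangs the frame. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void extend_row(const uint8_t *ref_row, uint8_t *dst,
                       int x, int b_w, int w) {
  int left = x < 0 ? -x : 0;
  int right = 0, copy;
  if (left > b_w)
    left = b_w;
  if (x + b_w > w)
    right = x + b_w - w;
  if (right > b_w)
    right = b_w;
  copy = b_w - left - right;
  if (left)
    memset(dst, ref_row[0], left);                      /* replicate left edge */
  if (copy)
    memmove(dst + left, ref_row + x + left, copy);      /* visible pixels */
  if (right)
    memset(dst + left + copy, ref_row[w - 1], right);   /* replicate right edge */
}

int main(void) {
  const uint8_t row[4] = { 10, 20, 30, 40 };  /* a 4-pixel-wide frame row */
  uint8_t out[8];
  int i;
  extend_row(row, out, -2, 8, 4);             /* block starts 2 px left of the frame */
  for (i = 0; i < 8; i++)
    printf("%d ", out[i]);                    /* 10 10 10 20 30 40 40 40 */
  printf("\n");
  return 0;
}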
diff --git a/source/libvpx/vp9/common/vp9_reconinter.h b/source/libvpx/vp9/common/vp9_reconinter.h
index 2c8a6e4..4a302f9 100644
--- a/source/libvpx/vp9/common/vp9_reconinter.h
+++ b/source/libvpx/vp9/common/vp9_reconinter.h
@@ -24,9 +24,8 @@ void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
BLOCK_SIZE bsize);
-void vp9_setup_interp_filters(MACROBLOCKD *xd,
- INTERPOLATION_TYPE filter,
- VP9_COMMON *cm);
+void vp9_dec_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize);
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
diff --git a/source/libvpx/vp9/common/vp9_reconintra.c b/source/libvpx/vp9/common/vp9_reconintra.c
index bd609dc..eb643b0 100644
--- a/source/libvpx/vp9/common/vp9_reconintra.c
+++ b/source/libvpx/vp9/common/vp9_reconintra.c
@@ -369,7 +369,7 @@ static void build_intra_predictors(const uint8_t *ref, int ref_stride,
}
}
-void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
TX_SIZE tx_size, int mode,
const uint8_t *ref, int ref_stride,
uint8_t *dst, int dst_stride) {
diff --git a/source/libvpx/vp9/common/vp9_reconintra.h b/source/libvpx/vp9/common/vp9_reconintra.h
index e9d0dbf..6e3f55c 100644
--- a/source/libvpx/vp9/common/vp9_reconintra.h
+++ b/source/libvpx/vp9/common/vp9_reconintra.h
@@ -14,8 +14,8 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
-void vp9_predict_intra_block(MACROBLOCKD *xd, int block_idx, int bwl_in,
- TX_SIZE tx_size, int mode,
- const uint8_t *ref, int ref_stride,
- uint8_t *dst, int dst_stride);
+void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in,
+ TX_SIZE tx_size, int mode,
+ const uint8_t *ref, int ref_stride,
+ uint8_t *dst, int dst_stride);
#endif // VP9_COMMON_VP9_RECONINTRA_H_
diff --git a/source/libvpx/vp9/common/vp9_rtcd_defs.sh b/source/libvpx/vp9/common/vp9_rtcd_defs.sh
index 5e049c6..727f5c4 100755
--- a/source/libvpx/vp9/common/vp9_rtcd_defs.sh
+++ b/source/libvpx/vp9/common/vp9_rtcd_defs.sh
@@ -14,6 +14,7 @@ struct macroblock;
struct vp9_variance_vtable;
#define DEC_MVCOSTS int *mvjcost, int *mvcost[2]
+struct mv;
union int_mv;
struct yv12_buffer_config;
EOF
@@ -41,7 +42,7 @@ prototype void vp9_d63_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const ui
specialize vp9_d63_predictor_4x4 $ssse3_x86inc
prototype void vp9_h_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_4x4 $ssse3_x86inc
+specialize vp9_h_predictor_4x4 $ssse3_x86inc dspr2
prototype void vp9_d117_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d117_predictor_4x4
@@ -56,10 +57,10 @@ prototype void vp9_v_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint
specialize vp9_v_predictor_4x4 $sse_x86inc
prototype void vp9_tm_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_4x4 $sse_x86inc
+specialize vp9_tm_predictor_4x4 $sse_x86inc dspr2
prototype void vp9_dc_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_4x4 $sse_x86inc
+specialize vp9_dc_predictor_4x4 $sse_x86inc dspr2
prototype void vp9_dc_top_predictor_4x4 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_dc_top_predictor_4x4
@@ -80,7 +81,7 @@ prototype void vp9_d63_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const ui
specialize vp9_d63_predictor_8x8 $ssse3_x86inc
prototype void vp9_h_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_8x8 $ssse3_x86inc
+specialize vp9_h_predictor_8x8 $ssse3_x86inc dspr2
prototype void vp9_d117_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d117_predictor_8x8
@@ -95,10 +96,10 @@ prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint
specialize vp9_v_predictor_8x8 $sse_x86inc
prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_tm_predictor_8x8 $sse2_x86inc
+specialize vp9_tm_predictor_8x8 $sse2_x86inc dspr2
prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_8x8 $sse_x86inc
+specialize vp9_dc_predictor_8x8 $sse_x86inc dspr2
prototype void vp9_dc_top_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_dc_top_predictor_8x8
@@ -119,7 +120,7 @@ prototype void vp9_d63_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_d63_predictor_16x16 $ssse3_x86inc
prototype void vp9_h_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_16x16 $ssse3_x86inc
+specialize vp9_h_predictor_16x16 $ssse3_x86inc dspr2
prototype void vp9_d117_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d117_predictor_16x16
@@ -137,7 +138,7 @@ prototype void vp9_tm_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const u
specialize vp9_tm_predictor_16x16 $sse2_x86inc
prototype void vp9_dc_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_dc_predictor_16x16 $sse2_x86inc
+specialize vp9_dc_predictor_16x16 $sse2_x86inc dspr2
prototype void vp9_dc_top_predictor_16x16 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_dc_top_predictor_16x16
@@ -158,7 +159,7 @@ prototype void vp9_d63_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const
specialize vp9_d63_predictor_32x32 $ssse3_x86inc
prototype void vp9_h_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
-specialize vp9_h_predictor_32x32 $ssse3 x86inc
+specialize vp9_h_predictor_32x32 $ssse3_x86inc
prototype void vp9_d117_predictor_32x32 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"
specialize vp9_d117_predictor_32x32
@@ -191,22 +192,37 @@ specialize vp9_dc_128_predictor_32x32
# Loopfilter
#
prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
-specialize vp9_mb_lpf_vertical_edge_w sse2 neon
+specialize vp9_mb_lpf_vertical_edge_w sse2 neon dspr2
+
+prototype void vp9_mb_lpf_vertical_edge_w_16 "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh"
+specialize vp9_mb_lpf_vertical_edge_w_16 sse2 neon dspr2
prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mbloop_filter_vertical_edge sse2 neon
+specialize vp9_mbloop_filter_vertical_edge sse2 neon dspr2
+
+prototype void vp9_mbloop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_mbloop_filter_vertical_edge_16 sse2 neon dspr2
prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_vertical_edge mmx neon
+specialize vp9_loop_filter_vertical_edge mmx neon dspr2
+
+prototype void vp9_loop_filter_vertical_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_loop_filter_vertical_edge_16 sse2 neon dspr2
prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mb_lpf_horizontal_edge_w sse2 neon
+specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2
prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_mbloop_filter_horizontal_edge sse2 neon
+specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2
+
+prototype void vp9_mbloop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon dspr2
prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
-specialize vp9_loop_filter_horizontal_edge mmx neon
+specialize vp9_loop_filter_horizontal_edge mmx neon dspr2
+
+prototype void vp9_loop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_loop_filter_horizontal_edge_16 sse2 neon dspr2
#
# post proc
@@ -296,10 +312,11 @@ prototype void vp9_idct32x32_1024_add "const int16_t *input, uint8_t *dest, int
specialize vp9_idct32x32_1024_add sse2 neon dspr2
prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_34_add sse2
+specialize vp9_idct32x32_34_add sse2 neon dspr2
+vp9_idct32x32_34_add_neon=vp9_idct32x32_1024_add_neon
prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_1_add sse2 dspr2
+specialize vp9_idct32x32_1_add sse2 neon dspr2
prototype void vp9_iht4x4_16_add "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_iht4x4_16_add sse2 neon dspr2
@@ -691,48 +708,51 @@ fi
# fdct functions
prototype void vp9_short_fht4x4 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht4x4 sse2
+specialize vp9_short_fht4x4 sse2 avx2
prototype void vp9_short_fht8x8 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht8x8 sse2
+specialize vp9_short_fht8x8 sse2 avx2
prototype void vp9_short_fht16x16 "const int16_t *input, int16_t *output, int stride, int tx_type"
-specialize vp9_short_fht16x16 sse2
+specialize vp9_short_fht16x16 sse2 avx2
prototype void vp9_fwht4x4 "const int16_t *input, int16_t *output, int stride"
specialize vp9_fwht4x4
prototype void vp9_fdct4x4 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct4x4 sse2
+specialize vp9_fdct4x4 sse2 avx2
prototype void vp9_fdct8x8 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct8x8 sse2
+specialize vp9_fdct8x8 sse2 avx2
prototype void vp9_fdct16x16 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct16x16 sse2
+specialize vp9_fdct16x16 sse2 avx2
prototype void vp9_fdct32x32 "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32 sse2
+specialize vp9_fdct32x32 sse2 avx2
prototype void vp9_fdct32x32_rd "const int16_t *input, int16_t *output, int stride"
-specialize vp9_fdct32x32_rd sse2
+specialize vp9_fdct32x32_rd sse2 avx2
#
# Motion search
#
-prototype int vp9_full_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n"
+prototype int vp9_full_search_sad "struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv, int n"
specialize vp9_full_search_sad sse3 sse4_1
vp9_full_search_sad_sse3=vp9_full_search_sadx3
vp9_full_search_sad_sse4_1=vp9_full_search_sadx8
-prototype int vp9_refining_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
+prototype int vp9_refining_search_sad "struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
specialize vp9_refining_search_sad sse3
vp9_refining_search_sad_sse3=vp9_refining_search_sadx4
-prototype int vp9_diamond_search_sad "struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv"
+prototype int vp9_diamond_search_sad "struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
specialize vp9_diamond_search_sad sse3
vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4
+prototype int vp9_full_range_search "struct macroblock *x, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv"
+specialize vp9_full_range_search
+
prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count"
specialize vp9_temporal_filter_apply sse2
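
The prototype/specialize lines above feed libvpx's run-time CPU dispatch (RTCD): the build generates one function pointer per prototype and, at init time, points it at the most capable specialized version the host CPU supports. Below is a minimal sketch of that pattern; the names (my_fdct4x4_*, has_sse2, has_avx2, my_rtcd_init) are hypothetical stand-ins, not the generated vp9_rtcd.h API.

/* Sketch of the RTCD dispatch pattern implied by the prototype/specialize
 * lines above. All names here are illustrative stand-ins. */
#include <stdint.h>
#include <stdio.h>

static void my_fdct4x4_c(const int16_t *in, int16_t *out, int stride)    { puts("C");    (void)in; (void)out; (void)stride; }
static void my_fdct4x4_sse2(const int16_t *in, int16_t *out, int stride) { puts("sse2"); (void)in; (void)out; (void)stride; }
static void my_fdct4x4_avx2(const int16_t *in, int16_t *out, int stride) { puts("avx2"); (void)in; (void)out; (void)stride; }

/* One function pointer per 'prototype' line, defaulting to the C version. */
static void (*my_fdct4x4)(const int16_t *in, int16_t *out, int stride) = my_fdct4x4_c;

static void my_rtcd_init(int has_sse2, int has_avx2) {
  /* 'specialize vp9_fdct4x4 sse2 avx2' selects the last supported entry. */
  my_fdct4x4 = my_fdct4x4_c;
  if (has_sse2) my_fdct4x4 = my_fdct4x4_sse2;
  if (has_avx2) my_fdct4x4 = my_fdct4x4_avx2;
}

int main(void) {
  int16_t in[16] = { 0 }, out[16];
  my_rtcd_init(1, 0);      /* pretend the CPU has SSE2 but not AVX2 */
  my_fdct4x4(in, out, 4);  /* dispatches to the SSE2 version */
  return 0;
}

Adding 'avx2' or 'dspr2' to a specialize line, as the hunks above do, simply adds one more candidate to this selection chain.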
diff --git a/source/libvpx/vp9/common/vp9_scan.c b/source/libvpx/vp9/common/vp9_scan.c
index f17da91..1ec5a0c 100644
--- a/source/libvpx/vp9/common/vp9_scan.c
+++ b/source/libvpx/vp9/common/vp9_scan.c
@@ -12,28 +12,28 @@
#include "vp9/common/vp9_scan.h"
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = {
0, 4, 1, 5,
8, 2, 12, 9,
3, 6, 13, 10,
7, 14, 11, 15,
};
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = {
0, 4, 8, 1,
12, 5, 9, 2,
13, 6, 10, 3,
7, 14, 11, 15,
};
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]) = {
+DECLARE_ALIGNED(16, static const int16_t, row_scan_4x4[16]) = {
0, 1, 4, 2,
5, 3, 6, 8,
9, 7, 12, 10,
13, 11, 14, 15,
};
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
0, 8, 1, 16, 9, 2, 17, 24,
10, 3, 18, 25, 32, 11, 4, 26,
33, 19, 40, 12, 34, 27, 5, 41,
@@ -44,7 +44,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]) = {
46, 39, 61, 54, 47, 62, 55, 63,
};
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = {
0, 8, 16, 1, 24, 9, 32, 17,
2, 40, 25, 10, 33, 18, 48, 3,
26, 41, 11, 56, 19, 34, 4, 49,
@@ -55,7 +55,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]) = {
31, 61, 39, 54, 47, 62, 55, 63,
};
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
+DECLARE_ALIGNED(16, static const int16_t, row_scan_8x8[64]) = {
0, 1, 2, 8, 9, 3, 16, 10,
4, 17, 11, 24, 5, 18, 25, 12,
19, 26, 32, 6, 13, 20, 33, 27,
@@ -66,7 +66,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]) = {
60, 39, 61, 47, 54, 55, 62, 63,
};
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = {
0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80,
50, 4, 35, 66, 20, 81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52,
98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69,
@@ -87,7 +87,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]) = {
255,
};
-DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = {
0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21,
@@ -108,7 +108,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]) = {
255,
};
-DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
+DECLARE_ALIGNED(16, static const int16_t, row_scan_16x16[256]) = {
0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20,
49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52,
23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69,
@@ -130,7 +130,7 @@ DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]) = {
255,
};
-DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
68, 131, 37, 100,
@@ -233,38 +233,68 @@ DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]) = {
// in {top, left, topleft, topright, bottomleft} order
// for each position in raster scan order.
// -1 indicates the neighbor does not exist.
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+ default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+ col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+ row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+ col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+ row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+ default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+ col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+ row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+ default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t,
+ default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
+DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_4x4[16]);
+DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_4x4[16]);
+DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_4x4[16]);
+DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_8x8[64]);
+DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_8x8[64]);
+DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_8x8[64]);
+DECLARE_ALIGNED(16, static int16_t, vp9_col_iscan_16x16[256]);
+DECLARE_ALIGNED(16, static int16_t, vp9_row_iscan_16x16[256]);
+DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_16x16[256]);
+DECLARE_ALIGNED(16, static int16_t, vp9_default_iscan_32x32[1024]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
+const scan_order vp9_default_scan_orders[TX_SIZES] = {
+ {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors},
+ {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors},
+ {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors},
+ {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+};
+
+const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = {
+ { // TX_4X4
+ {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors},
+ {row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors},
+ {col_scan_4x4, vp9_col_iscan_4x4, col_scan_4x4_neighbors},
+ {default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors}
+ }, { // TX_8X8
+ {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors},
+ {row_scan_8x8, vp9_row_iscan_8x8, row_scan_8x8_neighbors},
+ {col_scan_8x8, vp9_col_iscan_8x8, col_scan_8x8_neighbors},
+ {default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors}
+ }, { // TX_16X16
+ {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors},
+ {row_scan_16x16, vp9_row_iscan_16x16, row_scan_16x16_neighbors},
+ {col_scan_16x16, vp9_col_iscan_16x16, col_scan_16x16_neighbors},
+ {default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors}
+ }, { // TX_32X32
+ {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+ {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+ {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+ {default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors},
+ }
+};
static int find_in_scan(const int16_t *scan, int l, int idx) {
int n, l2 = l * l;
@@ -276,9 +306,9 @@ static int find_in_scan(const int16_t *scan, int l, int idx) {
assert(0);
return -1;
}
-static void init_scan_neighbors(const int16_t *scan,
- int16_t *iscan,
- int l, int16_t *neighbors) {
+
+static void init_scan_neighbors(const int16_t *scan, int16_t *iscan, int l,
+ int16_t *neighbors) {
int l2 = l * l;
int n, i, j;
@@ -302,15 +332,15 @@ static void init_scan_neighbors(const int16_t *scan,
// use the combination of the two as a context.
int a = (i - 1) * l + j;
int b = i * l + j - 1;
- if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 ||
- scan == vp9_col_scan_16x16) {
+ if (scan == col_scan_4x4 || scan == col_scan_8x8 ||
+ scan == col_scan_16x16) {
// in the col/row scan cases (as well as left/top edge cases), we set
// both contexts to the same value, so we can branchlessly do a+b+1>>1
// which automatically becomes a if a == b
neighbors[MAX_NEIGHBORS * n + 0] =
neighbors[MAX_NEIGHBORS * n + 1] = a;
- } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 ||
- scan == vp9_row_scan_16x16) {
+ } else if (scan == row_scan_4x4 || scan == row_scan_8x8 ||
+ scan == row_scan_16x16) {
neighbors[MAX_NEIGHBORS * n + 0] =
neighbors[MAX_NEIGHBORS * n + 1] = b;
} else {
@@ -334,24 +364,24 @@ static void init_scan_neighbors(const int16_t *scan,
}
void vp9_init_neighbors() {
- init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4,
- vp9_default_scan_4x4_neighbors);
- init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4,
- vp9_row_scan_4x4_neighbors);
- init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4,
- vp9_col_scan_4x4_neighbors);
- init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8,
- vp9_default_scan_8x8_neighbors);
- init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8,
- vp9_row_scan_8x8_neighbors);
- init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8,
- vp9_col_scan_8x8_neighbors);
- init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16,
- vp9_default_scan_16x16_neighbors);
- init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16,
- vp9_row_scan_16x16_neighbors);
- init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16,
- vp9_col_scan_16x16_neighbors);
- init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32,
- vp9_default_scan_32x32_neighbors);
+ init_scan_neighbors(default_scan_4x4, vp9_default_iscan_4x4, 4,
+ default_scan_4x4_neighbors);
+ init_scan_neighbors(row_scan_4x4, vp9_row_iscan_4x4, 4,
+ row_scan_4x4_neighbors);
+ init_scan_neighbors(col_scan_4x4, vp9_col_iscan_4x4, 4,
+ col_scan_4x4_neighbors);
+ init_scan_neighbors(default_scan_8x8, vp9_default_iscan_8x8, 8,
+ default_scan_8x8_neighbors);
+ init_scan_neighbors(row_scan_8x8, vp9_row_iscan_8x8, 8,
+ row_scan_8x8_neighbors);
+ init_scan_neighbors(col_scan_8x8, vp9_col_iscan_8x8, 8,
+ col_scan_8x8_neighbors);
+ init_scan_neighbors(default_scan_16x16, vp9_default_iscan_16x16, 16,
+ default_scan_16x16_neighbors);
+ init_scan_neighbors(row_scan_16x16, vp9_row_iscan_16x16, 16,
+ row_scan_16x16_neighbors);
+ init_scan_neighbors(col_scan_16x16, vp9_col_iscan_16x16, 16,
+ col_scan_16x16_neighbors);
+ init_scan_neighbors(default_scan_32x32, vp9_default_iscan_32x32, 32,
+ default_scan_32x32_neighbors);
}
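
The reworked vp9_init_neighbors() above now consumes the file-local scan tables and fills both the iscan and neighbor tables. A toy rebuild of the inverse-scan relationship those iscan tables encode (iscan[scan[i]] == i), using the 4x4 default scan copied from the table above; this is an illustration, not the library's init code.

#include <assert.h>
#include <stdint.h>

static const int16_t default_scan_4x4[16] = {
  0, 4, 1, 5,
  8, 2, 12, 9,
  3, 6, 13, 10,
  7, 14, 11, 15,
};

int main(void) {
  int16_t iscan[16];
  int i;
  /* iscan maps a raster coefficient position back to its scan index. */
  for (i = 0; i < 16; ++i)
    iscan[default_scan_4x4[i]] = (int16_t)i;
  for (i = 0; i < 16; ++i)
    assert(iscan[default_scan_4x4[i]] == i);  /* inverse property */
  return 0;
}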
diff --git a/source/libvpx/vp9/common/vp9_scan.h b/source/libvpx/vp9/common/vp9_scan.h
index a5c8463..efab48b 100644
--- a/source/libvpx/vp9/common/vp9_scan.h
+++ b/source/libvpx/vp9/common/vp9_scan.h
@@ -15,184 +15,23 @@
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_blockd.h"
#define MAX_NEIGHBORS 2
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_4x4[16]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_8x8[64]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_col_scan_16x16[256]);
-extern DECLARE_ALIGNED(16, const int16_t, vp9_row_scan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, const int16_t, vp9_default_scan_32x32[1024]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_4x4[16]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_8x8[64]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_8x8[64]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_8x8[64]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_16x16[256]);
-extern DECLARE_ALIGNED(16, int16_t, vp9_row_iscan_16x16[256]);
-
-extern DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
-
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]);
-extern DECLARE_ALIGNED(16, int16_t,
- vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
-
-
void vp9_init_neighbors();
-static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_4x4;
- case DCT_ADST:
- return vp9_col_scan_4x4;
- default:
- return vp9_default_scan_4x4;
- }
-}
-
-static INLINE void get_scan_nb_4x4(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_4x4;
- *nb = vp9_row_scan_4x4_neighbors;
- break;
- case DCT_ADST:
- *scan = vp9_col_scan_4x4;
- *nb = vp9_col_scan_4x4_neighbors;
- break;
- default:
- *scan = vp9_default_scan_4x4;
- *nb = vp9_default_scan_4x4_neighbors;
- break;
- }
-}
-
-static INLINE const int16_t* get_iscan_4x4(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_4x4;
- case DCT_ADST:
- return vp9_col_iscan_4x4;
- default:
- return vp9_default_iscan_4x4;
- }
-}
-
-static INLINE const int16_t* get_scan_8x8(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_8x8;
- case DCT_ADST:
- return vp9_col_scan_8x8;
- default:
- return vp9_default_scan_8x8;
- }
-}
+typedef struct {
+ const int16_t *scan;
+ const int16_t *iscan;
+ const int16_t *neighbors;
+} scan_order;
-static INLINE void get_scan_nb_8x8(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_8x8;
- *nb = vp9_row_scan_8x8_neighbors;
- break;
- case DCT_ADST:
- *scan = vp9_col_scan_8x8;
- *nb = vp9_col_scan_8x8_neighbors;
- break;
- default:
- *scan = vp9_default_scan_8x8;
- *nb = vp9_default_scan_8x8_neighbors;
- break;
- }
-}
-
-static INLINE const int16_t* get_iscan_8x8(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_8x8;
- case DCT_ADST:
- return vp9_col_iscan_8x8;
- default:
- return vp9_default_iscan_8x8;
- }
-}
-
-static INLINE const int16_t* get_scan_16x16(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_scan_16x16;
- case DCT_ADST:
- return vp9_col_scan_16x16;
- default:
- return vp9_default_scan_16x16;
- }
-}
-
-static INLINE void get_scan_nb_16x16(TX_TYPE tx_type,
- const int16_t **scan, const int16_t **nb) {
- switch (tx_type) {
- case ADST_DCT:
- *scan = vp9_row_scan_16x16;
- *nb = vp9_row_scan_16x16_neighbors;
- break;
- case DCT_ADST:
- *scan = vp9_col_scan_16x16;
- *nb = vp9_col_scan_16x16_neighbors;
- break;
- default:
- *scan = vp9_default_scan_16x16;
- *nb = vp9_default_scan_16x16_neighbors;
- break;
- }
-}
-
-static INLINE const int16_t* get_iscan_16x16(TX_TYPE tx_type) {
- switch (tx_type) {
- case ADST_DCT:
- return vp9_row_iscan_16x16;
- case DCT_ADST:
- return vp9_col_iscan_16x16;
- default:
- return vp9_default_iscan_16x16;
- }
-}
+extern const scan_order vp9_default_scan_orders[TX_SIZES];
+extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES];
static INLINE int get_coef_context(const int16_t *neighbors,
- uint8_t *token_cache,
- int c) {
+ const uint8_t *token_cache, int c) {
return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}
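
vp9_scan.h now exposes a single scan_order table indexed by transform size and type in place of the removed get_scan_*/get_iscan_* switch helpers, alongside get_coef_context(). The sketch below shows the new lookup style and the context arithmetic; the toy tables and the TOY_* enum values are illustrative stand-ins, not the codec's TX_SIZES/TX_TYPES definitions.

#include <stdint.h>
#include <stdio.h>

#define MAX_NEIGHBORS 2

typedef struct {
  const int16_t *scan;
  const int16_t *iscan;
  const int16_t *neighbors;
} scan_order;

enum { TOY_DCT_DCT = 0, TOY_ADST_DCT = 1 };  /* stand-ins for TX_TYPES */

static const int16_t toy_scan[4]  = { 0, 2, 1, 3 };
static const int16_t toy_iscan[4] = { 0, 2, 1, 3 };
static const int16_t toy_neighbors[(4 + 1) * MAX_NEIGHBORS] = { 0 };

static const scan_order toy_scan_orders[2] = {
  { toy_scan, toy_iscan, toy_neighbors },
  { toy_scan, toy_iscan, toy_neighbors },
};

/* Same arithmetic as get_coef_context() above. */
static int coef_context(const int16_t *neighbors, const uint8_t *token_cache,
                        int c) {
  return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
          token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
}

int main(void) {
  const scan_order *so = &toy_scan_orders[TOY_ADST_DCT];  /* plain table lookup */
  const uint8_t token_cache[4] = { 2, 1, 0, 0 };
  /* Both neighbors of position 1 are position 0 here, so the context is
   * (1 + 2 + 2) >> 1 = 2. */
  printf("ctx = %d\n", coef_context(so->neighbors, token_cache, 1));
  return 0;
}

The table form keeps the lookup branch-free and lets callers treat every transform size uniformly, which is presumably why the per-size switch helpers were dropped.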
diff --git a/source/libvpx/vp9/common/vp9_tile_common.c b/source/libvpx/vp9/common/vp9_tile_common.c
index e3035d0..78909dd 100644
--- a/source/libvpx/vp9/common/vp9_tile_common.c
+++ b/source/libvpx/vp9/common/vp9_tile_common.c
@@ -15,46 +15,37 @@
#define MIN_TILE_WIDTH_B64 4
#define MAX_TILE_WIDTH_B64 64
-static int to_sbs(n_mis) {
- return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2;
+static int get_tile_offset(int idx, int mis, int log2) {
+ const int sb_cols = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2;
+ const int offset = ((idx * sb_cols) >> log2) << MI_BLOCK_SIZE_LOG2;
+ return MIN(offset, mis);
}
-static void get_tile_offsets(int *min_tile_off, int *max_tile_off,
- int tile_idx, int log2_n_tiles, int n_mis) {
- const int n_sbs = to_sbs(n_mis);
- const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles;
- const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
-
- *min_tile_off = MIN(sb_off1 << 3, n_mis);
- *max_tile_off = MIN(sb_off2 << 3, n_mis);
-}
-
-void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm,
- int row_idx, int col_idx) {
- get_tile_offsets(&tile->mi_row_start, &tile->mi_row_end,
- row_idx, cm->log2_tile_rows, cm->mi_rows);
- get_tile_offsets(&tile->mi_col_start, &tile->mi_col_end,
- col_idx, cm->log2_tile_cols, cm->mi_cols);
+void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
+ tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows);
+ tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows);
+ tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols);
+ tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols);
}
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols) {
- const int sb_cols = to_sbs(mi_cols);
- int min_log2_n_tiles, max_log2_n_tiles;
+ const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
+ int min_log2 = 0, max_log2 = 0;
- for (max_log2_n_tiles = 0;
- (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_B64;
- max_log2_n_tiles++) {}
- max_log2_n_tiles--;
- if (max_log2_n_tiles < 0)
- max_log2_n_tiles = 0;
+ // max
+ while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
+ ++max_log2;
+ --max_log2;
+ if (max_log2 < 0)
+ max_log2 = 0;
- for (min_log2_n_tiles = 0;
- (MAX_TILE_WIDTH_B64 << min_log2_n_tiles) < sb_cols;
- min_log2_n_tiles++) {}
+ // min
+ while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols)
+ ++min_log2;
- assert(min_log2_n_tiles <= max_log2_n_tiles);
+ assert(min_log2 <= max_log2);
- *min_log2_tile_cols = min_log2_n_tiles;
- *max_log2_tile_cols = max_log2_n_tiles;
+ *min_log2_tile_cols = min_log2;
+ *max_log2_tile_cols = max_log2;
}
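
A worked example of the new get_tile_offset() above, assuming MI_BLOCK_SIZE_LOG2 == 3 (one 64x64 superblock spans 8 mi units) and a simplified superblock-alignment helper. Splitting 100 mi columns into two tiles yields the boundaries 0, 48 and 100.

#include <stdio.h>

#define MI_BLOCK_SIZE_LOG2 3

static int mi_cols_aligned_to_sb(int n_mis) {  /* simplified stand-in */
  return (n_mis + 7) & ~7;
}

static int min_int(int a, int b) { return a < b ? a : b; }

static int get_tile_offset(int idx, int mis, int log2) {
  const int sb_cols = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2;
  const int offset = ((idx * sb_cols) >> log2) << MI_BLOCK_SIZE_LOG2;
  return min_int(offset, mis);
}

int main(void) {
  const int mi_cols = 100, log2_tiles = 1;
  int t;
  for (t = 0; t <= (1 << log2_tiles); ++t)  /* prints 0, 48, 100 */
    printf("tile boundary %d: mi col %d\n", t,
           get_tile_offset(t, mi_cols, log2_tiles));
  return 0;
}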
diff --git a/source/libvpx/vp9/common/vp9_tile_common.h b/source/libvpx/vp9/common/vp9_tile_common.h
index a110abb..a09876e 100644
--- a/source/libvpx/vp9/common/vp9_tile_common.h
+++ b/source/libvpx/vp9/common/vp9_tile_common.h
@@ -18,10 +18,10 @@ typedef struct TileInfo {
int mi_col_start, mi_col_end;
} TileInfo;
-// initializes 'tile->mi_(row|col)_(start|end)' for (row_idx, col_idx) based on
+// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on
// 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)'
void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm,
- int row_idx, int col_idx);
+ int row, int col);
void vp9_get_tile_n_bits(int mi_cols,
int *min_log2_tile_cols, int *max_log2_tile_cols);
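
And a sketch of the min/max tile-column computation declared here (the loop bodies mirror vp9_get_tile_n_bits() above), run for a frame with 30 superblock columns, i.e. roughly 1920 pixels wide assuming 8x8-pixel mi units.

#include <assert.h>
#include <stdio.h>

#define MIN_TILE_WIDTH_B64 4
#define MAX_TILE_WIDTH_B64 64

static void tile_n_bits(int sb_cols, int *min_log2_out, int *max_log2_out) {
  int min_log2 = 0, max_log2 = 0;
  /* max: largest power-of-two split that still leaves tiles >= 4 SBs wide */
  while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
    ++max_log2;
  --max_log2;
  if (max_log2 < 0)
    max_log2 = 0;
  /* min: smallest split keeping every tile <= 64 SBs wide */
  while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols)
    ++min_log2;
  assert(min_log2 <= max_log2);
  *min_log2_out = min_log2;
  *max_log2_out = max_log2;
}

int main(void) {
  int min_log2, max_log2;
  tile_n_bits(30, &min_log2, &max_log2);
  printf("tiles: %d .. %d\n", 1 << min_log2, 1 << max_log2);  /* 1 .. 4 */
  return 0;
}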
diff --git a/source/libvpx/vp9/common/vp9_treecoder.c b/source/libvpx/vp9/common/vp9_treecoder.c
deleted file mode 100644
index da1213d..0000000
--- a/source/libvpx/vp9/common/vp9_treecoder.c
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <assert.h>
-
-#include "./vpx_config.h"
-#include "vp9/common/vp9_treecoder.h"
-
-static void tree2tok(struct vp9_token *const p, vp9_tree t,
- int i, int v, int l) {
- v += v;
- ++l;
-
- do {
- const vp9_tree_index j = t[i++];
-
- if (j <= 0) {
- p[-j].value = v;
- p[-j].len = l;
- } else {
- tree2tok(p, t, j, v, l);
- }
- } while (++v & 1);
-}
-
-void vp9_tokens_from_tree(struct vp9_token *p, vp9_tree t) {
- tree2tok(p, t, 0, 0, 0);
-}
-
-void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t,
- int offset) {
- tree2tok(p - offset, t, 0, 0, 0);
-}
-
-static unsigned int convert_distribution(unsigned int i,
- vp9_tree tree,
- vp9_prob probs[],
- unsigned int branch_ct[][2],
- const unsigned int num_events[],
- unsigned int tok0_offset) {
- unsigned int left, right;
-
- if (tree[i] <= 0) {
- left = num_events[-tree[i] - tok0_offset];
- } else {
- left = convert_distribution(tree[i], tree, probs, branch_ct,
- num_events, tok0_offset);
- }
- if (tree[i + 1] <= 0)
- right = num_events[-tree[i + 1] - tok0_offset];
- else
- right = convert_distribution(tree[i + 1], tree, probs, branch_ct,
- num_events, tok0_offset);
-
- probs[i>>1] = get_binary_prob(left, right);
- branch_ct[i>>1][0] = left;
- branch_ct[i>>1][1] = right;
- return left + right;
-}
-
-void vp9_tree_probs_from_distribution(vp9_tree tree, vp9_prob probs[/* n-1 */],
- unsigned int branch_ct[/* n-1 */][2],
- const unsigned int num_events[/* n */],
- unsigned int tok0_offset) {
- convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset);
-}
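
The removed file's tree2tok() relies on the vp9_tree convention that survives in vp9_treecoder.h: entries <= 0 are negated token ids (leaves), entries > 0 are offsets of the next left/right pair. The toy three-symbol tree below is run through the same recursion; the tree and token names are made up for illustration.

#include <stdint.h>
#include <stdio.h>

typedef int8_t vp9_tree_index;

struct toy_token { int value; int len; };

/* symbols: 0 = "A", 1 = "B", 2 = "C" */
static const vp9_tree_index toy_tree[4] = { 0, 2, -1, -2 };

static void toy_tree2tok(struct toy_token *p, const vp9_tree_index *t,
                         int i, int v, int l) {
  v += v;
  ++l;
  do {
    const vp9_tree_index j = t[i++];
    if (j <= 0) {
      p[-j].value = v;  /* binary code for this leaf */
      p[-j].len = l;    /* code length in bits */
    } else {
      toy_tree2tok(p, t, j, v, l);
    }
  } while (++v & 1);
}

int main(void) {
  struct toy_token tok[3];
  int s;
  toy_tree2tok(tok, toy_tree, 0, 0, 0);
  for (s = 0; s < 3; ++s)  /* prints A: 0/1, B: 2/2, C: 3/2 */
    printf("symbol %d -> value %d, len %d\n", s, tok[s].value, tok[s].len);
  return 0;
}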
diff --git a/source/libvpx/vp9/common/vp9_treecoder.h b/source/libvpx/vp9/common/vp9_treecoder.h
index 4ba171f..ed8c74a 100644
--- a/source/libvpx/vp9/common/vp9_treecoder.h
+++ b/source/libvpx/vp9/common/vp9_treecoder.h
@@ -34,27 +34,11 @@ typedef int8_t vp9_tree_index;
typedef const vp9_tree_index vp9_tree[];
-struct vp9_token {
- int value;
- int len;
-};
-
-/* Construct encoding array from tree. */
-
-void vp9_tokens_from_tree(struct vp9_token*, vp9_tree);
-void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset);
-
/* Convert array of token occurrence counts into a table of probabilities
for the associated binary encoding tree. Also writes count of branches
taken for each node on the tree; this facilitiates decisions as to
probability updates. */
-void vp9_tree_probs_from_distribution(vp9_tree tree,
- vp9_prob probs[ /* n - 1 */ ],
- unsigned int branch_ct[ /* n - 1 */ ][2],
- const unsigned int num_events[ /* n */ ],
- unsigned int tok0_offset);
-
static INLINE vp9_prob clip_prob(int p) {
return (p > 255) ? 255u : (p < 1) ? 1u : p;
}
@@ -81,21 +65,46 @@ static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) {
return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8);
}
-static INLINE vp9_prob merge_probs(vp9_prob pre_prob, vp9_prob prob,
+static INLINE vp9_prob merge_probs(vp9_prob pre_prob,
const unsigned int ct[2],
unsigned int count_sat,
unsigned int max_update_factor) {
+ const vp9_prob prob = get_binary_prob(ct[0], ct[1]);
const unsigned int count = MIN(ct[0] + ct[1], count_sat);
const unsigned int factor = max_update_factor * count / count_sat;
return weighted_prob(pre_prob, prob, factor);
}
-static INLINE vp9_prob merge_probs2(vp9_prob pre_prob,
- const unsigned int ct[2],
- unsigned int count_sat,
- unsigned int max_update_factor) {
- return merge_probs(pre_prob, get_binary_prob(ct[0], ct[1]), ct, count_sat,
- max_update_factor);
+static unsigned int tree_merge_probs_impl(unsigned int i,
+ const vp9_tree_index *tree,
+ const vp9_prob *pre_probs,
+ const unsigned int *counts,
+ unsigned int count_sat,
+ unsigned int max_update_factor,
+ vp9_prob *probs) {
+ const int l = tree[i];
+ const unsigned int left_count = (l <= 0)
+ ? counts[-l]
+ : tree_merge_probs_impl(l, tree, pre_probs, counts,
+ count_sat, max_update_factor, probs);
+ const int r = tree[i + 1];
+ const unsigned int right_count = (r <= 0)
+ ? counts[-r]
+ : tree_merge_probs_impl(r, tree, pre_probs, counts,
+ count_sat, max_update_factor, probs);
+ const unsigned int ct[2] = { left_count, right_count };
+ probs[i >> 1] = merge_probs(pre_probs[i >> 1], ct,
+ count_sat, max_update_factor);
+ return left_count + right_count;
+}
+
+static void tree_merge_probs(const vp9_tree_index *tree,
+ const vp9_prob *pre_probs,
+ const unsigned int *counts,
+ unsigned int count_sat,
+ unsigned int max_update_factor, vp9_prob *probs) {
+ tree_merge_probs_impl(0, tree, pre_probs, counts,
+ count_sat, max_update_factor, probs);
}
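
merge_probs() above now derives the observed probability internally from the branch counts and blends it with the prior using a count-scaled factor. A worked numeric sketch follows; get_binary_prob() and ROUND_POWER_OF_TWO() are not part of this hunk, so simplified stand-ins are defined here under the assumption that get_binary_prob(a, b) is roughly a * 256 / (a + b), clipped to [1, 255].

#include <stdint.h>
#include <stdio.h>

typedef uint8_t vp9_prob;

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static vp9_prob clip_prob(int p) {
  return (p > 255) ? 255 : (p < 1) ? 1 : (vp9_prob)p;
}

/* simplified stand-in for get_binary_prob() */
static vp9_prob binary_prob(unsigned int n0, unsigned int n1) {
  const unsigned int den = n0 + n1;
  return den == 0 ? 128 : clip_prob((int)((n0 * 256 + (den >> 1)) / den));
}

static unsigned int min_u(unsigned int a, unsigned int b) { return a < b ? a : b; }

static vp9_prob toy_merge_probs(vp9_prob pre_prob, const unsigned int ct[2],
                                unsigned int count_sat,
                                unsigned int max_update_factor) {
  const vp9_prob prob = binary_prob(ct[0], ct[1]);
  const unsigned int count = min_u(ct[0] + ct[1], count_sat);
  const unsigned int factor = max_update_factor * count / count_sat;
  return ROUND_POWER_OF_TWO(pre_prob * (256 - factor) + prob * factor, 8);
}

int main(void) {
  const unsigned int ct[2] = { 30, 10 };  /* 30 "zero" branches, 10 "one" */
  /* observed prob = 192, count = min(40, 20) = 20, factor = 128,
   * blended result = (128*128 + 192*128 + 128) >> 8 = 160 */
  printf("%d\n", toy_merge_probs(128, ct, 20, 128));
  return 0;
}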
diff --git a/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
index ccf5aac..947c0ba 100644
--- a/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/source/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -15,6 +15,16 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
+#define RECON_AND_STORE4X4(dest, in_x) \
+{ \
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ d0 = _mm_add_epi16(in_x, d0); \
+ d0 = _mm_packus_epi16(d0, d0); \
+ *(int *)dest = _mm_cvtsi128_si32(d0); \
+ dest += stride; \
+}
+
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
@@ -26,21 +36,19 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
__m128i input0, input1, input2, input3;
// Rows
- input0 = _mm_loadl_epi64((const __m128i *)input);
- input1 = _mm_loadl_epi64((const __m128i *)(input + 4));
- input2 = _mm_loadl_epi64((const __m128i *)(input + 8));
- input3 = _mm_loadl_epi64((const __m128i *)(input + 12));
+ input0 = _mm_load_si128((const __m128i *)input);
+ input2 = _mm_load_si128((const __m128i *)(input + 8));
// Construct i3, i1, i3, i1, i2, i0, i2, i0
input0 = _mm_shufflelo_epi16(input0, 0xd8);
- input1 = _mm_shufflelo_epi16(input1, 0xd8);
+ input0 = _mm_shufflehi_epi16(input0, 0xd8);
input2 = _mm_shufflelo_epi16(input2, 0xd8);
- input3 = _mm_shufflelo_epi16(input3, 0xd8);
+ input2 = _mm_shufflehi_epi16(input2, 0xd8);
+ input1 = _mm_unpackhi_epi32(input0, input0);
input0 = _mm_unpacklo_epi32(input0, input0);
- input1 = _mm_unpacklo_epi32(input1, input1);
+ input3 = _mm_unpackhi_epi32(input2, input2);
input2 = _mm_unpacklo_epi32(input2, input2);
- input3 = _mm_unpacklo_epi32(input3, input3);
// Stage 1
input0 = _mm_madd_epi16(input0, cst);
@@ -59,16 +67,14 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
// Stage 2
- input0 = _mm_packs_epi32(input0, zero);
- input1 = _mm_packs_epi32(input1, zero);
- input2 = _mm_packs_epi32(input2, zero);
- input3 = _mm_packs_epi32(input3, zero);
+ input0 = _mm_packs_epi32(input0, input1);
+ input1 = _mm_packs_epi32(input2, input3);
// Transpose
- input1 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpacklo_epi16(input2, input3);
- input0 = _mm_unpacklo_epi32(input1, input3);
- input1 = _mm_unpackhi_epi32(input1, input3);
+ input2 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpackhi_epi16(input0, input1);
+ input0 = _mm_unpacklo_epi32(input2, input3);
+ input1 = _mm_unpackhi_epi32(input2, input3);
// Switch column2, column 3, and then, we got:
// input2: column1, column 0; input3: column2, column 3.
@@ -78,14 +84,9 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
// Columns
// Construct i3, i1, i3, i1, i2, i0, i2, i0
- input0 = _mm_shufflelo_epi16(input2, 0xd8);
- input1 = _mm_shufflehi_epi16(input2, 0xd8);
- input2 = _mm_shufflehi_epi16(input3, 0xd8);
- input3 = _mm_shufflelo_epi16(input3, 0xd8);
-
- input0 = _mm_unpacklo_epi32(input0, input0);
- input1 = _mm_unpackhi_epi32(input1, input1);
- input2 = _mm_unpackhi_epi32(input2, input2);
+ input0 = _mm_unpacklo_epi32(input2, input2);
+ input1 = _mm_unpackhi_epi32(input2, input2);
+ input2 = _mm_unpackhi_epi32(input3, input3);
input3 = _mm_unpacklo_epi32(input3, input3);
// Stage 1
@@ -105,16 +106,14 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
// Stage 2
- input0 = _mm_packs_epi32(input0, zero);
- input1 = _mm_packs_epi32(input1, zero);
- input2 = _mm_packs_epi32(input2, zero);
- input3 = _mm_packs_epi32(input3, zero);
+ input0 = _mm_packs_epi32(input0, input2);
+ input1 = _mm_packs_epi32(input1, input3);
// Transpose
- input1 = _mm_unpacklo_epi16(input0, input1);
- input3 = _mm_unpacklo_epi16(input2, input3);
- input0 = _mm_unpacklo_epi32(input1, input3);
- input1 = _mm_unpackhi_epi32(input1, input3);
+ input2 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpackhi_epi16(input0, input1);
+ input0 = _mm_unpacklo_epi32(input2, input3);
+ input1 = _mm_unpackhi_epi32(input2, input3);
// Switch column2, column 3, and then, we got:
// input2: column1, column 0; input3: column2, column 3.
@@ -129,23 +128,31 @@ void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
input2 = _mm_srai_epi16(input2, 4);
input3 = _mm_srai_epi16(input3, 4);
-#define RECON_AND_STORE4X4(dest, in_x) \
- { \
- __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- d0 = _mm_add_epi16(in_x, d0); \
- d0 = _mm_packus_epi16(d0, d0); \
- *(int *)dest = _mm_cvtsi128_si32(d0); \
- dest += stride; \
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi32(d0,
+ _mm_cvtsi32_si128(*(const int *) (dest + stride)));
+ d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
+ *(const int *) (dest + stride * 3)), d2);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, input2);
+ d2 = _mm_add_epi16(d2, input3);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store input0
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store input2
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ // store input3
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
}
-
- input0 = _mm_srli_si128(input2, 8);
- input1 = _mm_srli_si128(input3, 8);
-
- RECON_AND_STORE4X4(dest, input2);
- RECON_AND_STORE4X4(dest, input0);
- RECON_AND_STORE4X4(dest, input1);
- RECON_AND_STORE4X4(dest, input3);
}
void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
@@ -167,12 +174,10 @@ void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
static INLINE void transpose_4x4(__m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
- res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
- res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
- res[1] = _mm_unpackhi_epi64(res[0], res[0]);
- res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+ res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+ res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}
static void idct4_1d_sse2(__m128i *in) {
@@ -185,8 +190,8 @@ static void idct4_1d_sse2(__m128i *in) {
transpose_4x4(in);
// stage 1
- u[0] = _mm_unpacklo_epi16(in[0], in[2]);
- u[1] = _mm_unpacklo_epi16(in[1], in[3]);
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpackhi_epi16(in[0], in[1]);
v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
@@ -202,16 +207,13 @@ static void idct4_1d_sse2(__m128i *in) {
v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[0] = _mm_packs_epi32(v[0], v[2]);
- u[1] = _mm_packs_epi32(v[1], v[3]);
- u[2] = _mm_unpackhi_epi64(u[0], u[0]);
- u[3] = _mm_unpackhi_epi64(u[1], u[1]);
+ u[0] = _mm_packs_epi32(v[0], v[1]);
+ u[1] = _mm_packs_epi32(v[3], v[2]);
// stage 2
- in[0] = _mm_add_epi16(u[0], u[3]);
- in[1] = _mm_add_epi16(u[1], u[2]);
- in[2] = _mm_sub_epi16(u[1], u[2]);
- in[3] = _mm_sub_epi16(u[0], u[3]);
+ in[0] = _mm_add_epi16(u[0], u[1]);
+ in[1] = _mm_sub_epi16(u[0], u[1]);
+ in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}
static void iadst4_1d_sse2(__m128i *in) {
@@ -225,13 +227,14 @@ static void iadst4_1d_sse2(__m128i *in) {
__m128i u[8], v[8], in7;
transpose_4x4(in);
- in7 = _mm_add_epi16(in[0], in[3]);
- in7 = _mm_sub_epi16(in7, in[2]);
+ in7 = _mm_srli_si128(in[1], 8);
+ in7 = _mm_add_epi16(in7, in[0]);
+ in7 = _mm_sub_epi16(in7, in[1]);
- u[0] = _mm_unpacklo_epi16(in[0], in[2]);
- u[1] = _mm_unpacklo_epi16(in[1], in[3]);
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpackhi_epi16(in[0], in[1]);
u[2] = _mm_unpacklo_epi16(in7, kZero);
- u[3] = _mm_unpacklo_epi16(in[1], kZero);
+ u[3] = _mm_unpackhi_epi16(in[0], kZero);
v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
@@ -258,22 +261,18 @@ static void iadst4_1d_sse2(__m128i *in) {
u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- in[0] = _mm_packs_epi32(u[0], u[2]);
- in[1] = _mm_packs_epi32(u[1], u[3]);
- in[2] = _mm_unpackhi_epi64(in[0], in[0]);
- in[3] = _mm_unpackhi_epi64(in[1], in[1]);
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
}
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
int tx_type) {
- __m128i in[4];
+ __m128i in[2];
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
- in[0] = _mm_loadl_epi64((const __m128i *)input);
- in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
- in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
- in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));
+ in[0]= _mm_loadu_si128((const __m128i *)(input));
+ in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
switch (tx_type) {
case 0: // DCT_DCT
@@ -300,18 +299,35 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
// Final round and shift
in[0] = _mm_add_epi16(in[0], eight);
in[1] = _mm_add_epi16(in[1], eight);
- in[2] = _mm_add_epi16(in[2], eight);
- in[3] = _mm_add_epi16(in[3], eight);
in[0] = _mm_srai_epi16(in[0], 4);
in[1] = _mm_srai_epi16(in[1], 4);
- in[2] = _mm_srai_epi16(in[2], 4);
- in[3] = _mm_srai_epi16(in[3], 4);
- RECON_AND_STORE4X4(dest, in[0]);
- RECON_AND_STORE4X4(dest, in[1]);
- RECON_AND_STORE4X4(dest, in[2]);
- RECON_AND_STORE4X4(dest, in[3]);
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+ __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi32(d0,
+ _mm_cvtsi32_si128(*(const int *) (dest + stride)));
+ d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
+ *(const int *) (dest + stride * 3)));
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d2 = _mm_unpacklo_epi8(d2, zero);
+ d0 = _mm_add_epi16(d0, in[0]);
+ d2 = _mm_add_epi16(d2, in[1]);
+ d0 = _mm_packus_epi16(d0, d2);
+ // store result[0]
+ *(int *)dest = _mm_cvtsi128_si32(d0);
+ // store result[1]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+ // store result[2]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+ // store result[3]
+ d0 = _mm_srli_si128(d0, 4);
+ *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+ }
}
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
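
The reconstruction-and-store rewrite above handles all four 4x4 rows with one pack instead of invoking RECON_AND_STORE4X4 once per row. Below is a standalone sketch of that gather/widen/add/pack/scatter pattern, with made-up destination and residual values.

#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint8_t dest[4 * 4] = {
    10, 10, 10, 10,
    20, 20, 20, 20,
    30, 30, 30, 30,
    40, 40, 40, 40,
  };
  const int16_t res[16] = { 1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  4, 4, 4, 4 };
  const int stride = 4;
  const __m128i zero = _mm_setzero_si128();
  const __m128i in0 = _mm_loadu_si128((const __m128i *)res);        /* rows 0-1 */
  const __m128i in1 = _mm_loadu_si128((const __m128i *)(res + 8));  /* rows 2-3 */
  __m128i d0, d2;
  int r;

  /* Gather the four 4-byte destination rows into two registers. */
  d0 = _mm_cvtsi32_si128(*(const int *)(dest));
  d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
  d0 = _mm_unpacklo_epi32(d0, _mm_cvtsi32_si128(*(const int *)(dest + stride)));
  d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
  /* Widen to 16 bits, add the residual, pack back with unsigned saturation. */
  d0 = _mm_unpacklo_epi8(d0, zero);
  d2 = _mm_unpacklo_epi8(d2, zero);
  d0 = _mm_add_epi16(d0, in0);
  d2 = _mm_add_epi16(d2, in1);
  d0 = _mm_packus_epi16(d0, d2);
  /* Scatter the four reconstructed rows back out. */
  *(int *)dest = _mm_cvtsi128_si32(d0);
  d0 = _mm_srli_si128(d0, 4);
  *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
  d0 = _mm_srli_si128(d0, 4);
  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  d0 = _mm_srli_si128(d0, 4);
  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);

  for (r = 0; r < 4; ++r)  /* prints rows of 11, 22, 33, 44 */
    printf("%u %u %u %u\n", dest[r * 4], dest[r * 4 + 1],
           dest[r * 4 + 2], dest[r * 4 + 3]);
  return 0;
}

Relative to the per-row macro, handling two row pairs per register roughly halves the unpack/pack work.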
@@ -415,6 +431,27 @@ void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
res3 = _mm_packs_epi32(tmp6, tmp7); \
}
+#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ }
+
#define IDCT8_1D \
/* Stage1 */ \
{ \
@@ -613,6 +650,25 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
+static INLINE void array_transpose_4X8(__m128i *in, __m128i * out) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+
+ out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ out[4] = out[5] = out[6] = out[7] = zero;
+}
+
static void idct8_1d_sse2(__m128i *in) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
@@ -1102,14 +1158,14 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
#define IDCT16_1D \
/* Stage2 */ \
{ \
- const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
- const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
- const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \
- const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \
- const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
- const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
- const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
- const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \
+ const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \
+ const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \
+ const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \
+ const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \
\
MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
stg2_0, stg2_1, stg2_2, stg2_3, \
@@ -1122,10 +1178,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
\
/* Stage3 */ \
{ \
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
- const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
- const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
- const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \
+ const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \
+ const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \
\
MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
stg3_0, stg3_1, stg3_2, stg3_3, \
@@ -1144,10 +1200,10 @@ void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
\
/* Stage4 */ \
{ \
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
- const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
- const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
- const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \
+ const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \
+ const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \
\
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
@@ -1259,16 +1315,7 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
- in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
- in10 = zero, in11 = zero, in12 = zero, in13 = zero,
- in14 = zero, in15 = zero;
- __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
- l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
- l12 = zero, l13 = zero, l14 = zero, l15 = zero;
- __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
- r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
- r12 = zero, r13 = zero, r14 = zero, r15 = zero;
+ __m128i in[16], l[16], r[16], *curr1;
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
stp1_8_0, stp1_12_0;
@@ -1277,162 +1324,132 @@ void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
- // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct.
- for (i = 0; i < 4; i++) {
- // 1-D idct
- if (i < 2) {
- if (i == 1) input += 128;
+ curr1 = l;
+ for (i = 0; i < 2; i++) {
+ // 1-D idct
// Load input data.
- in0 = _mm_load_si128((const __m128i *)input);
- in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
- in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
- in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
- in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
- in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
- in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
- in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
- in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
- in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
-
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
- in10, in11, in12, in13, in14, in15);
- }
-
- if (i == 2) {
- TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
- in5, in6, in7);
- TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
- in13, in14, in15);
- }
-
- if (i == 3) {
- TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
- in12, in13, in14, in15);
- }
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in[4] = _mm_load_si128((const __m128i *)(input + 8 * 8));
+ in[12] = _mm_load_si128((const __m128i *)(input + 8 * 9));
+ in[5] = _mm_load_si128((const __m128i *)(input + 8 * 10));
+ in[13] = _mm_load_si128((const __m128i *)(input + 8 * 11));
+ in[6] = _mm_load_si128((const __m128i *)(input + 8 * 12));
+ in[14] = _mm_load_si128((const __m128i *)(input + 8 * 13));
+ in[7] = _mm_load_si128((const __m128i *)(input + 8 * 14));
+ in[15] = _mm_load_si128((const __m128i *)(input + 8 * 15));
+
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in+8, in+8);
+
+ IDCT16_1D
+
+ // Stage7
+ curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
+ curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
+ curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
+ curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
+ curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
+ curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
+ curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
+ curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
+ curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
+
+ curr1 = r;
+ input += 128;
+ }
+ for (i = 0; i < 2; i++) {
+ // 1-D idct
+ array_transpose_8x8(l+i*8, in);
+ array_transpose_8x8(r+i*8, in+8);
- IDCT16_1D
+ IDCT16_1D
- // Stage7
- if (i == 0) {
- // Left 8x16
- l0 = _mm_add_epi16(stp2_0, stp1_15);
- l1 = _mm_add_epi16(stp2_1, stp1_14);
- l2 = _mm_add_epi16(stp2_2, stp2_13);
- l3 = _mm_add_epi16(stp2_3, stp2_12);
- l4 = _mm_add_epi16(stp2_4, stp2_11);
- l5 = _mm_add_epi16(stp2_5, stp2_10);
- l6 = _mm_add_epi16(stp2_6, stp1_9);
- l7 = _mm_add_epi16(stp2_7, stp1_8);
- l8 = _mm_sub_epi16(stp2_7, stp1_8);
- l9 = _mm_sub_epi16(stp2_6, stp1_9);
- l10 = _mm_sub_epi16(stp2_5, stp2_10);
- l11 = _mm_sub_epi16(stp2_4, stp2_11);
- l12 = _mm_sub_epi16(stp2_3, stp2_12);
- l13 = _mm_sub_epi16(stp2_2, stp2_13);
- l14 = _mm_sub_epi16(stp2_1, stp1_14);
- l15 = _mm_sub_epi16(stp2_0, stp1_15);
- } else if (i == 1) {
- // Right 8x16
- r0 = _mm_add_epi16(stp2_0, stp1_15);
- r1 = _mm_add_epi16(stp2_1, stp1_14);
- r2 = _mm_add_epi16(stp2_2, stp2_13);
- r3 = _mm_add_epi16(stp2_3, stp2_12);
- r4 = _mm_add_epi16(stp2_4, stp2_11);
- r5 = _mm_add_epi16(stp2_5, stp2_10);
- r6 = _mm_add_epi16(stp2_6, stp1_9);
- r7 = _mm_add_epi16(stp2_7, stp1_8);
- r8 = _mm_sub_epi16(stp2_7, stp1_8);
- r9 = _mm_sub_epi16(stp2_6, stp1_9);
- r10 = _mm_sub_epi16(stp2_5, stp2_10);
- r11 = _mm_sub_epi16(stp2_4, stp2_11);
- r12 = _mm_sub_epi16(stp2_3, stp2_12);
- r13 = _mm_sub_epi16(stp2_2, stp2_13);
- r14 = _mm_sub_epi16(stp2_1, stp1_14);
- r15 = _mm_sub_epi16(stp2_0, stp1_15);
- } else {
// 2-D
- in0 = _mm_add_epi16(stp2_0, stp1_15);
- in1 = _mm_add_epi16(stp2_1, stp1_14);
- in2 = _mm_add_epi16(stp2_2, stp2_13);
- in3 = _mm_add_epi16(stp2_3, stp2_12);
- in4 = _mm_add_epi16(stp2_4, stp2_11);
- in5 = _mm_add_epi16(stp2_5, stp2_10);
- in6 = _mm_add_epi16(stp2_6, stp1_9);
- in7 = _mm_add_epi16(stp2_7, stp1_8);
- in8 = _mm_sub_epi16(stp2_7, stp1_8);
- in9 = _mm_sub_epi16(stp2_6, stp1_9);
- in10 = _mm_sub_epi16(stp2_5, stp2_10);
- in11 = _mm_sub_epi16(stp2_4, stp2_11);
- in12 = _mm_sub_epi16(stp2_3, stp2_12);
- in13 = _mm_sub_epi16(stp2_2, stp2_13);
- in14 = _mm_sub_epi16(stp2_1, stp1_14);
- in15 = _mm_sub_epi16(stp2_0, stp1_15);
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
// Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
- in8 = _mm_adds_epi16(in8, final_rounding);
- in9 = _mm_adds_epi16(in9, final_rounding);
- in10 = _mm_adds_epi16(in10, final_rounding);
- in11 = _mm_adds_epi16(in11, final_rounding);
- in12 = _mm_adds_epi16(in12, final_rounding);
- in13 = _mm_adds_epi16(in13, final_rounding);
- in14 = _mm_adds_epi16(in14, final_rounding);
- in15 = _mm_adds_epi16(in15, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 6);
- in1 = _mm_srai_epi16(in1, 6);
- in2 = _mm_srai_epi16(in2, 6);
- in3 = _mm_srai_epi16(in3, 6);
- in4 = _mm_srai_epi16(in4, 6);
- in5 = _mm_srai_epi16(in5, 6);
- in6 = _mm_srai_epi16(in6, 6);
- in7 = _mm_srai_epi16(in7, 6);
- in8 = _mm_srai_epi16(in8, 6);
- in9 = _mm_srai_epi16(in9, 6);
- in10 = _mm_srai_epi16(in10, 6);
- in11 = _mm_srai_epi16(in11, 6);
- in12 = _mm_srai_epi16(in12, 6);
- in13 = _mm_srai_epi16(in13, 6);
- in14 = _mm_srai_epi16(in14, 6);
- in15 = _mm_srai_epi16(in15, 6);
-
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
- RECON_AND_STORE(dest, in8);
- RECON_AND_STORE(dest, in9);
- RECON_AND_STORE(dest, in10);
- RECON_AND_STORE(dest, in11);
- RECON_AND_STORE(dest, in12);
- RECON_AND_STORE(dest, in13);
- RECON_AND_STORE(dest, in14);
- RECON_AND_STORE(dest, in15);
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ RECON_AND_STORE(dest, in[0]);
+ RECON_AND_STORE(dest, in[1]);
+ RECON_AND_STORE(dest, in[2]);
+ RECON_AND_STORE(dest, in[3]);
+ RECON_AND_STORE(dest, in[4]);
+ RECON_AND_STORE(dest, in[5]);
+ RECON_AND_STORE(dest, in[6]);
+ RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest, in[8]);
+ RECON_AND_STORE(dest, in[9]);
+ RECON_AND_STORE(dest, in[10]);
+ RECON_AND_STORE(dest, in[11]);
+ RECON_AND_STORE(dest, in[12]);
+ RECON_AND_STORE(dest, in[13]);
+ RECON_AND_STORE(dest, in[14]);
+ RECON_AND_STORE(dest, in[15]);
dest += 8 - (stride * 16);
- }
}
}
@@ -2452,15 +2469,7 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
- in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
- in10 = zero, in11 = zero, in12 = zero, in13 = zero,
- in14 = zero, in15 = zero;
- __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
- l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
- l12 = zero, l13 = zero, l14 = zero, l15 = zero;
-
+ __m128i in[16], l[16];
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
stp1_8_0, stp1_12_0;
@@ -2468,25 +2477,26 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
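+  // Entries 4-7 and 12-15 are never loaded in this reduced-coefficient path;
+  // clear them up front so the stages below can treat them as zero.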
+ in[4] = in[5] = in[6] = in[7] = in[12] = in[13] = in[14] = in[15] = zero;
// 1-D idct. Load input data.
- in0 = _mm_load_si128((const __m128i *)input);
- in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
- in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
- in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
- in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
- in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
- in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
- in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
+ in[0] = _mm_load_si128((const __m128i *)input);
+ in[8] = _mm_load_si128((const __m128i *)(input + 8 * 1));
+ in[1] = _mm_load_si128((const __m128i *)(input + 8 * 2));
+ in[9] = _mm_load_si128((const __m128i *)(input + 8 * 3));
+ in[2] = _mm_load_si128((const __m128i *)(input + 8 * 4));
+ in[10] = _mm_load_si128((const __m128i *)(input + 8 * 5));
+ in[3] = _mm_load_si128((const __m128i *)(input + 8 * 6));
+ in[11] = _mm_load_si128((const __m128i *)(input + 8 * 7));
- TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
- TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
+ TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1], in[2], in[3]);
+ TRANSPOSE_8X4(in[8], in[9], in[10], in[11], in[8], in[9], in[10], in[11]);
// Stage2
{
- const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
- const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
- const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
- const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);
+ const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], in[11]);
+ const __m128i lo_9_7 = _mm_unpackhi_epi16(in[8], in[3]);
+ const __m128i lo_5_11 = _mm_unpackhi_epi16(in[2], in[9]);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(in[10], in[1]);
tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
@@ -2528,8 +2538,8 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
// Stage3
{
- const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
- const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], in[11]);
+ const __m128i lo_10_6 = _mm_unpacklo_epi16(in[9], in[3]);
tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
@@ -2564,8 +2574,8 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
// Stage4
{
- const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
- const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]);
+ const __m128i lo_4_12 = _mm_unpacklo_epi16(in[2], in[10]);
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
@@ -2674,106 +2684,99 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
}
// Stage7. Left 8x16 only.
- l0 = _mm_add_epi16(stp2_0, stp1_15);
- l1 = _mm_add_epi16(stp2_1, stp1_14);
- l2 = _mm_add_epi16(stp2_2, stp2_13);
- l3 = _mm_add_epi16(stp2_3, stp2_12);
- l4 = _mm_add_epi16(stp2_4, stp2_11);
- l5 = _mm_add_epi16(stp2_5, stp2_10);
- l6 = _mm_add_epi16(stp2_6, stp1_9);
- l7 = _mm_add_epi16(stp2_7, stp1_8);
- l8 = _mm_sub_epi16(stp2_7, stp1_8);
- l9 = _mm_sub_epi16(stp2_6, stp1_9);
- l10 = _mm_sub_epi16(stp2_5, stp2_10);
- l11 = _mm_sub_epi16(stp2_4, stp2_11);
- l12 = _mm_sub_epi16(stp2_3, stp2_12);
- l13 = _mm_sub_epi16(stp2_2, stp2_13);
- l14 = _mm_sub_epi16(stp2_1, stp1_14);
- l15 = _mm_sub_epi16(stp2_0, stp1_15);
+ l[0] = _mm_add_epi16(stp2_0, stp1_15);
+ l[1] = _mm_add_epi16(stp2_1, stp1_14);
+ l[2] = _mm_add_epi16(stp2_2, stp2_13);
+ l[3] = _mm_add_epi16(stp2_3, stp2_12);
+ l[4] = _mm_add_epi16(stp2_4, stp2_11);
+ l[5] = _mm_add_epi16(stp2_5, stp2_10);
+ l[6] = _mm_add_epi16(stp2_6, stp1_9);
+ l[7] = _mm_add_epi16(stp2_7, stp1_8);
+ l[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ l[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ l[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ l[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ l[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ l[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ l[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ l[15] = _mm_sub_epi16(stp2_0, stp1_15);
// 2-D idct. We do 2 8x16 blocks.
for (i = 0; i < 2; i++) {
- if (i == 0)
- TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
- in5, in6, in7);
-
- if (i == 1)
- TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
- in4, in5, in6, in7);
-
- in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
+ array_transpose_4X8(l + 8*i, in);
+ in[8] = in[9] = in[10] = in[11] = in[12] = in[13] = in[14] = in[15] = zero;
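+    // Only l[0..15] ("Left 8x16 only" above) can be non-zero after the first
+    // pass, so in[8..15] stay zero for each half of the second pass.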
IDCT16_1D
// Stage7
- in0 = _mm_add_epi16(stp2_0, stp1_15);
- in1 = _mm_add_epi16(stp2_1, stp1_14);
- in2 = _mm_add_epi16(stp2_2, stp2_13);
- in3 = _mm_add_epi16(stp2_3, stp2_12);
- in4 = _mm_add_epi16(stp2_4, stp2_11);
- in5 = _mm_add_epi16(stp2_5, stp2_10);
- in6 = _mm_add_epi16(stp2_6, stp1_9);
- in7 = _mm_add_epi16(stp2_7, stp1_8);
- in8 = _mm_sub_epi16(stp2_7, stp1_8);
- in9 = _mm_sub_epi16(stp2_6, stp1_9);
- in10 = _mm_sub_epi16(stp2_5, stp2_10);
- in11 = _mm_sub_epi16(stp2_4, stp2_11);
- in12 = _mm_sub_epi16(stp2_3, stp2_12);
- in13 = _mm_sub_epi16(stp2_2, stp2_13);
- in14 = _mm_sub_epi16(stp2_1, stp1_14);
- in15 = _mm_sub_epi16(stp2_0, stp1_15);
+ in[0] = _mm_add_epi16(stp2_0, stp1_15);
+ in[1] = _mm_add_epi16(stp2_1, stp1_14);
+ in[2] = _mm_add_epi16(stp2_2, stp2_13);
+ in[3] = _mm_add_epi16(stp2_3, stp2_12);
+ in[4] = _mm_add_epi16(stp2_4, stp2_11);
+ in[5] = _mm_add_epi16(stp2_5, stp2_10);
+ in[6] = _mm_add_epi16(stp2_6, stp1_9);
+ in[7] = _mm_add_epi16(stp2_7, stp1_8);
+ in[8] = _mm_sub_epi16(stp2_7, stp1_8);
+ in[9] = _mm_sub_epi16(stp2_6, stp1_9);
+ in[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ in[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ in[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ in[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ in[14] = _mm_sub_epi16(stp2_1, stp1_14);
+ in[15] = _mm_sub_epi16(stp2_0, stp1_15);
// Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
- in8 = _mm_adds_epi16(in8, final_rounding);
- in9 = _mm_adds_epi16(in9, final_rounding);
- in10 = _mm_adds_epi16(in10, final_rounding);
- in11 = _mm_adds_epi16(in11, final_rounding);
- in12 = _mm_adds_epi16(in12, final_rounding);
- in13 = _mm_adds_epi16(in13, final_rounding);
- in14 = _mm_adds_epi16(in14, final_rounding);
- in15 = _mm_adds_epi16(in15, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 6);
- in1 = _mm_srai_epi16(in1, 6);
- in2 = _mm_srai_epi16(in2, 6);
- in3 = _mm_srai_epi16(in3, 6);
- in4 = _mm_srai_epi16(in4, 6);
- in5 = _mm_srai_epi16(in5, 6);
- in6 = _mm_srai_epi16(in6, 6);
- in7 = _mm_srai_epi16(in7, 6);
- in8 = _mm_srai_epi16(in8, 6);
- in9 = _mm_srai_epi16(in9, 6);
- in10 = _mm_srai_epi16(in10, 6);
- in11 = _mm_srai_epi16(in11, 6);
- in12 = _mm_srai_epi16(in12, 6);
- in13 = _mm_srai_epi16(in13, 6);
- in14 = _mm_srai_epi16(in14, 6);
- in15 = _mm_srai_epi16(in15, 6);
-
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
- RECON_AND_STORE(dest, in8);
- RECON_AND_STORE(dest, in9);
- RECON_AND_STORE(dest, in10);
- RECON_AND_STORE(dest, in11);
- RECON_AND_STORE(dest, in12);
- RECON_AND_STORE(dest, in13);
- RECON_AND_STORE(dest, in14);
- RECON_AND_STORE(dest, in15);
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+
+ RECON_AND_STORE(dest, in[0]);
+ RECON_AND_STORE(dest, in[1]);
+ RECON_AND_STORE(dest, in[2]);
+ RECON_AND_STORE(dest, in[3]);
+ RECON_AND_STORE(dest, in[4]);
+ RECON_AND_STORE(dest, in[5]);
+ RECON_AND_STORE(dest, in[6]);
+ RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest, in[8]);
+ RECON_AND_STORE(dest, in[9]);
+ RECON_AND_STORE(dest, in[10]);
+ RECON_AND_STORE(dest, in[11]);
+ RECON_AND_STORE(dest, in[12]);
+ RECON_AND_STORE(dest, in[13]);
+ RECON_AND_STORE(dest, in[14]);
+ RECON_AND_STORE(dest, in[15]);
dest += 8 - (stride * 16);
}
@@ -2785,28 +2788,329 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
input += 8; \
} \
+#define IDCT32_1D_34 \
+/* Stage1 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \
+ \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \
+ \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, \
+ stg1_1, stp1_16, stp1_31); \
+ MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, \
+ stg1_7, stp1_19, stp1_28); \
+ MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, \
+ stg1_9, stp1_20, stp1_27); \
+ MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, \
+ stg1_15, stp1_23, stp1_24); \
+} \
+\
+/* Stage2 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \
+ \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, \
+ stg2_1, stp2_8, stp2_15); \
+ MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, \
+ stg2_7, stp2_11, stp2_12); \
+ \
+ stp2_16 = stp1_16; \
+ stp2_19 = stp1_19; \
+ \
+ stp2_20 = stp1_20; \
+ stp2_23 = stp1_23; \
+ \
+ stp2_24 = stp1_24; \
+ stp2_27 = stp1_27; \
+ \
+ stp2_28 = stp1_28; \
+ stp2_31 = stp1_31; \
+} \
+\
+/* Stage3 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \
+ \
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, stp1_28); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \
+  const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp1_24); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, \
+ stg3_1, stp1_4, stp1_7); \
+ \
+ stp1_8 = stp2_8; \
+ stp1_11 = stp2_11; \
+ stp1_12 = stp2_12; \
+ stp1_15 = stp2_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
+ stp1_18, stp1_29) \
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
+ stp1_22, stp1_25) \
+ \
+ stp1_16 = stp2_16; \
+ stp1_31 = stp2_31; \
+ stp1_19 = stp2_19; \
+ stp1_20 = stp2_20; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_27 = stp2_27; \
+ stp1_28 = stp2_28; \
+} \
+\
+/* Stage4 */ \
+{ \
+ const __m128i zero = _mm_setzero_si128();\
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \
+ \
+ MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, \
+ stg4_1, stp2_0, stp2_1); \
+ \
+ stp2_4 = stp1_4; \
+ stp2_5 = stp1_4; \
+ stp2_6 = stp1_7; \
+ stp2_7 = stp1_7; \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
+ stp2_10, stp2_13) \
+ \
+ stp2_8 = stp1_8; \
+ stp2_15 = stp1_15; \
+ stp2_11 = stp1_11; \
+ stp2_12 = stp1_12; \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
+ \
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
+} \
+\
+/* Stage5 */ \
+{ \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
+ \
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ stp1_0 = stp2_0; \
+ stp1_1 = stp2_1; \
+ stp1_2 = stp2_1; \
+ stp1_3 = stp2_0; \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_4 = stp2_4; \
+ stp1_7 = stp2_7; \
+ \
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ \
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
+ stp1_19, stp1_28) \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ \
+ stp1_22 = stp2_22; \
+ stp1_23 = stp2_23; \
+ stp1_24 = stp2_24; \
+ stp1_25 = stp2_25; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+} \
+\
+/* Stage6 */ \
+{ \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
+ \
+ stp2_8 = stp1_8; \
+ stp2_9 = stp1_9; \
+ stp2_14 = stp1_14; \
+ stp2_15 = stp1_15; \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
+ stp2_13, stp2_11, stp2_12) \
+ \
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
+ \
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
+} \
+\
+/* Stage7 */ \
+{ \
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
+ \
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
+ \
+ stp1_16 = stp2_16; \
+ stp1_17 = stp2_17; \
+ stp1_18 = stp2_18; \
+ stp1_19 = stp2_19; \
+ \
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
+ stp1_21, stp1_26) \
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
+ stp1_23, stp1_24) \
+ \
+ stp1_28 = stp2_28; \
+ stp1_29 = stp2_29; \
+ stp1_30 = stp2_30; \
+ stp1_31 = stp2_31; \
+}
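+// Reduced 1-D IDCT for the 34-coefficient case: only in[0]..in[7] can be
+// non-zero, so unpacks that would pair a known-zero row use
+// _mm_setzero_si128() and the all-zero butterflies are dropped entirely.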
+
+
#define IDCT32_1D \
/* Stage1 */ \
{ \
- const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
- const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
- const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
- const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
- \
- const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
- const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
- const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \
- const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
- \
- const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
- const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
- const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
- const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
- \
- const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
- const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
- const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
- const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \
+ const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \
+ const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \
+ \
+ const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \
+ const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \
+  const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \
+ \
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \
+ const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \
+ const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \
+ \
+ const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \
+ const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \
\
MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
@@ -2824,15 +3128,15 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
\
/* Stage2 */ \
{ \
- const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
- const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
- const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
- const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \
+ const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \
+ const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \
\
- const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
- const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
- const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
- const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
+ const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \
+ const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \
\
MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
@@ -2864,10 +3168,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
\
/* Stage3 */ \
{ \
- const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
- const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
- const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
- const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \
+ const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \
+ const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \
\
const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
@@ -2911,10 +3215,10 @@ void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
\
/* Stage4 */ \
{ \
- const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
- const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
- const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
- const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \
+ const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \
+ const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \
\
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
@@ -3171,10 +3475,7 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
- in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
- in24, in25, in26, in27, in28, in29, in30, in31;
- __m128i col[128];
+ __m128i in[32], col[32];
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
@@ -3186,296 +3487,225 @@ void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
stp2_30, stp2_31;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- int i, j, i32;
-
- // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
- for (i = 0; i < 8; i++) {
- i32 = (i << 5);
- if (i == 0) {
- // First 1-D idct: first 8 rows
- // Load input data.
- LOAD_DQCOEFF(in0, input);
- LOAD_DQCOEFF(in8, input);
- LOAD_DQCOEFF(in16, input);
- LOAD_DQCOEFF(in24, input);
- LOAD_DQCOEFF(in1, input);
- LOAD_DQCOEFF(in9, input);
- LOAD_DQCOEFF(in17, input);
- LOAD_DQCOEFF(in25, input);
- LOAD_DQCOEFF(in2, input);
- LOAD_DQCOEFF(in10, input);
- LOAD_DQCOEFF(in18, input);
- LOAD_DQCOEFF(in26, input);
- LOAD_DQCOEFF(in3, input);
- LOAD_DQCOEFF(in11, input);
- LOAD_DQCOEFF(in19, input);
- LOAD_DQCOEFF(in27, input);
-
- LOAD_DQCOEFF(in4, input);
- LOAD_DQCOEFF(in12, input);
- LOAD_DQCOEFF(in20, input);
- LOAD_DQCOEFF(in28, input);
- LOAD_DQCOEFF(in5, input);
- LOAD_DQCOEFF(in13, input);
- LOAD_DQCOEFF(in21, input);
- LOAD_DQCOEFF(in29, input);
- LOAD_DQCOEFF(in6, input);
- LOAD_DQCOEFF(in14, input);
- LOAD_DQCOEFF(in22, input);
- LOAD_DQCOEFF(in30, input);
- LOAD_DQCOEFF(in7, input);
- LOAD_DQCOEFF(in15, input);
- LOAD_DQCOEFF(in23, input);
- LOAD_DQCOEFF(in31, input);
-
- // Transpose 32x8 block to 8x32 block
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
- in10, in11, in12, in13, in14, in15);
- TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
- in18, in19, in20, in21, in22, in23);
- TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
- in26, in27, in28, in29, in30, in31);
- } else if (i < 4) {
- // First 1-D idct: next 24 zero-coeff rows
- col[i32 + 0] = _mm_setzero_si128();
- col[i32 + 1] = _mm_setzero_si128();
- col[i32 + 2] = _mm_setzero_si128();
- col[i32 + 3] = _mm_setzero_si128();
- col[i32 + 4] = _mm_setzero_si128();
- col[i32 + 5] = _mm_setzero_si128();
- col[i32 + 6] = _mm_setzero_si128();
- col[i32 + 7] = _mm_setzero_si128();
- col[i32 + 8] = _mm_setzero_si128();
- col[i32 + 9] = _mm_setzero_si128();
- col[i32 + 10] = _mm_setzero_si128();
- col[i32 + 11] = _mm_setzero_si128();
- col[i32 + 12] = _mm_setzero_si128();
- col[i32 + 13] = _mm_setzero_si128();
- col[i32 + 14] = _mm_setzero_si128();
- col[i32 + 15] = _mm_setzero_si128();
- col[i32 + 16] = _mm_setzero_si128();
- col[i32 + 17] = _mm_setzero_si128();
- col[i32 + 18] = _mm_setzero_si128();
- col[i32 + 19] = _mm_setzero_si128();
- col[i32 + 20] = _mm_setzero_si128();
- col[i32 + 21] = _mm_setzero_si128();
- col[i32 + 22] = _mm_setzero_si128();
- col[i32 + 23] = _mm_setzero_si128();
- col[i32 + 24] = _mm_setzero_si128();
- col[i32 + 25] = _mm_setzero_si128();
- col[i32 + 26] = _mm_setzero_si128();
- col[i32 + 27] = _mm_setzero_si128();
- col[i32 + 28] = _mm_setzero_si128();
- col[i32 + 29] = _mm_setzero_si128();
- col[i32 + 30] = _mm_setzero_si128();
- col[i32 + 31] = _mm_setzero_si128();
- continue;
- } else {
- // Second 1-D idct
- j = i - 4;
-
- // Transpose 32x8 block to 8x32 block
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
- in5, in6, in7);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
- in11, in12, in13, in14, in15);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
- in19, in20, in21, in22, in23);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
- in28, in29, in30, in31);
- }
-
- IDCT32_1D
+ int i;
+ // Load input data.
+ LOAD_DQCOEFF(in[0], input);
+ LOAD_DQCOEFF(in[8], input);
+ LOAD_DQCOEFF(in[16], input);
+ LOAD_DQCOEFF(in[24], input);
+ LOAD_DQCOEFF(in[1], input);
+ LOAD_DQCOEFF(in[9], input);
+ LOAD_DQCOEFF(in[17], input);
+ LOAD_DQCOEFF(in[25], input);
+ LOAD_DQCOEFF(in[2], input);
+ LOAD_DQCOEFF(in[10], input);
+ LOAD_DQCOEFF(in[18], input);
+ LOAD_DQCOEFF(in[26], input);
+ LOAD_DQCOEFF(in[3], input);
+ LOAD_DQCOEFF(in[11], input);
+ LOAD_DQCOEFF(in[19], input);
+ LOAD_DQCOEFF(in[27], input);
+
+ LOAD_DQCOEFF(in[4], input);
+ LOAD_DQCOEFF(in[12], input);
+ LOAD_DQCOEFF(in[20], input);
+ LOAD_DQCOEFF(in[28], input);
+ LOAD_DQCOEFF(in[5], input);
+ LOAD_DQCOEFF(in[13], input);
+ LOAD_DQCOEFF(in[21], input);
+ LOAD_DQCOEFF(in[29], input);
+ LOAD_DQCOEFF(in[6], input);
+ LOAD_DQCOEFF(in[14], input);
+ LOAD_DQCOEFF(in[22], input);
+ LOAD_DQCOEFF(in[30], input);
+ LOAD_DQCOEFF(in[7], input);
+ LOAD_DQCOEFF(in[15], input);
+ LOAD_DQCOEFF(in[23], input);
+ LOAD_DQCOEFF(in[31], input);
- // final stage
- if (i < 4) {
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
- col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
- col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
- col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
- col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
- col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
- col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
- col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
- col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
- col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
- col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
- col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
- col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
- col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
- col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
- col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
- col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
- } else {
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in+8, in+8);
+ array_transpose_8x8(in+16, in+16);
+ array_transpose_8x8(in+24, in+24);
+
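+  // First 1-D pass: the shortcut assumes everything beyond the first eight
+  // coefficient rows is zero, so one pass over the loaded 8x32 block suffices.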
+ IDCT32_1D
+
+  // First 1-D pass: store the 32 intermediate results of the 8x32 block.
+ col[0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[31] = _mm_sub_epi16(stp1_0, stp1_31);
+ for (i = 0; i < 4; i++) {
const __m128i zero = _mm_setzero_si128();
+      // Transpose one 8x8 block of first-pass results; the remaining inputs
+      // of IDCT32_1D_34 are implicitly zero.
+ array_transpose_8x8(col+i*8, in);
+ IDCT32_1D_34
// 2_D: Calculate the results and store them to destination.
- in0 = _mm_add_epi16(stp1_0, stp1_31);
- in1 = _mm_add_epi16(stp1_1, stp1_30);
- in2 = _mm_add_epi16(stp1_2, stp1_29);
- in3 = _mm_add_epi16(stp1_3, stp1_28);
- in4 = _mm_add_epi16(stp1_4, stp1_27);
- in5 = _mm_add_epi16(stp1_5, stp1_26);
- in6 = _mm_add_epi16(stp1_6, stp1_25);
- in7 = _mm_add_epi16(stp1_7, stp1_24);
- in8 = _mm_add_epi16(stp1_8, stp1_23);
- in9 = _mm_add_epi16(stp1_9, stp1_22);
- in10 = _mm_add_epi16(stp1_10, stp1_21);
- in11 = _mm_add_epi16(stp1_11, stp1_20);
- in12 = _mm_add_epi16(stp1_12, stp1_19);
- in13 = _mm_add_epi16(stp1_13, stp1_18);
- in14 = _mm_add_epi16(stp1_14, stp1_17);
- in15 = _mm_add_epi16(stp1_15, stp1_16);
- in16 = _mm_sub_epi16(stp1_15, stp1_16);
- in17 = _mm_sub_epi16(stp1_14, stp1_17);
- in18 = _mm_sub_epi16(stp1_13, stp1_18);
- in19 = _mm_sub_epi16(stp1_12, stp1_19);
- in20 = _mm_sub_epi16(stp1_11, stp1_20);
- in21 = _mm_sub_epi16(stp1_10, stp1_21);
- in22 = _mm_sub_epi16(stp1_9, stp1_22);
- in23 = _mm_sub_epi16(stp1_8, stp1_23);
- in24 = _mm_sub_epi16(stp1_7, stp1_24);
- in25 = _mm_sub_epi16(stp1_6, stp1_25);
- in26 = _mm_sub_epi16(stp1_5, stp1_26);
- in27 = _mm_sub_epi16(stp1_4, stp1_27);
- in28 = _mm_sub_epi16(stp1_3, stp1_28);
- in29 = _mm_sub_epi16(stp1_2, stp1_29);
- in30 = _mm_sub_epi16(stp1_1, stp1_30);
- in31 = _mm_sub_epi16(stp1_0, stp1_31);
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
// Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
- in8 = _mm_adds_epi16(in8, final_rounding);
- in9 = _mm_adds_epi16(in9, final_rounding);
- in10 = _mm_adds_epi16(in10, final_rounding);
- in11 = _mm_adds_epi16(in11, final_rounding);
- in12 = _mm_adds_epi16(in12, final_rounding);
- in13 = _mm_adds_epi16(in13, final_rounding);
- in14 = _mm_adds_epi16(in14, final_rounding);
- in15 = _mm_adds_epi16(in15, final_rounding);
- in16 = _mm_adds_epi16(in16, final_rounding);
- in17 = _mm_adds_epi16(in17, final_rounding);
- in18 = _mm_adds_epi16(in18, final_rounding);
- in19 = _mm_adds_epi16(in19, final_rounding);
- in20 = _mm_adds_epi16(in20, final_rounding);
- in21 = _mm_adds_epi16(in21, final_rounding);
- in22 = _mm_adds_epi16(in22, final_rounding);
- in23 = _mm_adds_epi16(in23, final_rounding);
- in24 = _mm_adds_epi16(in24, final_rounding);
- in25 = _mm_adds_epi16(in25, final_rounding);
- in26 = _mm_adds_epi16(in26, final_rounding);
- in27 = _mm_adds_epi16(in27, final_rounding);
- in28 = _mm_adds_epi16(in28, final_rounding);
- in29 = _mm_adds_epi16(in29, final_rounding);
- in30 = _mm_adds_epi16(in30, final_rounding);
- in31 = _mm_adds_epi16(in31, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 6);
- in1 = _mm_srai_epi16(in1, 6);
- in2 = _mm_srai_epi16(in2, 6);
- in3 = _mm_srai_epi16(in3, 6);
- in4 = _mm_srai_epi16(in4, 6);
- in5 = _mm_srai_epi16(in5, 6);
- in6 = _mm_srai_epi16(in6, 6);
- in7 = _mm_srai_epi16(in7, 6);
- in8 = _mm_srai_epi16(in8, 6);
- in9 = _mm_srai_epi16(in9, 6);
- in10 = _mm_srai_epi16(in10, 6);
- in11 = _mm_srai_epi16(in11, 6);
- in12 = _mm_srai_epi16(in12, 6);
- in13 = _mm_srai_epi16(in13, 6);
- in14 = _mm_srai_epi16(in14, 6);
- in15 = _mm_srai_epi16(in15, 6);
- in16 = _mm_srai_epi16(in16, 6);
- in17 = _mm_srai_epi16(in17, 6);
- in18 = _mm_srai_epi16(in18, 6);
- in19 = _mm_srai_epi16(in19, 6);
- in20 = _mm_srai_epi16(in20, 6);
- in21 = _mm_srai_epi16(in21, 6);
- in22 = _mm_srai_epi16(in22, 6);
- in23 = _mm_srai_epi16(in23, 6);
- in24 = _mm_srai_epi16(in24, 6);
- in25 = _mm_srai_epi16(in25, 6);
- in26 = _mm_srai_epi16(in26, 6);
- in27 = _mm_srai_epi16(in27, 6);
- in28 = _mm_srai_epi16(in28, 6);
- in29 = _mm_srai_epi16(in29, 6);
- in30 = _mm_srai_epi16(in30, 6);
- in31 = _mm_srai_epi16(in31, 6);
-
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
- RECON_AND_STORE(dest, in8);
- RECON_AND_STORE(dest, in9);
- RECON_AND_STORE(dest, in10);
- RECON_AND_STORE(dest, in11);
- RECON_AND_STORE(dest, in12);
- RECON_AND_STORE(dest, in13);
- RECON_AND_STORE(dest, in14);
- RECON_AND_STORE(dest, in15);
- RECON_AND_STORE(dest, in16);
- RECON_AND_STORE(dest, in17);
- RECON_AND_STORE(dest, in18);
- RECON_AND_STORE(dest, in19);
- RECON_AND_STORE(dest, in20);
- RECON_AND_STORE(dest, in21);
- RECON_AND_STORE(dest, in22);
- RECON_AND_STORE(dest, in23);
- RECON_AND_STORE(dest, in24);
- RECON_AND_STORE(dest, in25);
- RECON_AND_STORE(dest, in26);
- RECON_AND_STORE(dest, in27);
- RECON_AND_STORE(dest, in28);
- RECON_AND_STORE(dest, in29);
- RECON_AND_STORE(dest, in30);
- RECON_AND_STORE(dest, in31);
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+ in[16] = _mm_adds_epi16(in[16], final_rounding);
+ in[17] = _mm_adds_epi16(in[17], final_rounding);
+ in[18] = _mm_adds_epi16(in[18], final_rounding);
+ in[19] = _mm_adds_epi16(in[19], final_rounding);
+ in[20] = _mm_adds_epi16(in[20], final_rounding);
+ in[21] = _mm_adds_epi16(in[21], final_rounding);
+ in[22] = _mm_adds_epi16(in[22], final_rounding);
+ in[23] = _mm_adds_epi16(in[23], final_rounding);
+ in[24] = _mm_adds_epi16(in[24], final_rounding);
+ in[25] = _mm_adds_epi16(in[25], final_rounding);
+ in[26] = _mm_adds_epi16(in[26], final_rounding);
+ in[27] = _mm_adds_epi16(in[27], final_rounding);
+ in[28] = _mm_adds_epi16(in[28], final_rounding);
+ in[29] = _mm_adds_epi16(in[29], final_rounding);
+ in[30] = _mm_adds_epi16(in[30], final_rounding);
+ in[31] = _mm_adds_epi16(in[31], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+ in[16] = _mm_srai_epi16(in[16], 6);
+ in[17] = _mm_srai_epi16(in[17], 6);
+ in[18] = _mm_srai_epi16(in[18], 6);
+ in[19] = _mm_srai_epi16(in[19], 6);
+ in[20] = _mm_srai_epi16(in[20], 6);
+ in[21] = _mm_srai_epi16(in[21], 6);
+ in[22] = _mm_srai_epi16(in[22], 6);
+ in[23] = _mm_srai_epi16(in[23], 6);
+ in[24] = _mm_srai_epi16(in[24], 6);
+ in[25] = _mm_srai_epi16(in[25], 6);
+ in[26] = _mm_srai_epi16(in[26], 6);
+ in[27] = _mm_srai_epi16(in[27], 6);
+ in[28] = _mm_srai_epi16(in[28], 6);
+ in[29] = _mm_srai_epi16(in[29], 6);
+ in[30] = _mm_srai_epi16(in[30], 6);
+ in[31] = _mm_srai_epi16(in[31], 6);
+
+ RECON_AND_STORE(dest, in[0]);
+ RECON_AND_STORE(dest, in[1]);
+ RECON_AND_STORE(dest, in[2]);
+ RECON_AND_STORE(dest, in[3]);
+ RECON_AND_STORE(dest, in[4]);
+ RECON_AND_STORE(dest, in[5]);
+ RECON_AND_STORE(dest, in[6]);
+ RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest, in[8]);
+ RECON_AND_STORE(dest, in[9]);
+ RECON_AND_STORE(dest, in[10]);
+ RECON_AND_STORE(dest, in[11]);
+ RECON_AND_STORE(dest, in[12]);
+ RECON_AND_STORE(dest, in[13]);
+ RECON_AND_STORE(dest, in[14]);
+ RECON_AND_STORE(dest, in[15]);
+ RECON_AND_STORE(dest, in[16]);
+ RECON_AND_STORE(dest, in[17]);
+ RECON_AND_STORE(dest, in[18]);
+ RECON_AND_STORE(dest, in[19]);
+ RECON_AND_STORE(dest, in[20]);
+ RECON_AND_STORE(dest, in[21]);
+ RECON_AND_STORE(dest, in[22]);
+ RECON_AND_STORE(dest, in[23]);
+ RECON_AND_STORE(dest, in[24]);
+ RECON_AND_STORE(dest, in[25]);
+ RECON_AND_STORE(dest, in[26]);
+ RECON_AND_STORE(dest, in[27]);
+ RECON_AND_STORE(dest, in[28]);
+ RECON_AND_STORE(dest, in[29]);
+ RECON_AND_STORE(dest, in[30]);
+ RECON_AND_STORE(dest, in[31]);
dest += 8 - (stride * 32);
}
}
-}
void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
int stride) {
@@ -3530,10 +3760,7 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
- in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
- in24, in25, in26, in27, in28, in29, in30, in31;
- __m128i col[128];
+ __m128i in[32], col[128], zero_idx[16];
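+  // col[] holds the complete first-pass output (four 8x32 blocks); zero_idx[]
+  // is scratch for the OR tree that tests whether a loaded block is all zero.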
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
@@ -3546,66 +3773,63 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
stp2_30, stp2_31;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i, j, i32;
- __m128i zero_idx[16];
int zero_flag[2];
- // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct.
- for (i = 0; i < 8; i++) {
+ for (i = 0; i < 4; i++) {
i32 = (i << 5);
- if (i < 4) {
// First 1-D idct
// Load input data.
- LOAD_DQCOEFF(in0, input);
- LOAD_DQCOEFF(in8, input);
- LOAD_DQCOEFF(in16, input);
- LOAD_DQCOEFF(in24, input);
- LOAD_DQCOEFF(in1, input);
- LOAD_DQCOEFF(in9, input);
- LOAD_DQCOEFF(in17, input);
- LOAD_DQCOEFF(in25, input);
- LOAD_DQCOEFF(in2, input);
- LOAD_DQCOEFF(in10, input);
- LOAD_DQCOEFF(in18, input);
- LOAD_DQCOEFF(in26, input);
- LOAD_DQCOEFF(in3, input);
- LOAD_DQCOEFF(in11, input);
- LOAD_DQCOEFF(in19, input);
- LOAD_DQCOEFF(in27, input);
-
- LOAD_DQCOEFF(in4, input);
- LOAD_DQCOEFF(in12, input);
- LOAD_DQCOEFF(in20, input);
- LOAD_DQCOEFF(in28, input);
- LOAD_DQCOEFF(in5, input);
- LOAD_DQCOEFF(in13, input);
- LOAD_DQCOEFF(in21, input);
- LOAD_DQCOEFF(in29, input);
- LOAD_DQCOEFF(in6, input);
- LOAD_DQCOEFF(in14, input);
- LOAD_DQCOEFF(in22, input);
- LOAD_DQCOEFF(in30, input);
- LOAD_DQCOEFF(in7, input);
- LOAD_DQCOEFF(in15, input);
- LOAD_DQCOEFF(in23, input);
- LOAD_DQCOEFF(in31, input);
+ LOAD_DQCOEFF(in[0], input);
+ LOAD_DQCOEFF(in[8], input);
+ LOAD_DQCOEFF(in[16], input);
+ LOAD_DQCOEFF(in[24], input);
+ LOAD_DQCOEFF(in[1], input);
+ LOAD_DQCOEFF(in[9], input);
+ LOAD_DQCOEFF(in[17], input);
+ LOAD_DQCOEFF(in[25], input);
+ LOAD_DQCOEFF(in[2], input);
+ LOAD_DQCOEFF(in[10], input);
+ LOAD_DQCOEFF(in[18], input);
+ LOAD_DQCOEFF(in[26], input);
+ LOAD_DQCOEFF(in[3], input);
+ LOAD_DQCOEFF(in[11], input);
+ LOAD_DQCOEFF(in[19], input);
+ LOAD_DQCOEFF(in[27], input);
+
+ LOAD_DQCOEFF(in[4], input);
+ LOAD_DQCOEFF(in[12], input);
+ LOAD_DQCOEFF(in[20], input);
+ LOAD_DQCOEFF(in[28], input);
+ LOAD_DQCOEFF(in[5], input);
+ LOAD_DQCOEFF(in[13], input);
+ LOAD_DQCOEFF(in[21], input);
+ LOAD_DQCOEFF(in[29], input);
+ LOAD_DQCOEFF(in[6], input);
+ LOAD_DQCOEFF(in[14], input);
+ LOAD_DQCOEFF(in[22], input);
+ LOAD_DQCOEFF(in[30], input);
+ LOAD_DQCOEFF(in[7], input);
+ LOAD_DQCOEFF(in[15], input);
+ LOAD_DQCOEFF(in[23], input);
+ LOAD_DQCOEFF(in[31], input);
// checking if all entries are zero
- zero_idx[0] = _mm_or_si128(in0, in1);
- zero_idx[1] = _mm_or_si128(in2, in3);
- zero_idx[2] = _mm_or_si128(in4, in5);
- zero_idx[3] = _mm_or_si128(in6, in7);
- zero_idx[4] = _mm_or_si128(in8, in9);
- zero_idx[5] = _mm_or_si128(in10, in11);
- zero_idx[6] = _mm_or_si128(in12, in13);
- zero_idx[7] = _mm_or_si128(in14, in15);
- zero_idx[8] = _mm_or_si128(in16, in17);
- zero_idx[9] = _mm_or_si128(in18, in19);
- zero_idx[10] = _mm_or_si128(in20, in21);
- zero_idx[11] = _mm_or_si128(in22, in23);
- zero_idx[12] = _mm_or_si128(in24, in25);
- zero_idx[13] = _mm_or_si128(in26, in27);
- zero_idx[14] = _mm_or_si128(in28, in29);
- zero_idx[15] = _mm_or_si128(in30, in31);
+ zero_idx[0] = _mm_or_si128(in[0], in[1]);
+ zero_idx[1] = _mm_or_si128(in[2], in[3]);
+ zero_idx[2] = _mm_or_si128(in[4], in[5]);
+ zero_idx[3] = _mm_or_si128(in[6], in[7]);
+ zero_idx[4] = _mm_or_si128(in[8], in[9]);
+ zero_idx[5] = _mm_or_si128(in[10], in[11]);
+ zero_idx[6] = _mm_or_si128(in[12], in[13]);
+ zero_idx[7] = _mm_or_si128(in[14], in[15]);
+ zero_idx[8] = _mm_or_si128(in[16], in[17]);
+ zero_idx[9] = _mm_or_si128(in[18], in[19]);
+ zero_idx[10] = _mm_or_si128(in[20], in[21]);
+ zero_idx[11] = _mm_or_si128(in[22], in[23]);
+ zero_idx[12] = _mm_or_si128(in[24], in[25]);
+ zero_idx[13] = _mm_or_si128(in[26], in[27]);
+ zero_idx[14] = _mm_or_si128(in[28], in[29]);
+ zero_idx[15] = _mm_or_si128(in[30], in[31]);
zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
@@ -3667,44 +3891,13 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
}
// Transpose 32x8 block to 8x32 block
- TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
- in4, in5, in6, in7);
- TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
- in10, in11, in12, in13, in14, in15);
- TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
- in18, in19, in20, in21, in22, in23);
- TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
- in26, in27, in28, in29, in30, in31);
- } else {
- // Second 1-D idct
- j = i - 4;
-
- // Transpose 32x8 block to 8x32 block
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
- in5, in6, in7);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
- in11, in12, in13, in14, in15);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
- in19, in20, in21, in22, in23);
- j += 4;
- TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
- col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
- col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
- in28, in29, in30, in31);
- }
+ array_transpose_8x8(in, in);
+ array_transpose_8x8(in+8, in+8);
+ array_transpose_8x8(in+16, in+16);
+ array_transpose_8x8(in+24, in+24);
- IDCT32_1D
+ IDCT32_1D
- // final stage
- if (i < 4) {
// 1_D: Store 32 intermediate results for each 8x32 block.
col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
@@ -3738,146 +3931,156 @@ void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
- } else {
+ }
+ for (i = 0; i < 4; i++) {
const __m128i zero = _mm_setzero_si128();
+ // Second 1-D idct
+ j = i << 3;
+
+ // Transpose 32x8 block to 8x32 block
+ array_transpose_8x8(col+j, in);
+ array_transpose_8x8(col+j+32, in+8);
+ array_transpose_8x8(col+j+64, in+16);
+ array_transpose_8x8(col+j+96, in+24);
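+      // Each iteration gathers, via four 8x8 transposes, the first-pass
+      // results that feed one 8-wide strip of the output.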
+
+ IDCT32_1D
// 2_D: Calculate the results and store them to destination.
- in0 = _mm_add_epi16(stp1_0, stp1_31);
- in1 = _mm_add_epi16(stp1_1, stp1_30);
- in2 = _mm_add_epi16(stp1_2, stp1_29);
- in3 = _mm_add_epi16(stp1_3, stp1_28);
- in4 = _mm_add_epi16(stp1_4, stp1_27);
- in5 = _mm_add_epi16(stp1_5, stp1_26);
- in6 = _mm_add_epi16(stp1_6, stp1_25);
- in7 = _mm_add_epi16(stp1_7, stp1_24);
- in8 = _mm_add_epi16(stp1_8, stp1_23);
- in9 = _mm_add_epi16(stp1_9, stp1_22);
- in10 = _mm_add_epi16(stp1_10, stp1_21);
- in11 = _mm_add_epi16(stp1_11, stp1_20);
- in12 = _mm_add_epi16(stp1_12, stp1_19);
- in13 = _mm_add_epi16(stp1_13, stp1_18);
- in14 = _mm_add_epi16(stp1_14, stp1_17);
- in15 = _mm_add_epi16(stp1_15, stp1_16);
- in16 = _mm_sub_epi16(stp1_15, stp1_16);
- in17 = _mm_sub_epi16(stp1_14, stp1_17);
- in18 = _mm_sub_epi16(stp1_13, stp1_18);
- in19 = _mm_sub_epi16(stp1_12, stp1_19);
- in20 = _mm_sub_epi16(stp1_11, stp1_20);
- in21 = _mm_sub_epi16(stp1_10, stp1_21);
- in22 = _mm_sub_epi16(stp1_9, stp1_22);
- in23 = _mm_sub_epi16(stp1_8, stp1_23);
- in24 = _mm_sub_epi16(stp1_7, stp1_24);
- in25 = _mm_sub_epi16(stp1_6, stp1_25);
- in26 = _mm_sub_epi16(stp1_5, stp1_26);
- in27 = _mm_sub_epi16(stp1_4, stp1_27);
- in28 = _mm_sub_epi16(stp1_3, stp1_28);
- in29 = _mm_sub_epi16(stp1_2, stp1_29);
- in30 = _mm_sub_epi16(stp1_1, stp1_30);
- in31 = _mm_sub_epi16(stp1_0, stp1_31);
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
// Final rounding and shift
- in0 = _mm_adds_epi16(in0, final_rounding);
- in1 = _mm_adds_epi16(in1, final_rounding);
- in2 = _mm_adds_epi16(in2, final_rounding);
- in3 = _mm_adds_epi16(in3, final_rounding);
- in4 = _mm_adds_epi16(in4, final_rounding);
- in5 = _mm_adds_epi16(in5, final_rounding);
- in6 = _mm_adds_epi16(in6, final_rounding);
- in7 = _mm_adds_epi16(in7, final_rounding);
- in8 = _mm_adds_epi16(in8, final_rounding);
- in9 = _mm_adds_epi16(in9, final_rounding);
- in10 = _mm_adds_epi16(in10, final_rounding);
- in11 = _mm_adds_epi16(in11, final_rounding);
- in12 = _mm_adds_epi16(in12, final_rounding);
- in13 = _mm_adds_epi16(in13, final_rounding);
- in14 = _mm_adds_epi16(in14, final_rounding);
- in15 = _mm_adds_epi16(in15, final_rounding);
- in16 = _mm_adds_epi16(in16, final_rounding);
- in17 = _mm_adds_epi16(in17, final_rounding);
- in18 = _mm_adds_epi16(in18, final_rounding);
- in19 = _mm_adds_epi16(in19, final_rounding);
- in20 = _mm_adds_epi16(in20, final_rounding);
- in21 = _mm_adds_epi16(in21, final_rounding);
- in22 = _mm_adds_epi16(in22, final_rounding);
- in23 = _mm_adds_epi16(in23, final_rounding);
- in24 = _mm_adds_epi16(in24, final_rounding);
- in25 = _mm_adds_epi16(in25, final_rounding);
- in26 = _mm_adds_epi16(in26, final_rounding);
- in27 = _mm_adds_epi16(in27, final_rounding);
- in28 = _mm_adds_epi16(in28, final_rounding);
- in29 = _mm_adds_epi16(in29, final_rounding);
- in30 = _mm_adds_epi16(in30, final_rounding);
- in31 = _mm_adds_epi16(in31, final_rounding);
-
- in0 = _mm_srai_epi16(in0, 6);
- in1 = _mm_srai_epi16(in1, 6);
- in2 = _mm_srai_epi16(in2, 6);
- in3 = _mm_srai_epi16(in3, 6);
- in4 = _mm_srai_epi16(in4, 6);
- in5 = _mm_srai_epi16(in5, 6);
- in6 = _mm_srai_epi16(in6, 6);
- in7 = _mm_srai_epi16(in7, 6);
- in8 = _mm_srai_epi16(in8, 6);
- in9 = _mm_srai_epi16(in9, 6);
- in10 = _mm_srai_epi16(in10, 6);
- in11 = _mm_srai_epi16(in11, 6);
- in12 = _mm_srai_epi16(in12, 6);
- in13 = _mm_srai_epi16(in13, 6);
- in14 = _mm_srai_epi16(in14, 6);
- in15 = _mm_srai_epi16(in15, 6);
- in16 = _mm_srai_epi16(in16, 6);
- in17 = _mm_srai_epi16(in17, 6);
- in18 = _mm_srai_epi16(in18, 6);
- in19 = _mm_srai_epi16(in19, 6);
- in20 = _mm_srai_epi16(in20, 6);
- in21 = _mm_srai_epi16(in21, 6);
- in22 = _mm_srai_epi16(in22, 6);
- in23 = _mm_srai_epi16(in23, 6);
- in24 = _mm_srai_epi16(in24, 6);
- in25 = _mm_srai_epi16(in25, 6);
- in26 = _mm_srai_epi16(in26, 6);
- in27 = _mm_srai_epi16(in27, 6);
- in28 = _mm_srai_epi16(in28, 6);
- in29 = _mm_srai_epi16(in29, 6);
- in30 = _mm_srai_epi16(in30, 6);
- in31 = _mm_srai_epi16(in31, 6);
-
- RECON_AND_STORE(dest, in0);
- RECON_AND_STORE(dest, in1);
- RECON_AND_STORE(dest, in2);
- RECON_AND_STORE(dest, in3);
- RECON_AND_STORE(dest, in4);
- RECON_AND_STORE(dest, in5);
- RECON_AND_STORE(dest, in6);
- RECON_AND_STORE(dest, in7);
- RECON_AND_STORE(dest, in8);
- RECON_AND_STORE(dest, in9);
- RECON_AND_STORE(dest, in10);
- RECON_AND_STORE(dest, in11);
- RECON_AND_STORE(dest, in12);
- RECON_AND_STORE(dest, in13);
- RECON_AND_STORE(dest, in14);
- RECON_AND_STORE(dest, in15);
- RECON_AND_STORE(dest, in16);
- RECON_AND_STORE(dest, in17);
- RECON_AND_STORE(dest, in18);
- RECON_AND_STORE(dest, in19);
- RECON_AND_STORE(dest, in20);
- RECON_AND_STORE(dest, in21);
- RECON_AND_STORE(dest, in22);
- RECON_AND_STORE(dest, in23);
- RECON_AND_STORE(dest, in24);
- RECON_AND_STORE(dest, in25);
- RECON_AND_STORE(dest, in26);
- RECON_AND_STORE(dest, in27);
- RECON_AND_STORE(dest, in28);
- RECON_AND_STORE(dest, in29);
- RECON_AND_STORE(dest, in30);
- RECON_AND_STORE(dest, in31);
+ in[0] = _mm_adds_epi16(in[0], final_rounding);
+ in[1] = _mm_adds_epi16(in[1], final_rounding);
+ in[2] = _mm_adds_epi16(in[2], final_rounding);
+ in[3] = _mm_adds_epi16(in[3], final_rounding);
+ in[4] = _mm_adds_epi16(in[4], final_rounding);
+ in[5] = _mm_adds_epi16(in[5], final_rounding);
+ in[6] = _mm_adds_epi16(in[6], final_rounding);
+ in[7] = _mm_adds_epi16(in[7], final_rounding);
+ in[8] = _mm_adds_epi16(in[8], final_rounding);
+ in[9] = _mm_adds_epi16(in[9], final_rounding);
+ in[10] = _mm_adds_epi16(in[10], final_rounding);
+ in[11] = _mm_adds_epi16(in[11], final_rounding);
+ in[12] = _mm_adds_epi16(in[12], final_rounding);
+ in[13] = _mm_adds_epi16(in[13], final_rounding);
+ in[14] = _mm_adds_epi16(in[14], final_rounding);
+ in[15] = _mm_adds_epi16(in[15], final_rounding);
+ in[16] = _mm_adds_epi16(in[16], final_rounding);
+ in[17] = _mm_adds_epi16(in[17], final_rounding);
+ in[18] = _mm_adds_epi16(in[18], final_rounding);
+ in[19] = _mm_adds_epi16(in[19], final_rounding);
+ in[20] = _mm_adds_epi16(in[20], final_rounding);
+ in[21] = _mm_adds_epi16(in[21], final_rounding);
+ in[22] = _mm_adds_epi16(in[22], final_rounding);
+ in[23] = _mm_adds_epi16(in[23], final_rounding);
+ in[24] = _mm_adds_epi16(in[24], final_rounding);
+ in[25] = _mm_adds_epi16(in[25], final_rounding);
+ in[26] = _mm_adds_epi16(in[26], final_rounding);
+ in[27] = _mm_adds_epi16(in[27], final_rounding);
+ in[28] = _mm_adds_epi16(in[28], final_rounding);
+ in[29] = _mm_adds_epi16(in[29], final_rounding);
+ in[30] = _mm_adds_epi16(in[30], final_rounding);
+ in[31] = _mm_adds_epi16(in[31], final_rounding);
+
+ in[0] = _mm_srai_epi16(in[0], 6);
+ in[1] = _mm_srai_epi16(in[1], 6);
+ in[2] = _mm_srai_epi16(in[2], 6);
+ in[3] = _mm_srai_epi16(in[3], 6);
+ in[4] = _mm_srai_epi16(in[4], 6);
+ in[5] = _mm_srai_epi16(in[5], 6);
+ in[6] = _mm_srai_epi16(in[6], 6);
+ in[7] = _mm_srai_epi16(in[7], 6);
+ in[8] = _mm_srai_epi16(in[8], 6);
+ in[9] = _mm_srai_epi16(in[9], 6);
+ in[10] = _mm_srai_epi16(in[10], 6);
+ in[11] = _mm_srai_epi16(in[11], 6);
+ in[12] = _mm_srai_epi16(in[12], 6);
+ in[13] = _mm_srai_epi16(in[13], 6);
+ in[14] = _mm_srai_epi16(in[14], 6);
+ in[15] = _mm_srai_epi16(in[15], 6);
+ in[16] = _mm_srai_epi16(in[16], 6);
+ in[17] = _mm_srai_epi16(in[17], 6);
+ in[18] = _mm_srai_epi16(in[18], 6);
+ in[19] = _mm_srai_epi16(in[19], 6);
+ in[20] = _mm_srai_epi16(in[20], 6);
+ in[21] = _mm_srai_epi16(in[21], 6);
+ in[22] = _mm_srai_epi16(in[22], 6);
+ in[23] = _mm_srai_epi16(in[23], 6);
+ in[24] = _mm_srai_epi16(in[24], 6);
+ in[25] = _mm_srai_epi16(in[25], 6);
+ in[26] = _mm_srai_epi16(in[26], 6);
+ in[27] = _mm_srai_epi16(in[27], 6);
+ in[28] = _mm_srai_epi16(in[28], 6);
+ in[29] = _mm_srai_epi16(in[29], 6);
+ in[30] = _mm_srai_epi16(in[30], 6);
+ in[31] = _mm_srai_epi16(in[31], 6);
+
+ RECON_AND_STORE(dest, in[0]);
+ RECON_AND_STORE(dest, in[1]);
+ RECON_AND_STORE(dest, in[2]);
+ RECON_AND_STORE(dest, in[3]);
+ RECON_AND_STORE(dest, in[4]);
+ RECON_AND_STORE(dest, in[5]);
+ RECON_AND_STORE(dest, in[6]);
+ RECON_AND_STORE(dest, in[7]);
+ RECON_AND_STORE(dest, in[8]);
+ RECON_AND_STORE(dest, in[9]);
+ RECON_AND_STORE(dest, in[10]);
+ RECON_AND_STORE(dest, in[11]);
+ RECON_AND_STORE(dest, in[12]);
+ RECON_AND_STORE(dest, in[13]);
+ RECON_AND_STORE(dest, in[14]);
+ RECON_AND_STORE(dest, in[15]);
+ RECON_AND_STORE(dest, in[16]);
+ RECON_AND_STORE(dest, in[17]);
+ RECON_AND_STORE(dest, in[18]);
+ RECON_AND_STORE(dest, in[19]);
+ RECON_AND_STORE(dest, in[20]);
+ RECON_AND_STORE(dest, in[21]);
+ RECON_AND_STORE(dest, in[22]);
+ RECON_AND_STORE(dest, in[23]);
+ RECON_AND_STORE(dest, in[24]);
+ RECON_AND_STORE(dest, in[25]);
+ RECON_AND_STORE(dest, in[26]);
+ RECON_AND_STORE(dest, in[27]);
+ RECON_AND_STORE(dest, in[28]);
+ RECON_AND_STORE(dest, in[29]);
+ RECON_AND_STORE(dest, in[30]);
+ RECON_AND_STORE(dest, in[31]);
dest += 8 - (stride * 32);
}
- }
} //NOLINT
void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
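Note on the hunk above: it restructures the second 1-D pass of vp9_idct32x32_1024_add_sse2 so the thirty-two separate in0..in31 registers become an in[32] array and the per-eight-column work runs in a four-iteration loop instead of a duplicated branch. The rounding-and-shift step is the usual (x + 32) >> 6 scaling applied before the residual is added to the prediction. A minimal scalar sketch of that last step, assuming final_rounding holds 1 << 5 (consistent with the shift by 6) and that RECON_AND_STORE adds the 8-bit prediction and saturates; recon_pixel is a hypothetical helper for illustration, not part of the patch:

/* Scalar reference for the SIMD rounding/shift/reconstruction above;
 * the vector code performs the same operation on eight int16_t lanes
 * per register (hypothetical helper, illustration only). */
static unsigned char recon_pixel(int residual, unsigned char pred) {
  int value = (residual + 32) >> 6;  /* _mm_adds_epi16(x, final_rounding); _mm_srai_epi16(x, 6) */
  value += pred;                     /* add the predicted pixel */
  if (value < 0) value = 0;          /* saturate, as the pack-to-unsigned step does */
  if (value > 255) value = 255;
  return (unsigned char)value;
}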
diff --git a/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c b/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
new file mode 100644
index 0000000..3c5cb8f
--- /dev/null
+++ b/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_avx2.c
@@ -0,0 +1,943 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
+ __m128i abs_p1p0;
+
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
+ q4p4 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
+ q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
+ q3p3 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
+ q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
+ q2p2 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
+ q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
+ q1p1 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
+ q0p0 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+ _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8(0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+ _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+ _mm_subs_epu8(p1q1, q1p1));
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+ _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+ _mm_subs_epu8(q2p2, q3p3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i t1 = _mm_set1_epi16(0x1);
+ __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
+ __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
+ __m128i qs0 = _mm_xor_si128(p0q0, t80);
+ __m128i qs1 = _mm_xor_si128(p1q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, qs0ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 0xB);
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 0xB);
+
+ /* Filter1 >> 3 */
+ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
+ qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi16(filter1, t1);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(
+ _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
+ filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
+ qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ flat = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+ _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+ _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
+ q5p5 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q5p5),
+ (__m64 *) (s + 5 * p)));
+
+ q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
+ q6p6 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q6p6),
+ (__m64 *) (s + 6 * p)));
+
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
+ _mm_subs_epu8(q0p0, q4p4)),
+ _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
+ _mm_subs_epu8(q0p0, q5p5)));
+
+ q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
+ q7p7 = _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(q7p7),
+ (__m64 *) (s + 7 * p)));
+
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
+ _mm_subs_epu8(q0p0, q6p6)),
+ _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
+ _mm_subs_epu8(q0p0, q7p7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
+
+ p7_16 = _mm_unpacklo_epi8(q7p7, zero);
+ p6_16 = _mm_unpacklo_epi8(q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(q6p6, zero);
+ q7_16 = _mm_unpackhi_epi8(q7p7, zero);
+
+ pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
+ _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
+ _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
+ _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
+ _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p = _mm_add_epi16(eight,
+ _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(four,
+ _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
+ 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(p7_16, p7_16);
+ sum_q7 = _mm_add_epi16(q7_16, q7_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
+ 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0,
+ _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0,
+ _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p7 = _mm_add_epi16(sum_p7, p7_16);
+ sum_q7 = _mm_add_epi16(sum_q7, q7_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
+ 4);
+ flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
+ }
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ flat = _mm_shuffle_epi32(flat, 68);
+ flat2 = _mm_shuffle_epi32(flat2, 68);
+
+ q2p2 = _mm_andnot_si128(flat, q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ q6p6 = _mm_andnot_si128(flat2, q6p6);
+ flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
+ q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
+ _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
+ _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));
+
+ q5p5 = _mm_andnot_si128(flat2, q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
+ _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
+ _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));
+
+ q4p4 = _mm_andnot_si128(flat2, q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
+ _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
+ _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));
+
+ q3p3 = _mm_andnot_si128(flat2, q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
+ _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
+ _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));
+
+ q2p2 = _mm_andnot_si128(flat2, q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
+ _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
+ _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));
+
+ q1p1 = _mm_andnot_si128(flat2, q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
+ _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
+ _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));
+
+ q0p0 = _mm_andnot_si128(flat2, q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
+ _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
+ _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
+ }
+}
+
+static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i mask, hev, flat, flat2;
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i p7, p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q5, q6, q7;
+
+ const __m128i thresh = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _thresh[0]));
+ const __m128i limit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _limit[0]));
+ const __m128i blimit = _mm_broadcastb_epi8(
+ _mm_cvtsi32_si128((int) _blimit[0]));
+
+ p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));
+
+ {
+ const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+ _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+ _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+ _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+ _mm_subs_epu8(q1, p1));
+ __m128i work;
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
+ flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
+ flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
+ flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ /* Filter1 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ /* Filter2 >> 3 */
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ /* filt >> 1 */
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
+ q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
+ flat2 = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
+ q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
+ q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));
+
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
+ q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
+ p256_0, q256_0;
+ __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
+ pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
+ res_q;
+
+ p256_7 = _mm256_cvtepu8_epi16(p7);
+ p256_6 = _mm256_cvtepu8_epi16(p6);
+ p256_5 = _mm256_cvtepu8_epi16(p5);
+ p256_4 = _mm256_cvtepu8_epi16(p4);
+ p256_3 = _mm256_cvtepu8_epi16(p3);
+ p256_2 = _mm256_cvtepu8_epi16(p2);
+ p256_1 = _mm256_cvtepu8_epi16(p1);
+ p256_0 = _mm256_cvtepu8_epi16(p0);
+ q256_0 = _mm256_cvtepu8_epi16(q0);
+ q256_1 = _mm256_cvtepu8_epi16(q1);
+ q256_2 = _mm256_cvtepu8_epi16(q2);
+ q256_3 = _mm256_cvtepu8_epi16(q3);
+ q256_4 = _mm256_cvtepu8_epi16(q4);
+ q256_5 = _mm256_cvtepu8_epi16(q5);
+ q256_6 = _mm256_cvtepu8_epi16(q6);
+ q256_7 = _mm256_cvtepu8_epi16(q7);
+
+ pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
+ _mm256_add_epi16(p256_4, p256_3));
+ pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
+ _mm256_add_epi16(q256_4, q256_3));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
+ _mm256_add_epi16(p256_2, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
+ _mm256_add_epi16(q256_2, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(eight,
+ _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(four,
+ _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(p256_7, p256_0)), 4);
+
+ flat2_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(q256_7, q256_0)), 4);
+
+ flat2_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(p256_3, p256_0)), 3);
+
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(q256_3, q256_0)), 3);
+
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(p256_7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(q256_7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(p256_3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(q256_3, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_1)), 4);
+
+ flat2_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_1)), 4);
+
+ flat2_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_1)), 3);
+
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_1)), 3);
+
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
+
+ sum_q3 = _mm256_add_epi16(sum_q3, q256_3);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_2)), 4);
+
+ flat2_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_2)), 4);
+
+ flat2_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
+
+ pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_p2p1p0,
+ _mm256_add_epi16(sum_p3, p256_2)), 3);
+
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixetFilter_q2q1q0,
+ _mm256_add_epi16(sum_q3, q256_2)), 3);
+
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_3)), 4);
+
+ flat2_p3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_3)), 4);
+
+ flat2_q3 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_4)), 4);
+
+ flat2_p4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_4)), 4);
+
+ flat2_q4 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_5)), 4);
+
+ flat2_p5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_5)), 4);
+
+ flat2_q5 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+
+ sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
+
+ sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
+
+ pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
+
+ pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
+
+ res_p = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_p,
+ _mm256_add_epi16(sum_p7, p256_6)), 4);
+
+ flat2_p6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
+ 168));
+
+ res_q = _mm256_srli_epi16(
+ _mm256_add_epi16(pixelFilter_q,
+ _mm256_add_epi16(sum_q7, q256_6)), 4);
+
+ flat2_q6 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
+ 168));
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ p6 = _mm_andnot_si128(flat2, p6);
+ flat2_p6 = _mm_and_si128(flat2, flat2_p6);
+ p6 = _mm_or_si128(flat2_p6, p6);
+ _mm_storeu_si128((__m128i *) (s - 7 * p), p6);
+
+ p5 = _mm_andnot_si128(flat2, p5);
+ flat2_p5 = _mm_and_si128(flat2, flat2_p5);
+ p5 = _mm_or_si128(flat2_p5, p5);
+ _mm_storeu_si128((__m128i *) (s - 6 * p), p5);
+
+ p4 = _mm_andnot_si128(flat2, p4);
+ flat2_p4 = _mm_and_si128(flat2, flat2_p4);
+ p4 = _mm_or_si128(flat2_p4, p4);
+ _mm_storeu_si128((__m128i *) (s - 5 * p), p4);
+
+ p3 = _mm_andnot_si128(flat2, p3);
+ flat2_p3 = _mm_and_si128(flat2, flat2_p3);
+ p3 = _mm_or_si128(flat2_p3, p3);
+ _mm_storeu_si128((__m128i *) (s - 4 * p), p3);
+
+ p2 = _mm_andnot_si128(flat2, p2);
+ flat2_p2 = _mm_and_si128(flat2, flat2_p2);
+ p2 = _mm_or_si128(flat2_p2, p2);
+ _mm_storeu_si128((__m128i *) (s - 3 * p), p2);
+
+ p1 = _mm_andnot_si128(flat2, p1);
+ flat2_p1 = _mm_and_si128(flat2, flat2_p1);
+ p1 = _mm_or_si128(flat2_p1, p1);
+ _mm_storeu_si128((__m128i *) (s - 2 * p), p1);
+
+ p0 = _mm_andnot_si128(flat2, p0);
+ flat2_p0 = _mm_and_si128(flat2, flat2_p0);
+ p0 = _mm_or_si128(flat2_p0, p0);
+ _mm_storeu_si128((__m128i *) (s - 1 * p), p0);
+
+ q0 = _mm_andnot_si128(flat2, q0);
+ flat2_q0 = _mm_and_si128(flat2, flat2_q0);
+ q0 = _mm_or_si128(flat2_q0, q0);
+ _mm_storeu_si128((__m128i *) (s - 0 * p), q0);
+
+ q1 = _mm_andnot_si128(flat2, q1);
+ flat2_q1 = _mm_and_si128(flat2, flat2_q1);
+ q1 = _mm_or_si128(flat2_q1, q1);
+ _mm_storeu_si128((__m128i *) (s + 1 * p), q1);
+
+ q2 = _mm_andnot_si128(flat2, q2);
+ flat2_q2 = _mm_and_si128(flat2, flat2_q2);
+ q2 = _mm_or_si128(flat2_q2, q2);
+ _mm_storeu_si128((__m128i *) (s + 2 * p), q2);
+
+ q3 = _mm_andnot_si128(flat2, q3);
+ flat2_q3 = _mm_and_si128(flat2, flat2_q3);
+ q3 = _mm_or_si128(flat2_q3, q3);
+ _mm_storeu_si128((__m128i *) (s + 3 * p), q3);
+
+ q4 = _mm_andnot_si128(flat2, q4);
+ flat2_q4 = _mm_and_si128(flat2, flat2_q4);
+ q4 = _mm_or_si128(flat2_q4, q4);
+ _mm_storeu_si128((__m128i *) (s + 4 * p), q4);
+
+ q5 = _mm_andnot_si128(flat2, q5);
+ flat2_q5 = _mm_and_si128(flat2, flat2_q5);
+ q5 = _mm_or_si128(flat2_q5, q5);
+ _mm_storeu_si128((__m128i *) (s + 5 * p), q5);
+
+ q6 = _mm_andnot_si128(flat2, q6);
+ flat2_q6 = _mm_and_si128(flat2, flat2_q6);
+ q6 = _mm_or_si128(flat2_q6, q6);
+ _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
+ }
+}
+
+void vp9_mb_lpf_horizontal_edge_w_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh, int count) {
+ if (count == 1)
+ mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
+ else
+ mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
+}
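Note on the new file above: it adds AVX2 versions of the 16-wide horizontal loop filter. vp9_mb_lpf_horizontal_edge_w_avx2 dispatches on count: the 8-pixel variant keeps the p- and q-side rows packed together in single __m128i registers (q7p7 .. q0p0), while the 16-pixel variant widens each row to 16-bit lanes with _mm256_cvtepu8_epi16 so the flat/wide-flat sums for all sixteen columns fit in one __m256i. The saturating-subtract and compare sequence at the top of both variants builds the same edge mask described by the inline comments; a scalar restatement of those conditions follows as a sketch only (filter_mask_scalar is hypothetical and not part of the patch):

#include <stdlib.h>  /* abs */

/* One-column version of the mask built with _mm_subs_epu8/_mm_cmpeq_epi8
 * above; returns nonzero where filtering is allowed (hypothetical helper,
 * illustration only). */
static int filter_mask_scalar(unsigned char blimit, unsigned char limit,
                              unsigned char p3, unsigned char p2,
                              unsigned char p1, unsigned char p0,
                              unsigned char q0, unsigned char q1,
                              unsigned char q2, unsigned char q3) {
  int over = 0;
  over |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit);
  over |= (abs(p1 - p0) > limit) | (abs(q1 - q0) > limit);
  over |= (abs(p2 - p1) > limit) | (abs(p3 - p2) > limit);
  over |= (abs(q2 - q1) > limit) | (abs(q3 - q2) > limit);
  return !over;  /* the SIMD code ends with _mm_cmpeq_epi8(mask, zero) */
}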
diff --git a/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index fa4dd9b..3ca55cf 100644
--- a/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/source/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <emmintrin.h> /* SSE2 */
+#include <emmintrin.h> // SSE2
#include "vp9/common/vp9_loopfilter.h"
#include "vpx_ports/emmintrin_compat.h"
@@ -17,20 +17,14 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
- __m128i mask, hev, flat, flat2;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+ __m128i mask, hev, flat, flat2;
__m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
__m128i abs_p1p0;
- const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
- const unsigned int extended_limit = _limit[0] * 0x01010101u;
- const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
- const __m128i thresh =
- _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
- const __m128i limit =
- _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
- const __m128i blimit =
- _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
@@ -105,7 +99,7 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
- /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
filter1 = _mm_adds_epi8(filt, t4);
@@ -116,11 +110,11 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s,
filter2 = _mm_unpacklo_epi8(zero, filter2);
filter2 = _mm_srai_epi16(filter2, 0xB);
- /* Filter1 >> 3 */
+ // Filter1 >> 3
filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
- /* filt >> 1 */
+ // filt >> 1
filt = _mm_adds_epi16(filter1, t1);
filt = _mm_srai_epi16(filt, 1);
filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
@@ -375,32 +369,25 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
const unsigned char *_blimit,
const unsigned char *_limit,
const unsigned char *_thresh) {
- DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
- DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);
-
- DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16);
- DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
- DECLARE_ALIGNED(16, unsigned char, aq[8][16]);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16);
- __m128i mask, hev, flat, flat2;
const __m128i zero = _mm_set1_epi16(0);
const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+ __m128i mask, hev, flat, flat2;
__m128i p7, p6, p5;
__m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
__m128i q5, q6, q7;
int i = 0;
- const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
- const unsigned int extended_limit = _limit[0] * 0x01010101u;
- const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
- const __m128i thresh =
- _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
- const __m128i limit =
- _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
- const __m128i blimit =
- _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
@@ -413,16 +400,16 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
- _mm_store_si128((__m128i *)ap[4], p4);
- _mm_store_si128((__m128i *)ap[3], p3);
- _mm_store_si128((__m128i *)ap[2], p2);
- _mm_store_si128((__m128i *)ap[1], p1);
- _mm_store_si128((__m128i *)ap[0], p0);
- _mm_store_si128((__m128i *)aq[4], q4);
- _mm_store_si128((__m128i *)aq[3], q3);
- _mm_store_si128((__m128i *)aq[2], q2);
- _mm_store_si128((__m128i *)aq[1], q1);
- _mm_store_si128((__m128i *)aq[0], q0);
+ _mm_store_si128((__m128i *)&ap[4 * 16], p4);
+ _mm_store_si128((__m128i *)&ap[3 * 16], p3);
+ _mm_store_si128((__m128i *)&ap[2 * 16], p2);
+ _mm_store_si128((__m128i *)&ap[1 * 16], p1);
+ _mm_store_si128((__m128i *)&ap[0 * 16], p0);
+ _mm_store_si128((__m128i *)&aq[4 * 16], q4);
+ _mm_store_si128((__m128i *)&aq[3 * 16], q3);
+ _mm_store_si128((__m128i *)&aq[2 * 16], q2);
+ _mm_store_si128((__m128i *)&aq[1 * 16], q1);
+ _mm_store_si128((__m128i *)&aq[0 * 16], q0);
{
@@ -486,13 +473,13 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
- /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
filter1 = _mm_adds_epi8(filt, t4);
filter2 = _mm_adds_epi8(filt, t3);
- /* Filter1 >> 3 */
+ // Filter1 >> 3
work_a = _mm_cmpgt_epi8(zero, filter1);
filter1 = _mm_srli_epi16(filter1, 3);
work_a = _mm_and_si128(work_a, te0);
@@ -500,7 +487,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
filter1 = _mm_or_si128(filter1, work_a);
qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
- /* Filter2 >> 3 */
+ // Filter2 >> 3
work_a = _mm_cmpgt_epi8(zero, filter2);
filter2 = _mm_srli_epi16(filter2, 3);
work_a = _mm_and_si128(work_a, te0);
@@ -508,7 +495,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
filter2 = _mm_or_si128(filter2, work_a);
ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
- /* filt >> 1 */
+ // filt >> 1
filt = _mm_adds_epi8(filter1, t1);
work_a = _mm_cmpgt_epi8(zero, filt);
filt = _mm_srli_epi16(filt, 1);
@@ -546,8 +533,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
_mm_subs_epu8(p0, p5)),
_mm_or_si128(_mm_subs_epu8(q5, q0),
_mm_subs_epu8(q0, q5)));
- _mm_store_si128((__m128i *)ap[5], p5);
- _mm_store_si128((__m128i *)aq[5], q5);
+ _mm_store_si128((__m128i *)&ap[5 * 16], p5);
+ _mm_store_si128((__m128i *)&aq[5 * 16], q5);
flat2 = _mm_max_epu8(work, flat2);
p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
@@ -555,8 +542,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
_mm_subs_epu8(p0, p6)),
_mm_or_si128(_mm_subs_epu8(q6, q0),
_mm_subs_epu8(q0, q6)));
- _mm_store_si128((__m128i *)ap[6], p6);
- _mm_store_si128((__m128i *)aq[6], q6);
+ _mm_store_si128((__m128i *)&ap[6 * 16], p6);
+ _mm_store_si128((__m128i *)&aq[6 * 16], q6);
flat2 = _mm_max_epu8(work, flat2);
p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
@@ -565,8 +552,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
_mm_subs_epu8(p0, p7)),
_mm_or_si128(_mm_subs_epu8(q7, q0),
_mm_subs_epu8(q0, q7)));
- _mm_store_si128((__m128i *)ap[7], p7);
- _mm_store_si128((__m128i *)aq[7], q7);
+ _mm_store_si128((__m128i *)&ap[7 * 16], p7);
+ _mm_store_si128((__m128i *)&aq[7 * 16], q7);
flat2 = _mm_max_epu8(work, flat2);
flat2 = _mm_subs_epu8(flat2, one);
flat2 = _mm_cmpeq_epi8(flat2, zero);
@@ -586,22 +573,38 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
__m128i a, b, c;
unsigned int off = i * 8;
- p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
- p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
- p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
- p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
- p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
- p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
- p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
- p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
- q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
- q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
- q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
- q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
- q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
- q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
- q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
- q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);
+ p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)),
+ zero);
+ p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)),
+ zero);
+ p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)),
+ zero);
+ p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)),
+ zero);
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)),
+ zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)),
+ zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)),
+ zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)),
+ zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)),
+ zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)),
+ zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)),
+ zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)),
+ zero);
+ q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)),
+ zero);
+ q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)),
+ zero);
+ q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)),
+ zero);
+ q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)),
+ zero);
c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7
c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
@@ -610,117 +613,117 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
- _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
+ _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8],
_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
, b));
c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
a = _mm_add_epi16(q1, a);
b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
- _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
+ _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8],
_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
, b));
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
a = _mm_add_epi16(q2, a);
b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
- _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
+ _mm_storel_epi64((__m128i *)&flat_op[i * 8],
_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
, b));
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
a = _mm_add_epi16(q3, a);
b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
- _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
+ _mm_storel_epi64((__m128i *)&flat_oq[i * 8],
_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
, b));
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
b = _mm_add_epi16(q3, b);
b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
- _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
+ _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8],
_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
, b));
c = _mm_add_epi16(q4, c);
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
b = _mm_add_epi16(q3, b);
b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
- _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
+ _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8],
_mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
, b));
a = _mm_add_epi16(q5, a);
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
a = _mm_add_epi16(q6, a);
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_op[i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
a = _mm_add_epi16(q7, a);
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_oq[i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
a = _mm_add_epi16(q7, a);
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
a = _mm_add_epi16(q7, a);
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
a = _mm_add_epi16(q7, a);
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
a = _mm_add_epi16(q7, a);
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
a = _mm_add_epi16(q7, a);
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
a = _mm_add_epi16(q7, a);
c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
+ _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
temp_flat2 = _mm_srli_si128(temp_flat2, 8);
@@ -730,51 +733,51 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
// wide flat
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- work_a = _mm_load_si128((__m128i *)ap[2]);
- p2 = _mm_load_si128((__m128i *)flat_op[2]);
+ work_a = _mm_load_si128((__m128i *)&ap[2 * 16]);
+ p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
work_a = _mm_andnot_si128(flat, work_a);
p2 = _mm_and_si128(flat, p2);
p2 = _mm_or_si128(work_a, p2);
- _mm_store_si128((__m128i *)flat_op[2], p2);
+ _mm_store_si128((__m128i *)&flat_op[2 * 16], p2);
- p1 = _mm_load_si128((__m128i *)flat_op[1]);
+ p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
work_a = _mm_andnot_si128(flat, ps1);
p1 = _mm_and_si128(flat, p1);
p1 = _mm_or_si128(work_a, p1);
- _mm_store_si128((__m128i *)flat_op[1], p1);
+ _mm_store_si128((__m128i *)&flat_op[1 * 16], p1);
- p0 = _mm_load_si128((__m128i *)flat_op[0]);
+ p0 = _mm_load_si128((__m128i *)&flat_op[0]);
work_a = _mm_andnot_si128(flat, ps0);
p0 = _mm_and_si128(flat, p0);
p0 = _mm_or_si128(work_a, p0);
- _mm_store_si128((__m128i *)flat_op[0], p0);
+ _mm_store_si128((__m128i *)&flat_op[0], p0);
- q0 = _mm_load_si128((__m128i *)flat_oq[0]);
+ q0 = _mm_load_si128((__m128i *)&flat_oq[0]);
work_a = _mm_andnot_si128(flat, qs0);
q0 = _mm_and_si128(flat, q0);
q0 = _mm_or_si128(work_a, q0);
- _mm_store_si128((__m128i *)flat_oq[0], q0);
+ _mm_store_si128((__m128i *)&flat_oq[0], q0);
- q1 = _mm_load_si128((__m128i *)flat_oq[1]);
+ q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
work_a = _mm_andnot_si128(flat, qs1);
q1 = _mm_and_si128(flat, q1);
q1 = _mm_or_si128(work_a, q1);
- _mm_store_si128((__m128i *)flat_oq[1], q1);
+ _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1);
- work_a = _mm_load_si128((__m128i *)aq[2]);
- q2 = _mm_load_si128((__m128i *)flat_oq[2]);
+ work_a = _mm_load_si128((__m128i *)&aq[2 * 16]);
+ q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
work_a = _mm_andnot_si128(flat, work_a);
q2 = _mm_and_si128(flat, q2);
q2 = _mm_or_si128(work_a, q2);
- _mm_store_si128((__m128i *)flat_oq[2], q2);
+ _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2);
// write out op6 - op3
{
unsigned char *dst = (s - 7 * p);
for (i = 6; i > 2; i--) {
__m128i flat2_output;
- work_a = _mm_load_si128((__m128i *)ap[i]);
- flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
+ work_a = _mm_load_si128((__m128i *)&ap[i * 16]);
+ flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]);
work_a = _mm_andnot_si128(flat2, work_a);
flat2_output = _mm_and_si128(flat2, flat2_output);
work_a = _mm_or_si128(work_a, flat2_output);
@@ -783,43 +786,43 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
}
}
- work_a = _mm_load_si128((__m128i *)flat_op[2]);
- p2 = _mm_load_si128((__m128i *)flat2_op[2]);
+ work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
+ p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]);
work_a = _mm_andnot_si128(flat2, work_a);
p2 = _mm_and_si128(flat2, p2);
p2 = _mm_or_si128(work_a, p2);
_mm_storeu_si128((__m128i *)(s - 3 * p), p2);
- work_a = _mm_load_si128((__m128i *)flat_op[1]);
- p1 = _mm_load_si128((__m128i *)flat2_op[1]);
+ work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
+ p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]);
work_a = _mm_andnot_si128(flat2, work_a);
p1 = _mm_and_si128(flat2, p1);
p1 = _mm_or_si128(work_a, p1);
_mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- work_a = _mm_load_si128((__m128i *)flat_op[0]);
- p0 = _mm_load_si128((__m128i *)flat2_op[0]);
+ work_a = _mm_load_si128((__m128i *)&flat_op[0]);
+ p0 = _mm_load_si128((__m128i *)&flat2_op[0]);
work_a = _mm_andnot_si128(flat2, work_a);
p0 = _mm_and_si128(flat2, p0);
p0 = _mm_or_si128(work_a, p0);
_mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- work_a = _mm_load_si128((__m128i *)flat_oq[0]);
- q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
+ work_a = _mm_load_si128((__m128i *)&flat_oq[0]);
+ q0 = _mm_load_si128((__m128i *)&flat2_oq[0]);
work_a = _mm_andnot_si128(flat2, work_a);
q0 = _mm_and_si128(flat2, q0);
q0 = _mm_or_si128(work_a, q0);
_mm_storeu_si128((__m128i *)(s - 0 * p), q0);
- work_a = _mm_load_si128((__m128i *)flat_oq[1]);
- q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
+ work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
+ q1 = _mm_load_si128((__m128i *)&flat2_oq[16]);
work_a = _mm_andnot_si128(flat2, work_a);
q1 = _mm_and_si128(flat2, q1);
q1 = _mm_or_si128(work_a, q1);
_mm_storeu_si128((__m128i *)(s + 1 * p), q1);
- work_a = _mm_load_si128((__m128i *)flat_oq[2]);
- q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
+ work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
+ q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]);
work_a = _mm_andnot_si128(flat2, work_a);
q2 = _mm_and_si128(flat2, q2);
q2 = _mm_or_si128(work_a, q2);
@@ -830,8 +833,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
unsigned char *dst = (s + 3 * p);
for (i = 3; i < 7; i++) {
__m128i flat2_output;
- work_a = _mm_load_si128((__m128i *)aq[i]);
- flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
+ work_a = _mm_load_si128((__m128i *)&aq[i * 16]);
+ flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]);
work_a = _mm_andnot_si128(flat2, work_a);
flat2_output = _mm_and_si128(flat2, flat2_output);
work_a = _mm_or_si128(work_a, flat2_output);
@@ -842,6 +845,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s,
}
}
+// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly.
void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
int p,
const unsigned char *_blimit,
@@ -860,34 +864,260 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
const unsigned char *_limit,
const unsigned char *_thresh,
int count) {
- DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- __m128i mask, hev, flat;
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
const __m128i zero = _mm_set1_epi16(0);
+ const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+ __m128i mask, hev, flat;
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
- const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
- const unsigned int extended_limit = _limit[0] * 0x01010101u;
- const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
- const __m128i thresh =
- _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
- const __m128i limit =
- _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
- const __m128i blimit =
- _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
+ __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
(void)count;
- p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
- p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
- q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
- q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+ q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+ q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+ _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+ p1q1 = _mm_shuffle_epi32(q1p1, 78);
+ p0q0 = _mm_shuffle_epi32(q0p0, 78);
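+ // Shuffle control 78 (0b01001110) swaps the two 64-bit halves, turning q1p1/q0p0 into p1q1/p0q0.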
+
+ {
+ // filter_mask and hev_mask
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+ _mm_subs_epu8(q0p0, q1p1));
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
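+ // The low 8 bytes of abs_p1p0 hold |p1 - p0| and the high 8 hold |q1 - q0|; the byte shift extracts the q side.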
+
+ abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+ _mm_subs_epu8(p0q0, q0p0));
+ abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+ _mm_subs_epu8(p1q1, q1p1));
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+ _mm_subs_epu8(q1p1, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+ _mm_subs_epu8(q2p2, q3p3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // flat_mask4
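+ // flat marks pixels where |p3..p1 - p0| and |q3..q1 - q0| are all <= 1 (and the filter mask is set), selecting the flat-filter path.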
+
+ flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+ _mm_subs_epu8(q0p0, q2p2)),
+ _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+ _mm_subs_epu8(q0p0, q3p3)));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+
+ {
+ const __m128i four = _mm_set1_epi16(4);
+ unsigned char *src = s;
+ {
+ __m128i workp_a, workp_b, workp_shft;
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op2[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op1[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_op0[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq0[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq1[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ _mm_storel_epi64((__m128i *)&flat_oq2[0],
+ _mm_packus_epi16(workp_shft, workp_shft));
+ }
+ }
+ // lp filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+ t80);
+ const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+ t80);
+ const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
+ t80);
+ const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
+ t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
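+ // The byte lands in the high half of each 16-bit lane after the unpack, so an arithmetic shift by 11 (8 + 3) gives a signed >> 3.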
+ filter1 = _mm_unpacklo_epi8(zero, filter1);
+ filter1 = _mm_srai_epi16(filter1, 11);
+ filter1 = _mm_packs_epi16(filter1, filter1);
+
+ // Filter2 >> 3
+ filter2 = _mm_unpacklo_epi8(zero, filter2);
+ filter2 = _mm_srai_epi16(filter2, 11);
+ filter2 = _mm_packs_epi16(filter2, zero);
+
+ // filt >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ filt = _mm_unpacklo_epi8(zero, filt);
+ filt = _mm_srai_epi16(filt, 9);
+ filt = _mm_packs_epi16(filt, zero);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+
+ work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p0 = _mm_loadl_epi64((__m128i *)flat_op0);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+
+ work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ p1 = _mm_loadl_epi64((__m128i *)flat_op1);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+
+ work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p2 = _mm_loadl_epi64((__m128i *)flat_op2);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+ }
+}
+
+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i blimit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ const __m128i thresh =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+ _mm_load_si128((const __m128i *)_thresh1));
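+ // Each *0/*1 parameter pair is packed into one register: the low 8 bytes apply to the first 8 columns, the high 8 to the next 8.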
+
+ __m128i mask, hev, flat;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
{
const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
_mm_subs_epu8(p0, p1));
@@ -901,6 +1131,8 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
__m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
_mm_subs_epu8(q1, p1));
__m128i work;
+
+ // filter_mask and hev_mask
flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu8(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
@@ -926,6 +1158,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
mask = _mm_subs_epu8(mask, limit);
mask = _mm_cmpeq_epi8(mask, zero);
+ // flat_mask4
work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
_mm_subs_epu8(p0, p2)),
_mm_or_si128(_mm_subs_epu8(q2, q0),
@@ -943,7 +1176,9 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
{
const __m128i four = _mm_set1_epi16(4);
unsigned char *src = s;
- {
+ int i = 0;
+
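+ // Two passes of 8 pixels each cover the 16-column edge; pass i writes into flat_o*[i * 8].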
+ do {
__m128i workp_a, workp_b, workp_shft;
p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
@@ -958,38 +1193,40 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op2[0],
+ _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op1[0],
+ _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op0[0],
+ _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq0[0],
+ _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq1[0],
+ _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq2[0],
+ _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
_mm_packus_epi16(workp_shft, workp_shft));
- }
+
+ src += 8;
+ } while (++i < 2);
}
// lp filter
{
@@ -1001,13 +1238,13 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
const __m128i t1 = _mm_set1_epi8(0x1);
const __m128i t7f = _mm_set1_epi8(0x7f);
- const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+ const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
t80);
- const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+ const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
t80);
- const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
+ const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
t80);
- const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
+ const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
t80);
__m128i filt;
__m128i work_a;
@@ -1018,27 +1255,27 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
filt = _mm_adds_epi8(filt, work_a);
- /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
filt = _mm_and_si128(filt, mask);
filter1 = _mm_adds_epi8(filt, t4);
filter2 = _mm_adds_epi8(filt, t3);
- /* Filter1 >> 3 */
+ // Filter1 >> 3
work_a = _mm_cmpgt_epi8(zero, filter1);
filter1 = _mm_srli_epi16(filter1, 3);
work_a = _mm_and_si128(work_a, te0);
filter1 = _mm_and_si128(filter1, t1f);
filter1 = _mm_or_si128(filter1, work_a);
- /* Filter2 >> 3 */
+ // Filter2 >> 3
work_a = _mm_cmpgt_epi8(zero, filter2);
filter2 = _mm_srli_epi16(filter2, 3);
work_a = _mm_and_si128(work_a, te0);
filter2 = _mm_and_si128(filter2, t1f);
filter2 = _mm_or_si128(filter2, work_a);
- /* filt >> 1 */
+ // filt >> 1
filt = _mm_adds_epi8(filter1, t1);
work_a = _mm_cmpgt_epi8(zero, filt);
filt = _mm_srli_epi16(filt, 1);
@@ -1049,47 +1286,186 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
filt = _mm_andnot_si128(hev, filt);
work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
- q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
+ q0 = _mm_load_si128((__m128i *)flat_oq0);
work_a = _mm_andnot_si128(flat, work_a);
q0 = _mm_and_si128(flat, q0);
q0 = _mm_or_si128(work_a, q0);
work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
- q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
+ q1 = _mm_load_si128((__m128i *)flat_oq1);
work_a = _mm_andnot_si128(flat, work_a);
q1 = _mm_and_si128(flat, q1);
q1 = _mm_or_si128(work_a, q1);
work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
+ q2 = _mm_load_si128((__m128i *)flat_oq2);
work_a = _mm_andnot_si128(flat, work_a);
q2 = _mm_and_si128(flat, q2);
q2 = _mm_or_si128(work_a, q2);
work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
- p0 = _mm_loadl_epi64((__m128i *)flat_op0);
+ p0 = _mm_load_si128((__m128i *)flat_op0);
work_a = _mm_andnot_si128(flat, work_a);
p0 = _mm_and_si128(flat, p0);
p0 = _mm_or_si128(work_a, p0);
work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- p1 = _mm_loadl_epi64((__m128i *)flat_op1);
+ p1 = _mm_load_si128((__m128i *)flat_op1);
work_a = _mm_andnot_si128(flat, work_a);
p1 = _mm_and_si128(flat, p1);
p1 = _mm_or_si128(work_a, p1);
work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p2 = _mm_loadl_epi64((__m128i *)flat_op2);
+ p2 = _mm_load_si128((__m128i *)flat_op2);
work_a = _mm_andnot_si128(flat, work_a);
p2 = _mm_and_si128(flat, p2);
p2 = _mm_or_si128(work_a, p2);
- _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
- _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
- _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
- _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ }
+}
+
+void vp9_loop_filter_horizontal_edge_16_sse2(unsigned char *s,
+ int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ const __m128i blimit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ const __m128i thresh =
+ _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+ _mm_load_si128((const __m128i *)_thresh1));
+ const __m128i zero = _mm_set1_epi16(0);
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i mask, hev, flat;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+
+ // filter_mask and hev_mask
+ {
+ const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
+ _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+ _mm_subs_epu8(q0, q1));
+ const __m128i fe = _mm_set1_epi8(0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+ _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+ _mm_subs_epu8(q1, p1));
+ __m128i work;
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+ _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2),
+ _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+ _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2),
+ _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8(0x80);
+ const __m128i te0 = _mm_set1_epi8(0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+
+ const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+ t80);
+ const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+ t80);
+ const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+ t80);
+ const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+ t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ // (vp9_filter + 3 * (qs0 - ps0)) & mask
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ // Filter1 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+
+ // Filter2 >> 3
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+
+ // filt >> 1
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+
+ filt = _mm_andnot_si128(hev, filt);
+
+ q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+ q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+ p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
}
}
@@ -1098,7 +1474,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
__m128i x8, x9, x10, x11, x12, x13, x14, x15;
- /* Read in 16 lines */
+ // Read in 16 lines
x0 = _mm_loadl_epi64((__m128i *)in0);
x8 = _mm_loadl_epi64((__m128i *)in1);
x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
@@ -1136,7 +1512,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
x14 = _mm_unpacklo_epi32(x12, x13);
x15 = _mm_unpackhi_epi32(x12, x13);
- /* Store first 4-line result */
+ // Store first 4-line result
_mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
_mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
_mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
@@ -1152,7 +1528,7 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
x14 = _mm_unpacklo_epi32(x12, x13);
x15 = _mm_unpackhi_epi32(x12, x13);
- /* Store second 4-line result */
+ // Store second 4-line result
_mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
_mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
_mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
@@ -1222,61 +1598,129 @@ static INLINE void transpose(unsigned char *src[], int in_p,
} while (++idx8x8 < num_8x8_to_transpose);
}
-void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
- int p,
+void vp9_loop_filter_vertical_edge_16_sse2(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
+ unsigned char *src[2];
+ unsigned char *dst[2];
+
+ // Transpose 8x16
+ transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
+
+ // Loop filtering
+ vp9_loop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
+
+ // Transpose back
+ transpose(src, 16, dst, p, 2);
+}
+
+void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh,
int count) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+ DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 8);
+ unsigned char *src[1];
+ unsigned char *dst[1];
+ (void)count;
+
+ // Transpose 8x8
+ src[0] = s - 4;
+ dst[0] = t_dst;
+
+ transpose(src, p, dst, 8, 1);
+
+ // Loop filtering
+ vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 4 * 8, 8, blimit, limit,
+ thresh, 1);
+
+ src[0] = t_dst;
+ dst[0] = s - 4;
+
+ // Transpose back
+ transpose(src, 8, dst, p, 1);
+}
+
+void vp9_mbloop_filter_vertical_edge_16_sse2(uint8_t *s, int p,
+ const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0,
+ const uint8_t *blimit1,
+ const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 16 * 8);
unsigned char *src[2];
unsigned char *dst[2];
- (void)count;
- /* Transpose 16x16 */
- transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
- transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);
+ // Transpose 8x16
+ transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
- /* Loop filtering */
- vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
- thresh, 1);
- src[0] = t_dst + 3 * 16;
- src[1] = t_dst + 3 * 16 + 8;
+ // Loop filtering
+ vp9_mbloop_filter_horizontal_edge_16_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1);
+ src[0] = t_dst;
+ src[1] = t_dst + 8;
- dst[0] = s - 5;
- dst[1] = s - 5 + p * 8;
+ dst[0] = s - 4;
+ dst[1] = s - 4 + p * 8;
- /* Transpose 16x8 */
+ // Transpose back
transpose(src, 16, dst, p, 2);
}
-void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
- int p,
+void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s, int p,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
- unsigned char *src[4];
- unsigned char *dst[4];
-
- dst[0] = t_dst;
- dst[1] = t_dst + 8 * 16;
+ DECLARE_ALIGNED_ARRAY(8, unsigned char, t_dst, 8 * 16);
+ unsigned char *src[2];
+ unsigned char *dst[2];
src[0] = s - 8;
- src[1] = s - 8 + 8;
+ src[1] = s;
+ dst[0] = t_dst;
+ dst[1] = t_dst + 8 * 8;
- /* Transpose 16x16 */
- transpose(src, p, dst, 16, 2);
+ // Transpose 16x8
+ transpose(src, p, dst, 8, 2);
- /* Loop filtering */
- vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
- thresh, 1);
+ // Loop filtering
+ mb_lpf_horizontal_edge_w_sse2_8(t_dst + 8 * 8, 8, blimit, limit, thresh);
src[0] = t_dst;
- src[1] = t_dst + 8 * 16;
-
+ src[1] = t_dst + 8 * 8;
dst[0] = s - 8;
- dst[1] = s - 8 + 8;
+ dst[1] = s;
- transpose(src, 16, dst, p, 2);
+ // Transpose back
+ transpose(src, 8, dst, p, 2);
+}
+
+void vp9_mb_lpf_vertical_edge_w_16_sse2(unsigned char *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
+
+ // Transpose 16x16
+ transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
+ transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
+
+ // Loop filtering
+ mb_lpf_horizontal_edge_w_sse2_16(t_dst + 8 * 16, 16, blimit, limit,
+ thresh);
+
+ // Transpose back
+ transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
+ transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}
diff --git a/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index 7a5cca0..634fa77 100644
--- a/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/source/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -11,17 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-
-
%macro VERTx4 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
@@ -81,11 +70,14 @@
pmaddubsw xmm4, k4k5
pmaddubsw xmm6, k6k7
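+ ; take min/max of the two middle partial sums and add the smaller one first, so the saturating adds are less prone to clipping intermediate results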
+ movdqa xmm1, xmm2
paddsw xmm0, xmm6
- paddsw xmm0, xmm2
+ pmaxsw xmm2, xmm4
+ pminsw xmm4, xmm1
paddsw xmm0, xmm4
- paddsw xmm0, krd
+ paddsw xmm0, xmm2
+ paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
@@ -166,10 +158,13 @@
pmaddubsw xmm6, k6k7
paddsw xmm0, xmm6
- paddsw xmm0, xmm2
+ movdqa xmm1, xmm2
+ pmaxsw xmm2, xmm4
+ pminsw xmm4, xmm1
paddsw xmm0, xmm4
- paddsw xmm0, krd
+ paddsw xmm0, xmm2
+ paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
@@ -251,10 +246,13 @@
pmaddubsw xmm6, k6k7
paddsw xmm0, xmm6
- paddsw xmm0, xmm2
+ movdqa xmm1, xmm2
+ pmaxsw xmm2, xmm4
+ pminsw xmm4, xmm1
paddsw xmm0, xmm4
- paddsw xmm0, krd
+ paddsw xmm0, xmm2
+ paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
%if %1
@@ -538,14 +536,22 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
movdqa %2, %1
pshufb %1, [GLOBAL(shuf_t0t1)]
pshufb %2, [GLOBAL(shuf_t2t3)]
- pmaddubsw %1, xmm6
- pmaddubsw %2, xmm7
+ pmaddubsw %1, k0k1k4k5
+ pmaddubsw %2, k2k3k6k7
- paddsw %1, %2
- movdqa %2, %1
+ movdqa xmm4, %1
+ movdqa xmm5, %2
+ psrldq %1, 8
psrldq %2, 8
- paddsw %1, %2
- paddsw %1, xmm5
+ movdqa xmm6, xmm5
+
+ paddsw xmm4, %2
+ pmaxsw xmm5, %1
+ pminsw %1, xmm6
+ paddsw %1, xmm4
+ paddsw %1, xmm5
+
+ paddsw %1, krd
psraw %1, 7
packuswb %1, %1
%endm
@@ -565,6 +571,10 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
pshufd xmm5, xmm5, 0 ;rounding
+ movdqa k0k1k4k5, xmm6
+ movdqa k2k3k6k7, xmm7
+ movdqa krd, xmm5
+
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
@@ -631,9 +641,13 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
pmaddubsw %3, k4k5
pmaddubsw %4, k6k7
- paddsw %1, %2
paddsw %1, %4
+ movdqa %4, %2
+ pmaxsw %2, %3
+ pminsw %3, %4
paddsw %1, %3
+ paddsw %1, %2
+
paddsw %1, krd
psraw %1, 7
packuswb %1, %1
@@ -779,12 +793,19 @@ sym(vp9_filter_block1d16_v8_avg_ssse3):
pmaddubsw xmm6, k4k5
pmaddubsw xmm7, k6k7
- paddsw xmm0, xmm1
paddsw xmm0, xmm3
+ movdqa xmm3, xmm1
+ pmaxsw xmm1, xmm2
+ pminsw xmm2, xmm3
paddsw xmm0, xmm2
- paddsw xmm4, xmm5
+ paddsw xmm0, xmm1
+
paddsw xmm4, xmm7
+ movdqa xmm7, xmm5
+ pmaxsw xmm5, xmm6
+ pminsw xmm6, xmm7
paddsw xmm4, xmm6
+ paddsw xmm4, xmm5
paddsw xmm0, krd
paddsw xmm4, krd
@@ -826,8 +847,16 @@ sym(vp9_filter_block1d4_h8_ssse3):
push rdi
; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 3
+ %define k0k1k4k5 [rsp + 16 * 0]
+ %define k2k3k6k7 [rsp + 16 * 1]
+ %define krd [rsp + 16 * 2]
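+ ; three aligned 16-byte stack slots; HORIZx4 spills the packed filter taps and the rounding constant into them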
+
HORIZx4 0
+ add rsp, 16 * 3
+ pop rsp
; begin epilog
pop rdi
pop rsi
@@ -932,8 +961,16 @@ sym(vp9_filter_block1d4_h8_avg_ssse3):
push rdi
; end prolog
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 3
+ %define k0k1k4k5 [rsp + 16 * 0]
+ %define k2k3k6k7 [rsp + 16 * 1]
+ %define krd [rsp + 16 * 2]
+
HORIZx4 1
+ add rsp, 16 * 3
+ pop rsp
; begin epilog
pop rdi
pop rsi
diff --git a/source/libvpx/vp9/decoder/vp9_dboolhuff.c b/source/libvpx/vp9/decoder/vp9_dboolhuff.c
index 06acec4..4f16e95 100644
--- a/source/libvpx/vp9/decoder/vp9_dboolhuff.c
+++ b/source/libvpx/vp9/decoder/vp9_dboolhuff.c
@@ -18,32 +18,28 @@
// Even relatively modest values like 100 would work fine.
#define LOTS_OF_BITS 0x40000000
-
int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size) {
- int marker_bit;
-
- r->buffer_end = buffer + size;
- r->buffer = buffer;
- r->value = 0;
- r->count = -8;
- r->range = 255;
-
- if (size && !buffer)
+ if (size && !buffer) {
return 1;
-
- vp9_reader_fill(r);
- marker_bit = vp9_read_bit(r);
- return marker_bit != 0;
+ } else {
+ r->buffer_end = buffer + size;
+ r->buffer = buffer;
+ r->value = 0;
+ r->count = -8;
+ r->range = 255;
+ vp9_reader_fill(r);
+ return vp9_read_bit(r) != 0; // marker bit
+ }
}
void vp9_reader_fill(vp9_reader *r) {
const uint8_t *const buffer_end = r->buffer_end;
const uint8_t *buffer = r->buffer;
- VP9_BD_VALUE value = r->value;
+ BD_VALUE value = r->value;
int count = r->count;
- int shift = BD_VALUE_SIZE - 8 - (count + 8);
+ int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);
int loop_end = 0;
- const int bits_left = (int)((buffer_end - buffer)*CHAR_BIT);
+ const int bits_left = (int)((buffer_end - buffer) * CHAR_BIT);
const int x = shift + CHAR_BIT - bits_left;
if (x >= 0) {
@@ -54,7 +50,7 @@ void vp9_reader_fill(vp9_reader *r) {
if (x < 0 || bits_left) {
while (shift >= loop_end) {
count += CHAR_BIT;
- value |= (VP9_BD_VALUE)*buffer++ << shift;
+ value |= (BD_VALUE)*buffer++ << shift;
shift -= CHAR_BIT;
}
}
diff --git a/source/libvpx/vp9/decoder/vp9_dboolhuff.h b/source/libvpx/vp9/decoder/vp9_dboolhuff.h
index c864516..8339c27 100644
--- a/source/libvpx/vp9/decoder/vp9_dboolhuff.h
+++ b/source/libvpx/vp9/decoder/vp9_dboolhuff.h
@@ -18,46 +18,50 @@
#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
-typedef size_t VP9_BD_VALUE;
+#include "vp9/common/vp9_treecoder.h"
-#define BD_VALUE_SIZE ((int)sizeof(VP9_BD_VALUE)*CHAR_BIT)
+typedef size_t BD_VALUE;
+
+#define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
+
+DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
typedef struct {
const uint8_t *buffer_end;
const uint8_t *buffer;
- VP9_BD_VALUE value;
+ BD_VALUE value;
int count;
unsigned int range;
} vp9_reader;
-DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
-
int vp9_reader_init(vp9_reader *r, const uint8_t *buffer, size_t size);
void vp9_reader_fill(vp9_reader *r);
+int vp9_reader_has_error(vp9_reader *r);
+
const uint8_t *vp9_reader_find_end(vp9_reader *r);
-static int vp9_read(vp9_reader *br, int probability) {
+static int vp9_read(vp9_reader *r, int prob) {
unsigned int bit = 0;
- VP9_BD_VALUE value;
- VP9_BD_VALUE bigsplit;
+ BD_VALUE value;
+ BD_VALUE bigsplit;
int count;
unsigned int range;
- unsigned int split = 1 + (((br->range - 1) * probability) >> 8);
+ unsigned int split = (r->range * prob + (256 - prob)) >> CHAR_BIT;
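+ // algebraically the same as 1 + (((range - 1) * prob) >> 8); adding 256 - prob folds the +1 into the shift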
- if (br->count < 0)
- vp9_reader_fill(br);
+ if (r->count < 0)
+ vp9_reader_fill(r);
- value = br->value;
- count = br->count;
+ value = r->value;
+ count = r->count;
- bigsplit = (VP9_BD_VALUE)split << (BD_VALUE_SIZE - 8);
+ bigsplit = (BD_VALUE)split << (BD_VALUE_SIZE - CHAR_BIT);
range = split;
if (value >= bigsplit) {
- range = br->range - split;
+ range = r->range - split;
value = value - bigsplit;
bit = 1;
}
@@ -68,9 +72,9 @@ static int vp9_read(vp9_reader *br, int probability) {
value <<= shift;
count -= shift;
}
- br->value = value;
- br->count = count;
- br->range = range;
+ r->value = value;
+ r->count = count;
+ r->range = range;
return bit;
}
@@ -79,15 +83,23 @@ static int vp9_read_bit(vp9_reader *r) {
return vp9_read(r, 128); // vp9_prob_half
}
-static int vp9_read_literal(vp9_reader *br, int bits) {
- int z = 0, bit;
+static int vp9_read_literal(vp9_reader *r, int bits) {
+ int literal = 0, bit;
for (bit = bits - 1; bit >= 0; bit--)
- z |= vp9_read_bit(br) << bit;
+ literal |= vp9_read_bit(r) << bit;
- return z;
+ return literal;
}
-int vp9_reader_has_error(vp9_reader *r);
+static int vp9_read_tree(vp9_reader *r, const vp9_tree_index *tree,
+ const vp9_prob *probs) {
+ vp9_tree_index i = 0;
+
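+ // Descend the tree one coded bit at a time; internal nodes are positive indices, leaf entries hold the negated token value.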
+ while ((i = tree[i + vp9_read(r, probs[i >> 1])]) > 0)
+ continue;
+
+ return -i;
+}
#endif // VP9_DECODER_VP9_DBOOLHUFF_H_
diff --git a/source/libvpx/vp9/decoder/vp9_decodframe.c b/source/libvpx/vp9/decoder/vp9_decodeframe.c
index e3a2b77..79f0835 100644
--- a/source/libvpx/vp9/decoder/vp9_decodframe.c
+++ b/source/libvpx/vp9/decoder/vp9_decodeframe.c
@@ -9,8 +9,11 @@
*/
#include <assert.h>
+#include <stdlib.h> // qsort()
#include "./vp9_rtcd.h"
+#include "./vpx_scale_rtcd.h"
+
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/vpx_scale.h"
@@ -18,7 +21,6 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_extend.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
@@ -28,35 +30,35 @@
#include "vp9/common/vp9_tile_common.h"
#include "vp9/decoder/vp9_dboolhuff.h"
-#include "vp9/decoder/vp9_decodframe.h"
+#include "vp9/decoder/vp9_decodeframe.h"
#include "vp9/decoder/vp9_detokenize.h"
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_dsubexp.h"
#include "vp9/decoder/vp9_onyxd_int.h"
#include "vp9/decoder/vp9_read_bit_buffer.h"
#include "vp9/decoder/vp9_thread.h"
-#include "vp9/decoder/vp9_treereader.h"
typedef struct TileWorkerData {
VP9_COMMON *cm;
vp9_reader bit_reader;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
+ DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
} TileWorkerData;
static int read_be32(const uint8_t *p) {
return (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}
-static int is_compound_prediction_allowed(const VP9_COMMON *cm) {
+static int is_compound_reference_allowed(const VP9_COMMON *cm) {
int i;
- for (i = 1; i < ALLOWED_REFS_PER_FRAME; ++i)
+ for (i = 1; i < REFS_PER_FRAME; ++i)
if (cm->ref_frame_sign_bias[i + 1] != cm->ref_frame_sign_bias[1])
return 1;
return 0;
}
-static void setup_compound_prediction(VP9_COMMON *cm) {
+static void setup_compound_reference(VP9_COMMON *cm) {
if (cm->ref_frame_sign_bias[LAST_FRAME] ==
cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
cm->comp_fixed_ref = ALTREF_FRAME;
@@ -91,7 +93,7 @@ static TX_MODE read_tx_mode(vp9_reader *r) {
return tx_mode;
}
-static void read_tx_probs(struct tx_probs *tx_probs, vp9_reader *r) {
+static void read_tx_mode_probs(struct tx_probs *tx_probs, vp9_reader *r) {
int i, j;
for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
@@ -121,78 +123,67 @@ static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
vp9_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
}
-static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
- COMPPREDMODE_TYPE mode = vp9_read_bit(r);
- if (mode)
- mode += vp9_read_bit(r);
- return mode;
+static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, vp9_reader *r) {
+ if (is_compound_reference_allowed(cm)) {
+ REFERENCE_MODE mode = vp9_read_bit(r);
+ if (mode)
+ mode += vp9_read_bit(r);
+ setup_compound_reference(cm);
+ return mode;
+ } else {
+ return SINGLE_REFERENCE;
+ }
}
-static void read_comp_pred(VP9_COMMON *cm, vp9_reader *r) {
+static void read_reference_mode_probs(VP9_COMMON *cm, vp9_reader *r) {
int i;
-
- const int compound_allowed = is_compound_prediction_allowed(cm);
- cm->comp_pred_mode = compound_allowed ? read_comp_pred_mode(r)
- : SINGLE_PREDICTION_ONLY;
- if (compound_allowed)
- setup_compound_prediction(cm);
-
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
for (i = 0; i < COMP_INTER_CONTEXTS; i++)
vp9_diff_update_prob(r, &cm->fc.comp_inter_prob[i]);
- if (cm->comp_pred_mode != COMP_PREDICTION_ONLY)
+ if (cm->reference_mode != COMPOUND_REFERENCE)
for (i = 0; i < REF_CONTEXTS; i++) {
vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][0]);
vp9_diff_update_prob(r, &cm->fc.single_ref_prob[i][1]);
}
- if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+ if (cm->reference_mode != SINGLE_REFERENCE)
for (i = 0; i < REF_CONTEXTS; i++)
vp9_diff_update_prob(r, &cm->fc.comp_ref_prob[i]);
}
-static void update_mv(vp9_reader *r, vp9_prob *p) {
- if (vp9_read(r, NMV_UPDATE_PROB))
- *p = (vp9_read_literal(r, 7) << 1) | 1;
+static void update_mv_probs(vp9_prob *p, int n, vp9_reader *r) {
+ int i;
+ for (i = 0; i < n; ++i)
+ if (vp9_read(r, NMV_UPDATE_PROB))
+ p[i] = (vp9_read_literal(r, 7) << 1) | 1;
}
-static void read_mv_probs(vp9_reader *r, nmv_context *mvc, int allow_hp) {
- int i, j, k;
+static void read_mv_probs(nmv_context *ctx, int allow_hp, vp9_reader *r) {
+ int i, j;
- for (j = 0; j < MV_JOINTS - 1; ++j)
- update_mv(r, &mvc->joints[j]);
+ update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
for (i = 0; i < 2; ++i) {
- nmv_component *const comp = &mvc->comps[i];
-
- update_mv(r, &comp->sign);
-
- for (j = 0; j < MV_CLASSES - 1; ++j)
- update_mv(r, &comp->classes[j]);
-
- for (j = 0; j < CLASS0_SIZE - 1; ++j)
- update_mv(r, &comp->class0[j]);
-
- for (j = 0; j < MV_OFFSET_BITS; ++j)
- update_mv(r, &comp->bits[j]);
+ nmv_component *const comp_ctx = &ctx->comps[i];
+ update_mv_probs(&comp_ctx->sign, 1, r);
+ update_mv_probs(comp_ctx->classes, MV_CLASSES - 1, r);
+ update_mv_probs(comp_ctx->class0, CLASS0_SIZE - 1, r);
+ update_mv_probs(comp_ctx->bits, MV_OFFSET_BITS, r);
}
for (i = 0; i < 2; ++i) {
- nmv_component *const comp = &mvc->comps[i];
-
+ nmv_component *const comp_ctx = &ctx->comps[i];
for (j = 0; j < CLASS0_SIZE; ++j)
- for (k = 0; k < 3; ++k)
- update_mv(r, &comp->class0_fp[j][k]);
-
- for (j = 0; j < 3; ++j)
- update_mv(r, &comp->fp[j]);
+ update_mv_probs(comp_ctx->class0_fp[j], MV_FP_SIZE - 1, r);
+ update_mv_probs(comp_ctx->fp, 3, r);
}
if (allow_hp) {
for (i = 0; i < 2; ++i) {
- update_mv(r, &mvc->comps[i].class0_hp);
- update_mv(r, &mvc->comps[i].hp);
+ nmv_component *const comp_ctx = &ctx->comps[i];
+ update_mv_probs(&comp_ctx->class0_hp, 1, r);
+ update_mv_probs(&comp_ctx->hp, 1, r);
}
}
}
@@ -208,20 +199,22 @@ static void setup_plane_dequants(VP9_COMMON *cm, MACROBLOCKD *xd, int q_index) {
// Allocate storage for each tile column.
// TODO(jzern): when max_threads <= 1 the same storage could be used for each
// tile.
-static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) {
+static void alloc_tile_storage(VP9D_COMP *pbi, int tile_rows, int tile_cols) {
VP9_COMMON *const cm = &pbi->common;
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
- int i, tile_col;
+ int i, tile_row, tile_col;
CHECK_MEM_ERROR(cm, pbi->mi_streams,
- vpx_realloc(pbi->mi_streams, tile_cols *
+ vpx_realloc(pbi->mi_streams, tile_rows * tile_cols *
sizeof(*pbi->mi_streams)));
- for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
- TileInfo tile;
-
- vp9_tile_init(&tile, cm, 0, tile_col);
- pbi->mi_streams[tile_col] =
- &cm->mi[cm->mi_rows * tile.mi_col_start];
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ TileInfo tile;
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
+ pbi->mi_streams[tile_row * tile_cols + tile_col] =
+ &cm->mi[tile.mi_row_start * cm->mode_info_stride
+ + tile.mi_col_start];
+ }
}
// 2 contexts per 'mi unit', so that we have one context per 4x4 txfm
@@ -245,48 +238,46 @@ static void alloc_tile_storage(VP9D_COMP *pbi, int tile_cols) {
}
static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
- BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+ TX_SIZE tx_size, uint8_t *dst, int stride,
+ int eob) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- int16_t* const qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
- const int stride = pd->dst.stride;
- const int eob = pd->eobs[block];
if (eob > 0) {
TX_TYPE tx_type;
- const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
- block);
- uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
- pd->dst.buf, stride);
+ const int plane_type = pd->plane_type;
+ int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
switch (tx_size) {
case TX_4X4:
- tx_type = get_tx_type_4x4(pd->plane_type, xd, raster_block);
+ tx_type = get_tx_type_4x4(plane_type, xd, block);
if (tx_type == DCT_DCT)
- xd->itxm_add(qcoeff, dst, stride, eob);
+ xd->itxm_add(dqcoeff, dst, stride, eob);
else
- vp9_iht4x4_add(tx_type, qcoeff, dst, stride, eob);
+ vp9_iht4x4_16_add(dqcoeff, dst, stride, tx_type);
break;
case TX_8X8:
- tx_type = get_tx_type_8x8(pd->plane_type, xd);
- vp9_iht8x8_add(tx_type, qcoeff, dst, stride, eob);
+ tx_type = get_tx_type_8x8(plane_type, xd);
+ vp9_iht8x8_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_16X16:
- tx_type = get_tx_type_16x16(pd->plane_type, xd);
- vp9_iht16x16_add(tx_type, qcoeff, dst, stride, eob);
+ tx_type = get_tx_type_16x16(plane_type, xd);
+ vp9_iht16x16_add(tx_type, dqcoeff, dst, stride, eob);
break;
case TX_32X32:
tx_type = DCT_DCT;
- vp9_idct32x32_add(qcoeff, dst, stride, eob);
+ vp9_idct32x32_add(dqcoeff, dst, stride, eob);
break;
default:
- assert(!"Invalid transform size");
+ assert(0 && "Invalid transform size");
}
if (eob == 1) {
- vpx_memset(qcoeff, 0, 2 * sizeof(qcoeff[0]));
+ vpx_memset(dqcoeff, 0, 2 * sizeof(dqcoeff[0]));
} else {
if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
- vpx_memset(qcoeff, 0, 4 * (4 << tx_size) * sizeof(qcoeff[0]));
+ vpx_memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+ else if (tx_size == TX_32X32 && eob <= 34)
+ vpx_memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
else
- vpx_memset(qcoeff, 0, (16 << (tx_size << 1)) * sizeof(qcoeff[0]));
+ vpx_memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
}
}
}
@@ -303,29 +294,30 @@ static void predict_and_reconstruct_intra_block(int plane, int block,
struct intra_args *const args = arg;
VP9_COMMON *const cm = args->cm;
MACROBLOCKD *const xd = args->xd;
-
struct macroblockd_plane *const pd = &xd->plane[plane];
MODE_INFO *const mi = xd->mi_8x8[0];
- const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
- block);
- uint8_t* const dst = raster_block_offset_uint8(plane_bsize, raster_block,
- pd->dst.buf, pd->dst.stride);
const MB_PREDICTION_MODE mode = (plane == 0)
- ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[raster_block].as_mode
- : mi->mbmi.mode)
- : mi->mbmi.uv_mode;
+ ? ((mi->mbmi.sb_type < BLOCK_8X8) ? mi->bmi[block].as_mode
+ : mi->mbmi.mode)
+ : mi->mbmi.uv_mode;
+ int x, y;
+ uint8_t *dst;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+ dst = &pd->dst.buf[4 * y * pd->dst.stride + 4 * x];
if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0)
- extend_for_intra(xd, plane_bsize, plane, block, tx_size);
+ extend_for_intra(xd, plane_bsize, plane, x, y);
- vp9_predict_intra_block(xd, raster_block >> tx_size,
+ vp9_predict_intra_block(xd, block >> (tx_size << 1),
b_width_log2(plane_bsize), tx_size, mode,
dst, pd->dst.stride, dst, pd->dst.stride);
if (!mi->mbmi.skip_coeff) {
- vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, tx_size,
- args->r);
- inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+ const int eob = vp9_decode_block_tokens(cm, xd, plane, block,
+ plane_bsize, x, y, tx_size,
+ args->r);
+ inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
+ eob);
}
}
@@ -342,33 +334,41 @@ static void reconstruct_inter_block(int plane, int block,
struct inter_args *args = arg;
VP9_COMMON *const cm = args->cm;
MACROBLOCKD *const xd = args->xd;
-
- *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
- plane_bsize, tx_size, args->r);
- inverse_transform_block(xd, plane, block, plane_bsize, tx_size);
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ int x, y, eob;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
+ eob = vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y,
+ tx_size, args->r);
+ inverse_transform_block(xd, plane, block, tx_size,
+ &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
+ pd->dst.stride, eob);
+ *args->eobtotal += eob;
}
static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const TileInfo *const tile,
BLOCK_SIZE bsize, int mi_row, int mi_col) {
- const int bh = num_8x8_blocks_high_lookup[bsize];
const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
+ const int x_mis = MIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = MIN(bh, cm->mi_rows - mi_row);
const int offset = mi_row * cm->mode_info_stride + mi_col;
-
- xd->mode_info_stride = cm->mode_info_stride;
+ const int tile_offset = tile->mi_row_start * cm->mode_info_stride +
+ tile->mi_col_start;
+ int x, y;
xd->mi_8x8 = cm->mi_grid_visible + offset;
xd->prev_mi_8x8 = cm->prev_mi_grid_visible + offset;
-
- // we are using the mode info context stream here
- xd->mi_8x8[0] = xd->mi_stream;
- xd->mi_8x8[0]->mbmi.sb_type = bsize;
- ++xd->mi_stream;
-
// Special case: if prev_mi is NULL, the previous mode info context
// cannot be used.
xd->last_mi = cm->prev_mi ? xd->prev_mi_8x8[0] : NULL;
+ xd->mi_8x8[0] = xd->mi_stream + offset - tile_offset;
+ xd->mi_8x8[0]->mbmi.sb_type = bsize;
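+ // Point every mi-grid cell covered by this block at the same mode info; x starts at !y so the (0,0) cell set above is skipped.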
+ for (y = 0; y < y_mis; ++y)
+ for (x = !y; x < x_mis; ++x)
+ xd->mi_8x8[y * cm->mode_info_stride + x] = xd->mi_8x8[0];
+
set_skip_context(xd, xd->above_context, xd->left_context, mi_row, mi_col);
// Distance of Mb to the various image edges. These are specified to 8th pel
@@ -384,6 +384,8 @@ static void set_ref(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const int ref = mbmi->ref_frame[idx] - LAST_FRAME;
const YV12_BUFFER_CONFIG *cfg = get_frame_ref_buffer(cm, ref);
const struct scale_factors_common *sfc = &cm->active_ref_scale_comm[ref];
+
+ xd->ref_buf[idx] = cfg;
if (!vp9_is_valid_scale(sfc))
vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
"Invalid scale factors");
@@ -431,7 +433,7 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
vp9_get_filter_kernel(mbmi->interp_filter);
// Prediction
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+ vp9_dec_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
// Reconstruction
if (!mbmi->skip_coeff) {
@@ -446,21 +448,30 @@ static void decode_modes_b(VP9_COMMON *const cm, MACROBLOCKD *const xd,
xd->corrupted |= vp9_reader_has_error(r);
}
-static PARTITION_TYPE read_partition(int hbs, int mi_rows, int mi_cols,
- int mi_row, int mi_col,
- vp9_prob probs[PARTITION_TYPES - 1],
+static PARTITION_TYPE read_partition(VP9_COMMON *cm, MACROBLOCKD *xd, int hbs,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
vp9_reader *r) {
- const int has_rows = (mi_row + hbs) < mi_rows;
- const int has_cols = (mi_col + hbs) < mi_cols;
+ const int ctx = partition_plane_context(xd->above_seg_context,
+ xd->left_seg_context,
+ mi_row, mi_col, bsize);
+ const vp9_prob *const probs = get_partition_probs(cm, ctx);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+ PARTITION_TYPE p;
if (has_rows && has_cols)
- return treed_read(r, vp9_partition_tree, probs);
+ p = vp9_read_tree(r, vp9_partition_tree, probs);
else if (!has_rows && has_cols)
- return vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
+ p = vp9_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
else if (has_rows && !has_cols)
- return vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
+ p = vp9_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
else
- return PARTITION_SPLIT;
+ p = PARTITION_SPLIT;
+
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.partition[ctx][p];
+
+ return p;
}
static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -470,19 +481,11 @@ static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
PARTITION_TYPE partition;
BLOCK_SIZE subsize;
- int ctx;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- ctx = partition_plane_context(xd->above_seg_context, xd->left_seg_context,
- mi_row, mi_col, bsize);
- partition = read_partition(hbs, cm->mi_rows, cm->mi_cols, mi_row, mi_col,
- cm->fc.partition_prob[cm->frame_type][ctx], r);
-
- if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.partition[ctx][partition];
-
+ partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
subsize = get_subsize(bsize, partition);
if (subsize < BLOCK_8X8) {
decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize);
@@ -508,7 +511,7 @@ static void decode_modes_sb(VP9_COMMON *const cm, MACROBLOCKD *const xd,
decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize);
break;
default:
- assert(!"Invalid partition type");
+ assert(0 && "Invalid partition type");
}
}
@@ -541,27 +544,20 @@ static void read_coef_probs_common(vp9_coeff_probs_model *coef_probs,
int i, j, k, l, m;
if (vp9_read_bit(r))
- for (i = 0; i < BLOCK_TYPES; i++)
- for (j = 0; j < REF_TYPES; j++)
- for (k = 0; k < COEF_BANDS; k++)
- for (l = 0; l < PREV_COEF_CONTEXTS; l++)
- if (k > 0 || l < 3)
- for (m = 0; m < UNCONSTRAINED_NODES; m++)
- vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ vp9_diff_update_prob(r, &coef_probs[i][j][k][l][m]);
}
static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
vp9_reader *r) {
- read_coef_probs_common(fc->coef_probs[TX_4X4], r);
-
- if (tx_mode > ONLY_4X4)
- read_coef_probs_common(fc->coef_probs[TX_8X8], r);
-
- if (tx_mode > ALLOW_8X8)
- read_coef_probs_common(fc->coef_probs[TX_16X16], r);
-
- if (tx_mode > ALLOW_16X16)
- read_coef_probs_common(fc->coef_probs[TX_32X32], r);
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ read_coef_probs_common(fc->coef_probs[tx_size], r);
}
static void setup_segmentation(struct segmentation *seg,
@@ -696,20 +692,13 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) {
VP9_COMMON *cm = &pbi->common;
if (cm->width != width || cm->height != height) {
- if (!pbi->initial_width || !pbi->initial_height) {
- if (vp9_alloc_frame_buffers(cm, width, height))
+ // Change in frame size.
+ // TODO(agrange) Don't test width/height, check overall size.
+ if (width > cm->width || height > cm->height) {
+ // Rescale frame buffers only if they're not big enough already.
+ if (vp9_resize_frame_buffers(cm, width, height))
vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
"Failed to allocate frame buffers");
- pbi->initial_width = width;
- pbi->initial_height = height;
- } else {
- if (width > pbi->initial_width)
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Frame width too large");
-
- if (height > pbi->initial_height)
- vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
- "Frame height too large");
}
cm->width = width;
@@ -718,9 +707,21 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) {
vp9_update_frame_size(cm);
}
- vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
- cm->subsampling_x, cm->subsampling_y,
- VP9BORDERINPIXELS);
+ if (cm->fb_list != NULL) {
+ vpx_codec_frame_buffer_t *const ext_fb = &cm->fb_list[cm->new_fb_idx];
+ if (vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
+ cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ VP9BORDERINPIXELS, ext_fb,
+ cm->realloc_fb_cb, cm->user_priv)) {
+ vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+ "Failed to allocate external frame buffer");
+ }
+ } else {
+ vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height,
+ cm->subsampling_x, cm->subsampling_y,
+ VP9BORDERINPIXELS, NULL, NULL, NULL);
+ }
}
static void setup_frame_size(VP9D_COMP *pbi,
@@ -737,7 +738,7 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi,
int width, height;
int found = 0, i;
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+ for (i = 0; i < REFS_PER_FRAME; ++i) {
if (vp9_rb_read_bit(rb)) {
YV12_BUFFER_CONFIG *const cfg = get_frame_ref_buffer(cm, i);
width = cfg->y_crop_width;
@@ -759,9 +760,10 @@ static void setup_frame_size_with_refs(VP9D_COMP *pbi,
}
static void setup_tile_context(VP9D_COMP *const pbi, MACROBLOCKD *const xd,
- int tile_col) {
+ int tile_row, int tile_col) {
int i;
- xd->mi_stream = pbi->mi_streams[tile_col];
+ const int tile_cols = 1 << pbi->common.log2_tile_cols;
+ xd->mi_stream = pbi->mi_streams[tile_row * tile_cols + tile_col];
for (i = 0; i < MAX_MB_PLANE; ++i) {
xd->above_context[i] = pbi->above_context[i];
@@ -793,8 +795,9 @@ static void decode_tile(VP9D_COMP *pbi, const TileInfo *const tile,
vp9_zero(xd->left_context);
vp9_zero(xd->left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
- mi_col += MI_BLOCK_SIZE)
+ mi_col += MI_BLOCK_SIZE) {
decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64);
+ }
if (pbi->do_loopfilter_inline) {
const int lf_start = mi_row - MI_BLOCK_SIZE;
@@ -864,77 +867,80 @@ static size_t get_tile(const uint8_t *const data_end,
return size;
}
-static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
- vp9_reader residual_bc;
+typedef struct TileBuffer {
+ const uint8_t *data;
+ size_t size;
+ int col; // only used with multi-threaded decoding
+} TileBuffer;
+static const uint8_t *decode_tiles(VP9D_COMP *pbi, const uint8_t *data) {
VP9_COMMON *const cm = &pbi->common;
MACROBLOCKD *const xd = &pbi->mb;
-
- const uint8_t *const data_end = pbi->source + pbi->source_sz;
- const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
+ const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
+ TileBuffer tile_buffers[4][1 << 6];
int tile_row, tile_col;
+ const uint8_t *const data_end = pbi->source + pbi->source_sz;
+ const uint8_t *end = NULL;
+ vp9_reader r;
+
+ assert(tile_rows <= 4);
+ assert(tile_cols <= (1 << 6));
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
vpx_memset(pbi->above_context[0], 0,
- sizeof(*pbi->above_context[0]) * MAX_MB_PLANE *
- 2 * aligned_mi_cols);
+ sizeof(*pbi->above_context[0]) * MAX_MB_PLANE * 2 * aligned_cols);
vpx_memset(pbi->above_seg_context, 0,
- sizeof(*pbi->above_seg_context) * aligned_mi_cols);
-
- if (pbi->oxcf.inv_tile_order) {
- const uint8_t *data_ptr2[4][1 << 6];
- vp9_reader bc_bak = {0};
-
- // pre-initialize the offsets, we're going to decode in inverse order
- data_ptr2[0][0] = data;
- for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- const int last_tile =
- tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
- const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
- data_ptr2[tile_row][tile_col] = data;
- data += size;
- }
+ sizeof(*pbi->above_seg_context) * aligned_cols);
+
+ // Load tile data into tile_buffers
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ const int last_tile = tile_row == tile_rows - 1 &&
+ tile_col == tile_cols - 1;
+ const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
+ TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
+ buf->data = data;
+ buf->size = size;
+ data += size;
}
+ }
- for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- for (tile_col = tile_cols - 1; tile_col >= 0; tile_col--) {
- TileInfo tile;
-
- vp9_tile_init(&tile, cm, tile_row, tile_col);
- setup_token_decoder(data_ptr2[tile_row][tile_col], data_end,
- data_end - data_ptr2[tile_row][tile_col],
- &cm->error, &residual_bc);
- setup_tile_context(pbi, xd, tile_col);
- decode_tile(pbi, &tile, &residual_bc);
- if (tile_row == tile_rows - 1 && tile_col == tile_cols - 1)
- bc_bak = residual_bc;
- }
- }
- residual_bc = bc_bak;
- } else {
- for (tile_row = 0; tile_row < tile_rows; tile_row++) {
- for (tile_col = 0; tile_col < tile_cols; tile_col++) {
- const int last_tile =
- tile_row == tile_rows - 1 && tile_col == tile_cols - 1;
- const size_t size = get_tile(data_end, last_tile, &cm->error, &data);
- TileInfo tile;
-
- vp9_tile_init(&tile, cm, tile_row, tile_col);
-
- setup_token_decoder(data, data_end, size, &cm->error, &residual_bc);
- setup_tile_context(pbi, xd, tile_col);
- decode_tile(pbi, &tile, &residual_bc);
- data += size;
- }
+ // Decode tiles using data from tile_buffers
+ for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
+ for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
+ const int col = pbi->oxcf.inv_tile_order ? tile_cols - tile_col - 1
+ : tile_col;
+ const int last_tile = tile_row == tile_rows - 1 &&
+ col == tile_cols - 1;
+ const TileBuffer *const buf = &tile_buffers[tile_row][col];
+ TileInfo tile;
+
+ vp9_tile_init(&tile, cm, tile_row, col);
+ setup_token_decoder(buf->data, data_end, buf->size, &cm->error, &r);
+ setup_tile_context(pbi, xd, tile_row, col);
+ decode_tile(pbi, &tile, &r);
+
+ if (last_tile)
+ end = vp9_reader_find_end(&r);
}
}
- return vp9_reader_find_end(&residual_bc);
+ return end;
+}
+
+static void setup_tile_macroblockd(TileWorkerData *const tile_data) {
+ MACROBLOCKD *xd = &tile_data->xd;
+ struct macroblockd_plane *const pd = xd->plane;
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ pd[i].dqcoeff = tile_data->dqcoeff[i];
+ vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t));
+ }
}
static int tile_worker_hook(void *arg1, void *arg2) {
@@ -947,22 +953,40 @@ static int tile_worker_hook(void *arg1, void *arg2) {
vp9_zero(tile_data->xd.left_context);
vp9_zero(tile_data->xd.left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
- mi_col += MI_BLOCK_SIZE)
+ mi_col += MI_BLOCK_SIZE) {
decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64);
+ }
}
return !tile_data->xd.corrupted;
}
+// sorts in descending order
+static int compare_tile_buffers(const void *a, const void *b) {
+ const TileBuffer *const buf1 = (const TileBuffer*)a;
+ const TileBuffer *const buf2 = (const TileBuffer*)b;
+ if (buf1->size < buf2->size) {
+ return 1;
+ } else if (buf1->size == buf2->size) {
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) {
VP9_COMMON *const cm = &pbi->common;
+ const uint8_t *bit_reader_end = NULL;
const uint8_t *const data_end = pbi->source + pbi->source_sz;
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
const int num_workers = MIN(pbi->oxcf.max_threads & ~1, tile_cols);
- int tile_col = 0;
+ TileBuffer tile_buffers[1 << 6];
+ int n;
+ int final_worker = -1;
+ assert(tile_cols <= (1 << 6));
assert(tile_rows == 1);
(void)tile_rows;
@@ -995,47 +1019,82 @@ static const uint8_t *decode_tiles_mt(VP9D_COMP *pbi, const uint8_t *data) {
vpx_memset(pbi->above_seg_context, 0,
sizeof(*pbi->above_seg_context) * aligned_mi_cols);
- while (tile_col < tile_cols) {
+ // Load tile data into tile_buffers
+ for (n = 0; n < tile_cols; ++n) {
+ const size_t size =
+ get_tile(data_end, n == tile_cols - 1, &cm->error, &data);
+ TileBuffer *const buf = &tile_buffers[n];
+ buf->data = data;
+ buf->size = size;
+ buf->col = n;
+ data += size;
+ }
+
+ // Sort the buffers based on size in descending order.
+ qsort(tile_buffers, tile_cols, sizeof(tile_buffers[0]), compare_tile_buffers);
+
+  // Rearrange the tile buffers so that, within each group of tiles, the
+  // largest (and presumably the most difficult) tile is decoded by the main
+  // thread. This should help minimize the number of instances where the main
+  // thread is waiting for a worker to complete.
+ {
+ int group_start = 0;
+ while (group_start < tile_cols) {
+ const TileBuffer largest = tile_buffers[group_start];
+ const int group_end = MIN(group_start + num_workers, tile_cols) - 1;
+ memmove(tile_buffers + group_start, tile_buffers + group_start + 1,
+ (group_end - group_start) * sizeof(tile_buffers[0]));
+ tile_buffers[group_end] = largest;
+ group_start = group_end + 1;
+ }
+ }
+
+ n = 0;
+ while (n < tile_cols) {
int i;
- for (i = 0; i < num_workers && tile_col < tile_cols; ++i) {
+ for (i = 0; i < num_workers && n < tile_cols; ++i) {
VP9Worker *const worker = &pbi->tile_workers[i];
TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
TileInfo *const tile = (TileInfo*)worker->data2;
- const size_t size =
- get_tile(data_end, tile_col == tile_cols - 1, &cm->error, &data);
+ TileBuffer *const buf = &tile_buffers[n];
tile_data->cm = cm;
tile_data->xd = pbi->mb;
tile_data->xd.corrupted = 0;
- vp9_tile_init(tile, tile_data->cm, 0, tile_col);
+ vp9_tile_init(tile, tile_data->cm, 0, buf->col);
- setup_token_decoder(data, data_end, size, &cm->error,
+ setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
&tile_data->bit_reader);
- setup_tile_context(pbi, &tile_data->xd, tile_col);
+ setup_tile_context(pbi, &tile_data->xd, 0, buf->col);
+ setup_tile_macroblockd(tile_data);
worker->had_error = 0;
- if (i == num_workers - 1 || tile_col == tile_cols - 1) {
+ if (i == num_workers - 1 || n == tile_cols - 1) {
vp9_worker_execute(worker);
} else {
vp9_worker_launch(worker);
}
- data += size;
- ++tile_col;
+ if (buf->col == tile_cols - 1) {
+ final_worker = i;
+ }
+
+ ++n;
}
for (; i > 0; --i) {
VP9Worker *const worker = &pbi->tile_workers[i - 1];
pbi->mb.corrupted |= !vp9_worker_sync(worker);
}
+ if (final_worker > -1) {
+ TileWorkerData *const tile_data =
+ (TileWorkerData*)pbi->tile_workers[final_worker].data1;
+ bit_reader_end = vp9_reader_find_end(&tile_data->bit_reader);
+ final_worker = -1;
+ }
}
- {
- const int final_worker = (tile_cols + num_workers - 1) % num_workers;
- TileWorkerData *const tile_data =
- (TileWorkerData*)pbi->tile_workers[final_worker].data1;
- return vp9_reader_find_end(&tile_data->bit_reader);
- }
+ return bit_reader_end;
}
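
The rearrangement loop above works together with compare_tile_buffers(): tile buffers are sorted by compressed size in descending order, and the largest tile of each group of num_workers tiles is rotated to the end of its group, since the last tile handed out in a batch is the one vp9_worker_execute() runs synchronously on the calling (main) thread. A minimal standalone sketch of the same scheduling on a bare array of sizes (the names and the helper itself are illustrative, not part of the patch):

#include <stdlib.h>
#include <string.h>

/* Descending order, mirroring compare_tile_buffers() above. */
static int cmp_size_desc(const void *a, const void *b) {
  const size_t sa = *(const size_t *)a;
  const size_t sb = *(const size_t *)b;
  return (sa < sb) - (sa > sb);
}

/* Sort sizes largest-first, then move the largest tile of every group of
 * num_workers tiles to the end of its group so the main thread decodes it. */
static void schedule_tiles(size_t *sizes, int tile_cols, int num_workers) {
  int group_start = 0;
  qsort(sizes, (size_t)tile_cols, sizeof(*sizes), cmp_size_desc);
  while (group_start < tile_cols) {
    const size_t largest = sizes[group_start];
    const int group_end = (group_start + num_workers < tile_cols ?
                           group_start + num_workers : tile_cols) - 1;
    memmove(sizes + group_start, sizes + group_start + 1,
            (size_t)(group_end - group_start) * sizeof(*sizes));
    sizes[group_end] = largest;
    group_start = group_end + 1;
  }
}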
static void check_sync_code(VP9_COMMON *cm, struct vp9_read_bit_buffer *rb) {
@@ -1108,9 +1167,9 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
}
}
- pbi->refresh_frame_flags = (1 << NUM_REF_FRAMES) - 1;
+ pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+ for (i = 0; i < REFS_PER_FRAME; ++i)
cm->active_ref_idx[i] = cm->new_fb_idx;
setup_frame_size(pbi, rb);
@@ -1123,13 +1182,13 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
if (cm->intra_only) {
check_sync_code(cm, rb);
- pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES);
+ pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
setup_frame_size(pbi, rb);
} else {
- pbi->refresh_frame_flags = vp9_rb_read_literal(rb, NUM_REF_FRAMES);
+ pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
- const int ref = vp9_rb_read_literal(rb, NUM_REF_FRAMES_LOG2);
+ for (i = 0; i < REFS_PER_FRAME; ++i) {
+ const int ref = vp9_rb_read_literal(rb, REF_FRAMES_LOG2);
cm->active_ref_idx[i] = cm->ref_frame_map[ref];
cm->ref_frame_sign_bias[LAST_FRAME + i] = vp9_rb_read_bit(rb);
}
@@ -1139,8 +1198,12 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
cm->allow_high_precision_mv = vp9_rb_read_bit(rb);
cm->mcomp_filter_type = read_interp_filter_type(rb);
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+ for (i = 0; i < REFS_PER_FRAME; ++i) {
vp9_setup_scale_factors(cm, i);
+ if (vp9_is_scaled(&cm->active_ref_scale_comm[i]))
+ vp9_extend_frame_borders(&cm->yv12_fb[cm->active_ref_idx[i]],
+ cm->subsampling_x, cm->subsampling_y);
+ }
}
}
@@ -1154,7 +1217,7 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi,
// This flag will be overridden by the call to vp9_setup_past_independence
// below, forcing the use of context 0 for those frame types.
- cm->frame_context_idx = vp9_rb_read_literal(rb, NUM_FRAME_CONTEXTS_LOG2);
+ cm->frame_context_idx = vp9_rb_read_literal(rb, FRAME_CONTEXTS_LOG2);
if (frame_is_intra_only(cm) || cm->error_resilient_mode)
vp9_setup_past_independence(cm);
@@ -1187,7 +1250,7 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
cm->tx_mode = xd->lossless ? ONLY_4X4 : read_tx_mode(&r);
if (cm->tx_mode == TX_MODE_SELECT)
- read_tx_probs(&fc->tx_probs, &r);
+ read_tx_mode_probs(&fc->tx_probs, &r);
read_coef_probs(fc, cm->tx_mode, &r);
for (k = 0; k < MBSKIP_CONTEXTS; ++k)
@@ -1205,7 +1268,8 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
vp9_diff_update_prob(&r, &fc->intra_inter_prob[i]);
- read_comp_pred(cm, &r);
+ cm->reference_mode = read_reference_mode(cm, &r);
+ read_reference_mode_probs(cm, &r);
for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
for (i = 0; i < INTRA_MODES - 1; ++i)
@@ -1213,9 +1277,9 @@ static int read_compressed_header(VP9D_COMP *pbi, const uint8_t *data,
for (j = 0; j < PARTITION_CONTEXTS; ++j)
for (i = 0; i < PARTITION_TYPES - 1; ++i)
- vp9_diff_update_prob(&r, &fc->partition_prob[INTER_FRAME][j][i]);
+ vp9_diff_update_prob(&r, &fc->partition_prob[j][i]);
- read_mv_probs(&r, nmvc, cm->allow_high_precision_mv);
+ read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
}
return vp9_reader_has_error(&r);
@@ -1311,9 +1375,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
}
}
- alloc_tile_storage(pbi, tile_cols);
+ alloc_tile_storage(pbi, tile_rows, tile_cols);
- xd->mi_8x8 = cm->mi_grid_visible;
xd->mode_info_stride = cm->mode_info_stride;
set_prev_mi(cm);
@@ -1323,7 +1386,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
cm->fc = cm->frame_contexts[cm->frame_context_idx];
vp9_zero(cm->counts);
for (i = 0; i < MAX_MB_PLANE; ++i)
- vp9_zero(xd->plane[i].qcoeff);
+ vpx_memset(xd->plane[i].dqcoeff, 0, 64 * 64 * sizeof(int16_t));
xd->corrupted = 0;
new_fb->corrupted = read_compressed_header(pbi, data, first_partition_size);
diff --git a/source/libvpx/vp9/decoder/vp9_decodframe.h b/source/libvpx/vp9/decoder/vp9_decodeframe.h
index c665f6f..7245a98 100644
--- a/source/libvpx/vp9/decoder/vp9_decodframe.h
+++ b/source/libvpx/vp9/decoder/vp9_decodeframe.h
@@ -9,8 +9,8 @@
*/
-#ifndef VP9_DECODER_VP9_DECODFRAME_H_
-#define VP9_DECODER_VP9_DECODFRAME_H_
+#ifndef VP9_DECODER_VP9_DECODEFRAME_H_
+#define VP9_DECODER_VP9_DECODEFRAME_H_
struct VP9Common;
struct VP9Decompressor;
@@ -18,4 +18,4 @@ struct VP9Decompressor;
void vp9_init_dequantizer(struct VP9Common *cm);
int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
-#endif // VP9_DECODER_VP9_DECODFRAME_H_
+#endif // VP9_DECODER_VP9_DECODEFRAME_H_
diff --git a/source/libvpx/vp9/decoder/vp9_decodemv.c b/source/libvpx/vp9/decoder/vp9_decodemv.c
index 475a299..4e2bc35 100644
--- a/source/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/source/libvpx/vp9/decoder/vp9_decodemv.c
@@ -20,13 +20,13 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_seg_common.h"
+#include "vp9/decoder/vp9_dboolhuff.h"
#include "vp9/decoder/vp9_decodemv.h"
-#include "vp9/decoder/vp9_decodframe.h"
+#include "vp9/decoder/vp9_decodeframe.h"
#include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/decoder/vp9_treereader.h"
static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
- return (MB_PREDICTION_MODE)treed_read(r, vp9_intra_mode_tree, p);
+ return (MB_PREDICTION_MODE)vp9_read_tree(r, vp9_intra_mode_tree, p);
}
static MB_PREDICTION_MODE read_intra_mode_y(VP9_COMMON *cm, vp9_reader *r,
@@ -48,51 +48,49 @@ static MB_PREDICTION_MODE read_intra_mode_uv(VP9_COMMON *cm, vp9_reader *r,
}
static MB_PREDICTION_MODE read_inter_mode(VP9_COMMON *cm, vp9_reader *r,
- uint8_t context) {
- const MB_PREDICTION_MODE mode = treed_read(r, vp9_inter_mode_tree,
- cm->fc.inter_mode_probs[context]);
+ int ctx) {
+ const int mode = vp9_read_tree(r, vp9_inter_mode_tree,
+ cm->fc.inter_mode_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.inter_mode[context][inter_mode_offset(mode)];
- return mode;
+ ++cm->counts.inter_mode[ctx][mode];
+
+ return NEARESTMV + mode;
}
static int read_segment_id(vp9_reader *r, const struct segmentation *seg) {
- return treed_read(r, vp9_segment_tree, seg->tree_probs);
+ return vp9_read_tree(r, vp9_segment_tree, seg->tree_probs);
}
static TX_SIZE read_selected_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd,
- BLOCK_SIZE bsize, vp9_reader *r) {
- const uint8_t context = vp9_get_pred_context_tx_size(xd);
- const vp9_prob *tx_probs = get_tx_probs(bsize, context, &cm->fc.tx_probs);
+ TX_SIZE max_tx_size, vp9_reader *r) {
+ const int ctx = vp9_get_tx_size_context(xd);
+ const vp9_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc.tx_probs);
TX_SIZE tx_size = vp9_read(r, tx_probs[0]);
- if (tx_size != TX_4X4 && bsize >= BLOCK_16X16) {
+ if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
tx_size += vp9_read(r, tx_probs[1]);
- if (tx_size != TX_8X8 && bsize >= BLOCK_32X32)
+ if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
tx_size += vp9_read(r, tx_probs[2]);
}
if (!cm->frame_parallel_decoding_mode)
- ++get_tx_counts(bsize, context, &cm->counts.tx)[tx_size];
+ ++get_tx_counts(max_tx_size, ctx, &cm->counts.tx)[tx_size];
return tx_size;
}
-static TX_SIZE read_tx_size(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- TX_MODE tx_mode, BLOCK_SIZE bsize, int allow_select,
- vp9_reader *r) {
- if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8) {
- return read_selected_tx_size(cm, xd, bsize, r);
- } else {
- const TX_SIZE max_tx_size_block = max_txsize_lookup[bsize];
- const TX_SIZE max_tx_size_txmode = tx_mode_to_biggest_tx_size[tx_mode];
- return MIN(max_tx_size_block, max_tx_size_txmode);
- }
+static TX_SIZE read_tx_size(VP9_COMMON *cm, MACROBLOCKD *xd, TX_MODE tx_mode,
+ BLOCK_SIZE bsize, int allow_select, vp9_reader *r) {
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+ if (allow_select && tx_mode == TX_MODE_SELECT && bsize >= BLOCK_8X8)
+ return read_selected_tx_size(cm, xd, max_tx_size, r);
+ else
+ return MIN(max_tx_size, tx_mode_to_biggest_tx_size[tx_mode]);
}
static void set_segment_id(VP9_COMMON *cm, BLOCK_SIZE bsize,
int mi_row, int mi_col, int segment_id) {
const int mi_offset = mi_row * cm->mi_cols + mi_col;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
+ const int bw = num_8x8_blocks_wide_lookup[bsize];
+ const int bh = num_8x8_blocks_high_lookup[bsize];
const int xmis = MIN(cm->mi_cols - mi_col, bw);
const int ymis = MIN(cm->mi_rows - mi_row, bh);
int x, y;
@@ -125,23 +123,23 @@ static int read_intra_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
int mi_row, int mi_col, vp9_reader *r) {
struct segmentation *const seg = &cm->seg;
- const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
- int pred_segment_id, segment_id;
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ int predicted_segment_id, segment_id;
if (!seg->enabled)
return 0; // Default for disabled segmentation
- pred_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
- bsize, mi_row, mi_col);
+ predicted_segment_id = vp9_get_segment_id(cm, cm->last_frame_seg_map,
+ bsize, mi_row, mi_col);
if (!seg->update_map)
- return pred_segment_id;
+ return predicted_segment_id;
if (seg->temporal_update) {
const vp9_prob pred_prob = vp9_get_pred_prob_seg_id(seg, xd);
- const int pred_flag = vp9_read(r, pred_prob);
- vp9_set_pred_flag_seg_id(xd, pred_flag);
- segment_id = pred_flag ? pred_segment_id
- : read_segment_id(r, seg);
+ mbmi->seg_id_predicted = vp9_read(r, pred_prob);
+ segment_id = mbmi->seg_id_predicted ? predicted_segment_id
+ : read_segment_id(r, seg);
} else {
segment_id = read_segment_id(r, seg);
}
@@ -149,26 +147,27 @@ static int read_inter_segment_id(VP9_COMMON *const cm, MACROBLOCKD *const xd,
return segment_id;
}
-static uint8_t read_skip_coeff(VP9_COMMON *const cm, MACROBLOCKD *const xd,
- int segment_id, vp9_reader *r) {
- int skip_coeff = vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
- if (!skip_coeff) {
- const int ctx = vp9_get_pred_context_mbskip(xd);
- skip_coeff = vp9_read(r, vp9_get_pred_prob_mbskip(cm, xd));
+static int read_skip_coeff(VP9_COMMON *cm, const MACROBLOCKD *xd,
+ int segment_id, vp9_reader *r) {
+ if (vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;
+ } else {
+ const int ctx = vp9_get_skip_context(xd);
+ const int skip = vp9_read(r, cm->fc.mbskip_probs[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++cm->counts.mbskip[ctx][skip_coeff];
+ ++cm->counts.mbskip[ctx][skip];
+ return skip;
}
- return skip_coeff;
}
static void read_intra_frame_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
- MODE_INFO *const m,
int mi_row, int mi_col, vp9_reader *r) {
- MB_MODE_INFO *const mbmi = &m->mbmi;
- const BLOCK_SIZE bsize = mbmi->sb_type;
+ MODE_INFO *const mi = xd->mi_8x8[0];
+ MB_MODE_INFO *const mbmi = &mi->mbmi;
const MODE_INFO *above_mi = xd->mi_8x8[-cm->mode_info_stride];
const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
mbmi->segment_id = read_intra_segment_id(cm, xd, mi_row, mi_col, r);
mbmi->skip_coeff = read_skip_coeff(cm, xd, mbmi->segment_id, r);
@@ -177,8 +176,8 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
mbmi->ref_frame[1] = NONE;
if (bsize >= BLOCK_8X8) {
- const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, 0);
- const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, 0);
+ const MB_PREDICTION_MODE A = above_block_mode(mi, above_mi, 0);
+ const MB_PREDICTION_MODE L = left_block_mode(mi, left_mi, 0);
mbmi->mode = read_intra_mode(r, vp9_kf_y_mode_prob[A][L]);
} else {
// Only 4x4, 4x8, 8x4 blocks
@@ -189,19 +188,19 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm,
for (idy = 0; idy < 2; idy += num_4x4_h) {
for (idx = 0; idx < 2; idx += num_4x4_w) {
const int ib = idy * 2 + idx;
- const MB_PREDICTION_MODE A = above_block_mode(m, above_mi, ib);
- const MB_PREDICTION_MODE L = left_block_mode(m, left_mi, ib);
+ const MB_PREDICTION_MODE A = above_block_mode(mi, above_mi, ib);
+ const MB_PREDICTION_MODE L = left_block_mode(mi, left_mi, ib);
const MB_PREDICTION_MODE b_mode = read_intra_mode(r,
vp9_kf_y_mode_prob[A][L]);
- m->bmi[ib].as_mode = b_mode;
+ mi->bmi[ib].as_mode = b_mode;
if (num_4x4_h == 2)
- m->bmi[ib + 2].as_mode = b_mode;
+ mi->bmi[ib + 2].as_mode = b_mode;
if (num_4x4_w == 2)
- m->bmi[ib + 1].as_mode = b_mode;
+ mi->bmi[ib + 1].as_mode = b_mode;
}
}
- mbmi->mode = m->bmi[3].as_mode;
+ mbmi->mode = mi->bmi[3].as_mode;
}
mbmi->uv_mode = read_intra_mode(r, vp9_kf_uv_mode_prob[mbmi->mode]);
@@ -211,12 +210,12 @@ static int read_mv_component(vp9_reader *r,
const nmv_component *mvcomp, int usehp) {
int mag, d, fr, hp;
const int sign = vp9_read(r, mvcomp->sign);
- const int mv_class = treed_read(r, vp9_mv_class_tree, mvcomp->classes);
+ const int mv_class = vp9_read_tree(r, vp9_mv_class_tree, mvcomp->classes);
const int class0 = mv_class == MV_CLASS_0;
// Integer part
if (class0) {
- d = treed_read(r, vp9_mv_class0_tree, mvcomp->class0);
+ d = vp9_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
} else {
int i;
const int n = mv_class + CLASS0_BITS - 1; // number of bits
@@ -227,8 +226,8 @@ static int read_mv_component(vp9_reader *r,
}
// Fractional part
- fr = treed_read(r, vp9_mv_fp_tree,
- class0 ? mvcomp->class0_fp[d] : mvcomp->fp);
+ fr = vp9_read_tree(r, vp9_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
+ : mvcomp->fp);
// High precision part (if hp is not used, the default value of the hp is 1)
@@ -243,7 +242,7 @@ static int read_mv_component(vp9_reader *r,
static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
const nmv_context *ctx,
nmv_context_counts *counts, int allow_hp) {
- const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints);
+ const MV_JOINT_TYPE j = vp9_read_tree(r, vp9_mv_joint_tree, ctx->joints);
const int use_hp = allow_hp && vp9_use_mv_hp(ref);
MV diff = {0, 0};
@@ -259,6 +258,15 @@ static INLINE void read_mv(vp9_reader *r, MV *mv, const MV *ref,
mv->col = ref->col + diff.col;
}
+static REFERENCE_MODE read_reference_mode(VP9_COMMON *cm, const MACROBLOCKD *xd,
+ vp9_reader *r) {
+ const int ctx = vp9_get_reference_mode_context(cm, xd);
+ const int mode = vp9_read(r, cm->fc.comp_inter_prob[ctx]);
+ if (!cm->frame_parallel_decoding_mode)
+ ++cm->counts.comp_inter[ctx][mode];
+ return mode; // SINGLE_REFERENCE or COMPOUND_REFERENCE
+}
+
 // Read the reference frame
static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
vp9_reader *r,
@@ -270,27 +278,20 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
ref_frame[0] = vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
ref_frame[1] = NONE;
} else {
- const int comp_ctx = vp9_get_pred_context_comp_inter_inter(cm, xd);
- int is_comp;
-
- if (cm->comp_pred_mode == HYBRID_PREDICTION) {
- is_comp = vp9_read(r, fc->comp_inter_prob[comp_ctx]);
- if (!cm->frame_parallel_decoding_mode)
- ++counts->comp_inter[comp_ctx][is_comp];
- } else {
- is_comp = cm->comp_pred_mode == COMP_PREDICTION_ONLY;
- }
+ const REFERENCE_MODE mode = (cm->reference_mode == REFERENCE_MODE_SELECT)
+ ? read_reference_mode(cm, xd, r)
+ : cm->reference_mode;
// FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
- if (is_comp) {
- const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
- const int ref_ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
- const int b = vp9_read(r, fc->comp_ref_prob[ref_ctx]);
+ if (mode == COMPOUND_REFERENCE) {
+ const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+ const int ctx = vp9_get_pred_context_comp_ref_p(cm, xd);
+ const int bit = vp9_read(r, fc->comp_ref_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
- ++counts->comp_ref[ref_ctx][b];
- ref_frame[fix_ref_idx] = cm->comp_fixed_ref;
- ref_frame[!fix_ref_idx] = cm->comp_var_ref[b];
- } else {
+ ++counts->comp_ref[ctx][bit];
+ ref_frame[idx] = cm->comp_fixed_ref;
+ ref_frame[!idx] = cm->comp_var_ref[bit];
+ } else if (mode == SINGLE_REFERENCE) {
const int ctx0 = vp9_get_pred_context_single_ref_p1(xd);
const int bit0 = vp9_read(r, fc->single_ref_prob[ctx0][0]);
if (!cm->frame_parallel_decoding_mode)
@@ -298,14 +299,16 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
if (bit0) {
const int ctx1 = vp9_get_pred_context_single_ref_p2(xd);
const int bit1 = vp9_read(r, fc->single_ref_prob[ctx1][1]);
- ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
if (!cm->frame_parallel_decoding_mode)
++counts->single_ref[ctx1][1][bit1];
+ ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
} else {
ref_frame[0] = LAST_FRAME;
}
ref_frame[1] = NONE;
+ } else {
+ assert(0 && "Invalid prediction mode.");
}
}
}
@@ -314,8 +317,8 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd,
static INLINE INTERPOLATION_TYPE read_switchable_filter_type(
VP9_COMMON *const cm, MACROBLOCKD *const xd, vp9_reader *r) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
- const int type = treed_read(r, vp9_switchable_interp_tree,
- cm->fc.switchable_interp_prob[ctx]);
+ const int type = vp9_read_tree(r, vp9_switchable_interp_tree,
+ cm->fc.switchable_interp_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.switchable_interp[ctx][type];
return type;
@@ -404,8 +407,8 @@ static int read_is_inter_block(VP9_COMMON *const cm, MACROBLOCKD *const xd,
return vp9_get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) !=
INTRA_FRAME;
} else {
- const int ctx = vp9_get_pred_context_intra_inter(xd);
- const int is_inter = vp9_read(r, vp9_get_pred_prob_intra_inter(cm, xd));
+ const int ctx = vp9_get_intra_inter_context(xd);
+ const int is_inter = vp9_read(r, cm->fc.intra_inter_prob[ctx]);
if (!cm->frame_parallel_decoding_mode)
++cm->counts.intra_inter[ctx][is_inter];
return is_inter;
@@ -422,19 +425,18 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
const int allow_hp = cm->allow_high_precision_mv;
int_mv nearest[2], nearmv[2], best[2];
- uint8_t inter_mode_ctx;
- MV_REFERENCE_FRAME ref0;
- int is_compound;
+ int inter_mode_ctx, ref, is_compound;
- mbmi->uv_mode = DC_PRED;
read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
- ref0 = mbmi->ref_frame[0];
is_compound = has_second_ref(mbmi);
- vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref0, mbmi->ref_mvs[ref0],
- mi_row, mi_col);
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+ vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, frame, mbmi->ref_mvs[frame],
+ mi_row, mi_col);
+ }
- inter_mode_ctx = mbmi->mode_context[ref0];
+ inter_mode_ctx = mbmi->mode_context[mbmi->ref_frame[0]];
if (vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
mbmi->mode = ZEROMV;
@@ -448,22 +450,11 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
mbmi->mode = read_inter_mode(cm, r, inter_mode_ctx);
}
- // nearest, nearby
if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
- vp9_find_best_ref_mvs(xd, allow_hp,
- mbmi->ref_mvs[ref0], &nearest[0], &nearmv[0]);
- best[0].as_int = nearest[0].as_int;
- }
-
- if (is_compound) {
- const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
- vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi,
- ref1, mbmi->ref_mvs[ref1], mi_row, mi_col);
-
- if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
- vp9_find_best_ref_mvs(xd, allow_hp,
- mbmi->ref_mvs[ref1], &nearest[1], &nearmv[1]);
- best[1].as_int = nearest[1].as_int;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ vp9_find_best_ref_mvs(xd, allow_hp, mbmi->ref_mvs[mbmi->ref_frame[ref]],
+ &nearest[ref], &nearmv[ref]);
+ best[ref].as_int = nearest[ref].as_int;
}
}
@@ -482,16 +473,10 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
const int j = idy * 2 + idx;
b_mode = read_inter_mode(cm, r, inter_mode_ctx);
- if (b_mode == NEARESTMV || b_mode == NEARMV) {
- vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[0],
- &nearmv[0], j, 0,
- mi_row, mi_col);
-
- if (is_compound)
- vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, &nearest[1],
- &nearmv[1], j, 1,
- mi_row, mi_col);
- }
+ if (b_mode == NEARESTMV || b_mode == NEARMV)
+ for (ref = 0; ref < 1 + is_compound; ++ref)
+ vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col,
+ &nearest[ref], &nearmv[ref]);
if (!assign_mv(cm, b_mode, block, best, nearest, nearmv,
is_compound, allow_hp, r)) {
@@ -499,7 +484,6 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
break;
};
-
mi->bmi[j].as_mv[0].as_int = block[0].as_int;
if (is_compound)
mi->bmi[j].as_mv[1].as_int = block[1].as_int;
@@ -525,8 +509,8 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm,
static void read_inter_frame_mode_info(VP9_COMMON *const cm,
MACROBLOCKD *const xd,
const TileInfo *const tile,
- MODE_INFO *const mi,
int mi_row, int mi_col, vp9_reader *r) {
+ MODE_INFO *const mi = xd->mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
int inter_block;
@@ -544,25 +528,10 @@ static void read_inter_frame_mode_info(VP9_COMMON *const cm,
read_intra_block_mode_info(cm, mi, r);
}
-void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd,
- const TileInfo *const tile,
+void vp9_read_mode_info(VP9_COMMON *cm, MACROBLOCKD *xd, const TileInfo *tile,
int mi_row, int mi_col, vp9_reader *r) {
- MODE_INFO *const mi = xd->mi_8x8[0];
- const BLOCK_SIZE bsize = mi->mbmi.sb_type;
- const int bw = 1 << mi_width_log2(bsize);
- const int bh = 1 << mi_height_log2(bsize);
- const int y_mis = MIN(bh, cm->mi_rows - mi_row);
- const int x_mis = MIN(bw, cm->mi_cols - mi_col);
- int x, y, z;
-
if (frame_is_intra_only(cm))
- read_intra_frame_mode_info(cm, xd, mi, mi_row, mi_col, r);
+ read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
else
- read_inter_frame_mode_info(cm, xd, tile, mi, mi_row, mi_col, r);
-
- for (y = 0, z = 0; y < y_mis; y++, z += cm->mode_info_stride) {
- for (x = !y; x < x_mis; x++) {
- xd->mi_8x8[z + x] = mi;
- }
- }
+ read_inter_frame_mode_info(cm, xd, tile, mi_row, mi_col, r);
}
diff --git a/source/libvpx/vp9/decoder/vp9_detokenize.c b/source/libvpx/vp9/decoder/vp9_detokenize.c
index 6ecce28..63f1731 100644
--- a/source/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/source/libvpx/vp9/decoder/vp9_detokenize.c
@@ -13,24 +13,20 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_seg_common.h"
-#include "vp9/decoder/vp9_dboolhuff.h"
#include "vp9/decoder/vp9_detokenize.h"
-#include "vp9/decoder/vp9_onyxd_int.h"
-#include "vp9/decoder/vp9_treereader.h"
#define EOB_CONTEXT_NODE 0
#define ZERO_CONTEXT_NODE 1
#define ONE_CONTEXT_NODE 2
-#define LOW_VAL_CONTEXT_NODE 3
-#define TWO_CONTEXT_NODE 4
-#define THREE_CONTEXT_NODE 5
-#define HIGH_LOW_CONTEXT_NODE 6
-#define CAT_ONE_CONTEXT_NODE 7
-#define CAT_THREEFOUR_CONTEXT_NODE 8
-#define CAT_THREE_CONTEXT_NODE 9
-#define CAT_FIVE_CONTEXT_NODE 10
+#define LOW_VAL_CONTEXT_NODE 0
+#define TWO_CONTEXT_NODE 1
+#define THREE_CONTEXT_NODE 2
+#define HIGH_LOW_CONTEXT_NODE 3
+#define CAT_ONE_CONTEXT_NODE 4
+#define CAT_THREEFOUR_CONTEXT_NODE 5
+#define CAT_THREE_CONTEXT_NODE 6
+#define CAT_FIVE_CONTEXT_NODE 7
#define CAT1_MIN_VAL 5
#define CAT2_MIN_VAL 7
@@ -61,92 +57,86 @@ static const vp9_prob cat6_prob[15] = {
254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0
};
-#define INCREMENT_COUNT(token) \
- do { \
- if (!cm->frame_parallel_decoding_mode) { \
- ++coef_counts[type][ref][band][pt] \
- [token >= TWO_TOKEN ? \
- (token == DCT_EOB_TOKEN ? \
- DCT_EOB_MODEL_TOKEN : TWO_TOKEN) : \
- token]; \
- } \
- token_cache[scan[c]] = vp9_pt_energy_class[token]; \
+#define INCREMENT_COUNT(token) \
+ do { \
+ if (!cm->frame_parallel_decoding_mode) \
+ ++coef_counts[band][ctx][token]; \
} while (0)
#define WRITE_COEF_CONTINUE(val, token) \
{ \
- qcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
- dq[c > 0] / (1 + (tx_size == TX_32X32)); \
- INCREMENT_COUNT(token); \
- c++; \
+ v = (val * dqv) >> dq_shift; \
+ dqcoeff[scan[c]] = vp9_read_bit(r) ? -v : v; \
+ token_cache[scan[c]] = vp9_pt_energy_class[token]; \
+ ++c; \
+ ctx = get_coef_context(nb, token_cache, c); \
+ dqv = dq[1]; \
continue; \
}
-#define ADJUST_COEF(prob, bits_count) \
- do { \
- if (vp9_read(r, prob)) \
- val += 1 << bits_count; \
- } while (0);
+#define ADJUST_COEF(prob, bits_count) \
+ do { \
+ val += (vp9_read(r, prob) << bits_count); \
+ } while (0)
-static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
- vp9_reader *r, int block_idx,
- PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr,
- TX_SIZE tx_size, const int16_t *dq, int pt) {
+static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, int block,
+ PLANE_TYPE type, int16_t *dqcoeff, TX_SIZE tx_size,
+ const int16_t *dq, int ctx, vp9_reader *r) {
+ const int max_eob = 16 << (tx_size << 1);
const FRAME_CONTEXT *const fc = &cm->fc;
FRAME_COUNTS *const counts = &cm->counts;
const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
int band, c = 0;
- const vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] =
+ const vp9_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
fc->coef_probs[tx_size][type][ref];
- vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
- uint8_t load_map[COEF_BANDS][PREV_COEF_CONTEXTS] = { { 0 } };
const vp9_prob *prob;
- vp9_coeff_count_model *coef_counts = counts->coef[tx_size];
- const int16_t *scan, *nb;
- const uint8_t *const band_translate = get_band_translate(tx_size);
- uint8_t token_cache[1024];
- get_scan(xd, tx_size, type, block_idx, &scan, &nb);
-
- while (1) {
+ unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1] =
+ counts->coef[tx_size][type][ref];
+ unsigned int (*eob_branch_count)[COEFF_CONTEXTS] =
+ counts->eob_branch[tx_size][type][ref];
+ uint8_t token_cache[32 * 32];
+ const uint8_t *cat6;
+ const uint8_t *band_translate = get_band_translate(tx_size);
+ const int dq_shift = (tx_size == TX_32X32);
+ const scan_order *so = get_scan(xd, tx_size, type, block);
+ const int16_t *scan = so->scan;
+ const int16_t *nb = so->neighbors;
+ int v;
+ int16_t dqv = dq[0];
+
+ while (c < max_eob) {
int val;
- const uint8_t *cat6 = cat6_prob;
- if (c >= seg_eob)
- break;
- if (c)
- pt = get_coef_context(nb, token_cache, c);
- band = get_coef_band(band_translate, c);
- prob = coef_probs[band][pt];
+ band = *band_translate++;
+ prob = coef_probs[band][ctx];
if (!cm->frame_parallel_decoding_mode)
- ++counts->eob_branch[tx_size][type][ref][band][pt];
- if (!vp9_read(r, prob[EOB_CONTEXT_NODE]))
+ ++eob_branch_count[band][ctx];
+ if (!vp9_read(r, prob[EOB_CONTEXT_NODE])) {
+ INCREMENT_COUNT(EOB_MODEL_TOKEN);
break;
+ }
- SKIP_START:
- if (c >= seg_eob)
- break;
- if (c)
- pt = get_coef_context(nb, token_cache, c);
- band = get_coef_band(band_translate, c);
- prob = coef_probs[band][pt];
-
- if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
+ while (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
INCREMENT_COUNT(ZERO_TOKEN);
+ dqv = dq[1];
+ token_cache[scan[c]] = 0;
++c;
- goto SKIP_START;
+ if (c >= max_eob)
+ return c; // zero tokens at the end (no eob token)
+ ctx = get_coef_context(nb, token_cache, c);
+ band = *band_translate++;
+ prob = coef_probs[band][ctx];
}
// ONE_CONTEXT_NODE_0_
if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
+ INCREMENT_COUNT(ONE_TOKEN);
WRITE_COEF_CONTINUE(1, ONE_TOKEN);
}
- // Load full probabilities if not already loaded
- if (!load_map[band][pt]) {
- vp9_model_to_full_probs(coef_probs[band][pt],
- coef_probs_full[band][pt]);
- load_map[band][pt] = 1;
- }
- prob = coef_probs_full[band][pt];
- // LOW_VAL_CONTEXT_NODE_0_
+
+ INCREMENT_COUNT(TWO_TOKEN);
+
+ prob = vp9_pareto8_full[prob[PIVOT_NODE] - 1];
+
if (!vp9_read(r, prob[LOW_VAL_CONTEXT_NODE])) {
if (!vp9_read(r, prob[TWO_CONTEXT_NODE])) {
WRITE_COEF_CONTINUE(2, TWO_TOKEN);
@@ -156,35 +146,35 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
}
WRITE_COEF_CONTINUE(4, FOUR_TOKEN);
}
- // HIGH_LOW_CONTEXT_NODE_0_
+
if (!vp9_read(r, prob[HIGH_LOW_CONTEXT_NODE])) {
if (!vp9_read(r, prob[CAT_ONE_CONTEXT_NODE])) {
val = CAT1_MIN_VAL;
ADJUST_COEF(CAT1_PROB0, 0);
- WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY1);
+ WRITE_COEF_CONTINUE(val, CATEGORY1_TOKEN);
}
val = CAT2_MIN_VAL;
ADJUST_COEF(CAT2_PROB1, 1);
ADJUST_COEF(CAT2_PROB0, 0);
- WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY2);
+ WRITE_COEF_CONTINUE(val, CATEGORY2_TOKEN);
}
- // CAT_THREEFOUR_CONTEXT_NODE_0_
+
if (!vp9_read(r, prob[CAT_THREEFOUR_CONTEXT_NODE])) {
if (!vp9_read(r, prob[CAT_THREE_CONTEXT_NODE])) {
val = CAT3_MIN_VAL;
ADJUST_COEF(CAT3_PROB2, 2);
ADJUST_COEF(CAT3_PROB1, 1);
ADJUST_COEF(CAT3_PROB0, 0);
- WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY3);
+ WRITE_COEF_CONTINUE(val, CATEGORY3_TOKEN);
}
val = CAT4_MIN_VAL;
ADJUST_COEF(CAT4_PROB3, 3);
ADJUST_COEF(CAT4_PROB2, 2);
ADJUST_COEF(CAT4_PROB1, 1);
ADJUST_COEF(CAT4_PROB0, 0);
- WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY4);
+ WRITE_COEF_CONTINUE(val, CATEGORY4_TOKEN);
}
- // CAT_FIVE_CONTEXT_NODE_0_:
+
if (!vp9_read(r, prob[CAT_FIVE_CONTEXT_NODE])) {
val = CAT5_MIN_VAL;
ADJUST_COEF(CAT5_PROB4, 4);
@@ -192,19 +182,15 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
ADJUST_COEF(CAT5_PROB2, 2);
ADJUST_COEF(CAT5_PROB1, 1);
ADJUST_COEF(CAT5_PROB0, 0);
- WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY5);
+ WRITE_COEF_CONTINUE(val, CATEGORY5_TOKEN);
}
val = 0;
- while (*cat6) {
+ cat6 = cat6_prob;
+ while (*cat6)
val = (val << 1) | vp9_read(r, *cat6++);
- }
val += CAT6_MIN_VAL;
- WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
- }
- if (c < seg_eob) {
- if (!cm->frame_parallel_decoding_mode)
- ++coef_counts[type][ref][band][pt][DCT_EOB_MODEL_TOKEN];
+ WRITE_COEF_CONTINUE(val, CATEGORY6_TOKEN);
}
return c;
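
For reference, the arithmetic the new WRITE_COEF_CONTINUE path applies to every decoded token value: the first coefficient in scan order uses the DC quantizer dq[0] and all later ones use dq[1], a 32x32 transform drops one bit of precision through dq_shift, and the sign comes from a single raw bit. A small sketch of just that dequantization step (hypothetical helper, not part of the patch):

#include <stdint.h>

/* Mirrors the v = (val * dqv) >> dq_shift line above; the sign is read as a
 * raw bit in the real decoder and is passed in here for illustration. */
static int16_t dequant_coeff(int val, int coeff_index, const int16_t dq[2],
                             int is_tx_32x32, int sign_bit) {
  const int dq_shift = is_tx_32x32 ? 1 : 0;
  const int dqv = (coeff_index == 0) ? dq[0] : dq[1];
  const int v = (val * dqv) >> dq_shift;
  return (int16_t)(sign_bit ? -v : v);
}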
@@ -212,22 +198,14 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, vp9_reader *r) {
+ int x, int y, TX_SIZE tx_size, vp9_reader *r) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
- tx_size);
- int aoff, loff, eob, pt;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
- pt = get_entropy_context(tx_size, pd->above_context + aoff,
- pd->left_context + loff);
-
- eob = decode_coefs(cm, xd, r, block,
- pd->plane_type, seg_eob, BLOCK_OFFSET(pd->qcoeff, block),
- tx_size, pd->dequant, pt);
-
- set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, aoff, loff);
-
- pd->eobs[block] = eob;
+ const int ctx = get_entropy_context(tx_size, pd->above_context + x,
+ pd->left_context + y);
+ const int eob = decode_coefs(cm, xd, block, pd->plane_type,
+ BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
+ pd->dequant, ctx, r);
+ set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
return eob;
}
diff --git a/source/libvpx/vp9/decoder/vp9_detokenize.h b/source/libvpx/vp9/decoder/vp9_detokenize.h
index 94dd8e4..2a88073 100644
--- a/source/libvpx/vp9/decoder/vp9_detokenize.h
+++ b/source/libvpx/vp9/decoder/vp9_detokenize.h
@@ -17,6 +17,6 @@
int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, vp9_reader *r);
+ int x, int y, TX_SIZE tx_size, vp9_reader *r);
#endif // VP9_DECODER_VP9_DETOKENIZE_H_
diff --git a/source/libvpx/vp9/decoder/vp9_onyxd_if.c b/source/libvpx/vp9/decoder/vp9_onyxd_if.c
index 5f970a3..e24ba42 100644
--- a/source/libvpx/vp9/decoder/vp9_onyxd_if.c
+++ b/source/libvpx/vp9/decoder/vp9_onyxd_if.c
@@ -25,7 +25,7 @@
#include "vpx_scale/vpx_scale.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vpx_ports/vpx_timer.h"
-#include "vp9/decoder/vp9_decodframe.h"
+#include "vp9/decoder/vp9_decodeframe.h"
#include "vp9/decoder/vp9_detokenize.h"
#include "./vpx_scale_rtcd.h"
@@ -107,6 +107,15 @@ void vp9_initialize_dec() {
}
}
+static void init_macroblockd(VP9D_COMP *const pbi) {
+ MACROBLOCKD *xd = &pbi->mb;
+ struct macroblockd_plane *const pd = xd->plane;
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; ++i)
+ pd[i].dqcoeff = pbi->dqcoeff[i];
+}
+
VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
VP9D_COMP *const pbi = vpx_memalign(32, sizeof(VP9D_COMP));
VP9_COMMON *const cm = pbi ? &pbi->common : NULL;
@@ -116,6 +125,9 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
vp9_zero(*pbi);
+ // Initialize the references to not point to any frame buffers.
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+
if (setjmp(cm->error.jmp)) {
cm->error.setjmp = 0;
vp9_remove_decompressor(pbi);
@@ -141,6 +153,8 @@ VP9D_PTR vp9_create_decompressor(VP9D_CONFIG *oxcf) {
cm->error.setjmp = 0;
pbi->decoded_key_frame = 0;
+ init_macroblockd(pbi);
+
vp9_worker_init(&pbi->lf_worker);
return pbi;
@@ -247,7 +261,7 @@ int vp9_get_reference_dec(VP9D_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) {
VP9D_COMP *pbi = (VP9D_COMP *) ptr;
VP9_COMMON *cm = &pbi->common;
- if (index < 0 || index >= NUM_REF_FRAMES)
+ if (index < 0 || index >= REF_FRAMES)
return -1;
*fb = &cm->yv12_fb[cm->ref_frame_map[index]];
@@ -365,10 +379,6 @@ int vp9_receive_compressed_data(VP9D_PTR ptr,
cm->current_video_frame + 3000);
#endif
- vp9_extend_frame_inner_borders(cm->frame_to_show,
- cm->subsampling_x,
- cm->subsampling_y);
-
#if WRITE_RECON_BUFFER == 1
if (cm->show_frame)
recon_write_yuv_frame("recon.yuv", cm->frame_to_show,
diff --git a/source/libvpx/vp9/decoder/vp9_onyxd_int.h b/source/libvpx/vp9/decoder/vp9_onyxd_int.h
index 83ea967..e90f892 100644
--- a/source/libvpx/vp9/decoder/vp9_onyxd_int.h
+++ b/source/libvpx/vp9/decoder/vp9_onyxd_int.h
@@ -22,6 +22,8 @@ typedef struct VP9Decompressor {
DECLARE_ALIGNED(16, VP9_COMMON, common);
+ DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
+
VP9D_CONFIG oxcf;
const uint8_t *source;
diff --git a/source/libvpx/vp9/decoder/vp9_treereader.h b/source/libvpx/vp9/decoder/vp9_treereader.h
deleted file mode 100644
index f612497..0000000
--- a/source/libvpx/vp9/decoder/vp9_treereader.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_DECODER_VP9_TREEREADER_H_
-#define VP9_DECODER_VP9_TREEREADER_H_
-
-#include "vp9/common/vp9_treecoder.h"
-#include "vp9/decoder/vp9_dboolhuff.h"
-
-#define vp9_read_and_apply_sign(r, value) (vp9_read_bit(r) ? -(value) : (value))
-
-// Intent of tree data structure is to make decoding trivial.
-static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
- vp9_tree t,
- const vp9_prob *const p) {
- register vp9_tree_index i = 0;
-
- while ((i = t[ i + vp9_read(r, p[i >> 1])]) > 0)
- continue;
-
- return -i;
-}
-
-#endif // VP9_DECODER_VP9_TREEREADER_H_
diff --git a/source/libvpx/vp9/encoder/vp9_bitstream.c b/source/libvpx/vp9/encoder/vp9_bitstream.c
index a996e0e..1356ca5 100644
--- a/source/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/source/libvpx/vp9/encoder/vp9_bitstream.c
@@ -32,6 +32,7 @@
#include "vp9/encoder/vp9_bitstream.h"
#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/encoder/vp9_subexp.h"
+#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_write_bit_buffer.h"
@@ -43,113 +44,34 @@ unsigned __int64 Sectionbits[500];
int intra_mode_stats[INTRA_MODES]
[INTRA_MODES]
[INTRA_MODES];
-vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES];
+vp9_coeff_stats tree_update_hist[TX_SIZES][PLANE_TYPES];
extern unsigned int active_section;
#endif
+static struct vp9_token intra_mode_encodings[INTRA_MODES];
+static struct vp9_token switchable_interp_encodings[SWITCHABLE_FILTERS];
+static struct vp9_token partition_encodings[PARTITION_TYPES];
+static struct vp9_token inter_mode_encodings[INTER_MODES];
-#ifdef MODE_STATS
-int64_t tx_count_32x32p_stats[TX_SIZE_CONTEXTS][TX_SIZES];
-int64_t tx_count_16x16p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 1];
-int64_t tx_count_8x8p_stats[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-int64_t switchable_interp_stats[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
-
-void init_tx_count_stats() {
- vp9_zero(tx_count_32x32p_stats);
- vp9_zero(tx_count_16x16p_stats);
- vp9_zero(tx_count_8x8p_stats);
-}
-
-void init_switchable_interp_stats() {
- vp9_zero(switchable_interp_stats);
-}
-
-static void update_tx_count_stats(VP9_COMMON *cm) {
- int i, j;
- for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- for (j = 0; j < TX_SIZES; j++) {
- tx_count_32x32p_stats[i][j] += cm->fc.tx_count_32x32p[i][j];
- }
- }
- for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- for (j = 0; j < TX_SIZES - 1; j++) {
- tx_count_16x16p_stats[i][j] += cm->fc.tx_count_16x16p[i][j];
- }
- }
- for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- for (j = 0; j < TX_SIZES - 2; j++) {
- tx_count_8x8p_stats[i][j] += cm->fc.tx_count_8x8p[i][j];
- }
- }
+void vp9_entropy_mode_init() {
+ vp9_tokens_from_tree(intra_mode_encodings, vp9_intra_mode_tree);
+ vp9_tokens_from_tree(switchable_interp_encodings, vp9_switchable_interp_tree);
+ vp9_tokens_from_tree(partition_encodings, vp9_partition_tree);
+ vp9_tokens_from_tree(inter_mode_encodings, vp9_inter_mode_tree);
}
-static void update_switchable_interp_stats(VP9_COMMON *cm) {
- int i, j;
- for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- for (j = 0; j < SWITCHABLE_FILTERS; ++j)
- switchable_interp_stats[i][j] += cm->fc.switchable_interp_count[i][j];
+static void write_intra_mode(vp9_writer *w, MB_PREDICTION_MODE mode,
+ const vp9_prob *probs) {
+ vp9_write_token(w, vp9_intra_mode_tree, probs, &intra_mode_encodings[mode]);
}
-void write_tx_count_stats() {
- int i, j;
- FILE *fp = fopen("tx_count.bin", "wb");
- fwrite(tx_count_32x32p_stats, sizeof(tx_count_32x32p_stats), 1, fp);
- fwrite(tx_count_16x16p_stats, sizeof(tx_count_16x16p_stats), 1, fp);
- fwrite(tx_count_8x8p_stats, sizeof(tx_count_8x8p_stats), 1, fp);
- fclose(fp);
-
- printf(
- "vp9_default_tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZES] = {\n");
- for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- printf(" { ");
- for (j = 0; j < TX_SIZES; j++) {
- printf("%"PRId64", ", tx_count_32x32p_stats[i][j]);
- }
- printf("},\n");
- }
- printf("};\n");
- printf(
- "vp9_default_tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZES-1] = {\n");
- for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- printf(" { ");
- for (j = 0; j < TX_SIZES - 1; j++) {
- printf("%"PRId64", ", tx_count_16x16p_stats[i][j]);
- }
- printf("},\n");
- }
- printf("};\n");
- printf(
- "vp9_default_tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZES-2] = {\n");
- for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
- printf(" { ");
- for (j = 0; j < TX_SIZES - 2; j++) {
- printf("%"PRId64", ", tx_count_8x8p_stats[i][j]);
- }
- printf("},\n");
- }
- printf("};\n");
-}
-
-void write_switchable_interp_stats() {
- int i, j;
- FILE *fp = fopen("switchable_interp.bin", "wb");
- fwrite(switchable_interp_stats, sizeof(switchable_interp_stats), 1, fp);
- fclose(fp);
-
- printf(
- "vp9_default_switchable_filter_count[SWITCHABLE_FILTER_CONTEXTS]"
- "[SWITCHABLE_FILTERS] = {\n");
- for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
- printf(" { ");
- for (j = 0; j < SWITCHABLE_FILTERS; j++) {
- printf("%"PRId64", ", switchable_interp_stats[i][j]);
- }
- printf("},\n");
- }
- printf("};\n");
+static void write_inter_mode(vp9_writer *w, MB_PREDICTION_MODE mode,
+ const vp9_prob *probs) {
+ assert(is_inter_mode(mode));
+ vp9_write_token(w, vp9_inter_mode_tree, probs,
+ &inter_mode_encodings[INTER_OFFSET(mode)]);
}
-#endif
static INLINE void write_be32(uint8_t *p, int value) {
p[0] = value >> 24;
@@ -163,46 +85,32 @@ void vp9_encode_unsigned_max(struct vp9_write_bit_buffer *wb,
vp9_wb_write_literal(wb, data, get_unsigned_bits(max));
}
-static void update_mode(
- vp9_writer *w,
- int n,
- vp9_tree tree,
- vp9_prob Pnew[/* n-1 */],
- vp9_prob Pcur[/* n-1 */],
- unsigned int bct[/* n-1 */] [2],
- const unsigned int num_events[/* n */]
-) {
- int i = 0;
-
- vp9_tree_probs_from_distribution(tree, Pnew, bct, num_events, 0);
- n--;
-
- for (i = 0; i < n; ++i)
- vp9_cond_prob_diff_update(w, &Pcur[i], bct[i]);
-}
+static void prob_diff_update(const vp9_tree_index *tree,
+ vp9_prob probs[/*n - 1*/],
+ const unsigned int counts[/*n - 1*/],
+ int n, vp9_writer *w) {
+ int i;
+ unsigned int branch_ct[32][2];
-static void update_mbintra_mode_probs(VP9_COMP* const cpi,
- vp9_writer* const bc) {
- VP9_COMMON *const cm = &cpi->common;
- int j;
- vp9_prob pnew[INTRA_MODES - 1];
- unsigned int bct[INTRA_MODES - 1][2];
+ // Assuming max number of probabilities <= 32
+ assert(n <= 32);
- for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
- update_mode(bc, INTRA_MODES, vp9_intra_mode_tree, pnew,
- cm->fc.y_mode_prob[j], bct,
- (unsigned int *)cpi->y_mode_count[j]);
+ vp9_tree_probs_from_distribution(tree, branch_ct, counts);
+ for (i = 0; i < n - 1; ++i)
+ vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]);
}
static void write_selected_tx_size(const VP9_COMP *cpi, MODE_INFO *m,
TX_SIZE tx_size, BLOCK_SIZE bsize,
vp9_writer *w) {
+ const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
const MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cpi->common.fc.tx_probs, m);
+ const vp9_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
+ &cpi->common.fc.tx_probs);
vp9_write(w, tx_size != TX_4X4, tx_probs[0]);
- if (bsize >= BLOCK_16X16 && tx_size != TX_4X4) {
+ if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
vp9_write(w, tx_size != TX_8X8, tx_probs[1]);
- if (bsize >= BLOCK_32X32 && tx_size != TX_8X8)
+ if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
vp9_write(w, tx_size != TX_16X16, tx_probs[2]);
}
}
@@ -213,9 +121,9 @@ static int write_skip_coeff(const VP9_COMP *cpi, int segment_id, MODE_INFO *m,
if (vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
return 1;
} else {
- const int skip_coeff = m->mbmi.skip_coeff;
- vp9_write(w, skip_coeff, vp9_get_pred_prob_mbskip(&cpi->common, xd));
- return skip_coeff;
+ const int skip = m->mbmi.skip_coeff;
+ vp9_write(w, skip, vp9_get_skip_prob(&cpi->common, xd));
+ return skip;
}
}
@@ -227,73 +135,32 @@ void vp9_update_skip_probs(VP9_COMP *cpi, vp9_writer *w) {
vp9_cond_prob_diff_update(w, &cm->fc.mbskip_probs[k], cm->counts.mbskip[k]);
}
-static void write_intra_mode(vp9_writer *bc, int m, const vp9_prob *p) {
- write_token(bc, vp9_intra_mode_tree, p, vp9_intra_mode_encodings + m);
-}
-
-static void update_switchable_interp_probs(VP9_COMP *const cpi,
- vp9_writer* const bc) {
+static void update_switchable_interp_probs(VP9_COMP *cpi, vp9_writer *w) {
VP9_COMMON *const cm = &cpi->common;
- unsigned int branch_ct[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1][2];
- vp9_prob new_prob[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS - 1];
- int i, j;
- for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
- vp9_tree_probs_from_distribution(
- vp9_switchable_interp_tree,
- new_prob[j], branch_ct[j],
- cm->counts.switchable_interp[j], 0);
- }
- for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j) {
- for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i) {
- vp9_cond_prob_diff_update(bc, &cm->fc.switchable_interp_prob[j][i],
- branch_ct[j][i]);
- }
- }
+ int j;
+ for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
+ prob_diff_update(vp9_switchable_interp_tree,
+ cm->fc.switchable_interp_prob[j],
+ cm->counts.switchable_interp[j], SWITCHABLE_FILTERS, w);
+
#ifdef MODE_STATS
if (!cpi->dummy_packing)
update_switchable_interp_stats(cm);
#endif
}
-static void update_inter_mode_probs(VP9_COMMON *cm, vp9_writer* const bc) {
- int i, j;
-
- for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
- unsigned int branch_ct[INTER_MODES - 1][2];
- vp9_prob new_prob[INTER_MODES - 1];
-
- vp9_tree_probs_from_distribution(vp9_inter_mode_tree,
- new_prob, branch_ct,
- cm->counts.inter_mode[i], NEARESTMV);
-
- for (j = 0; j < INTER_MODES - 1; ++j)
- vp9_cond_prob_diff_update(bc, &cm->fc.inter_mode_probs[i][j],
- branch_ct[j]);
- }
-}
-
-static void pack_mb_tokens(vp9_writer* const bc,
+static void pack_mb_tokens(vp9_writer* const w,
TOKENEXTRA **tp,
const TOKENEXTRA *const stop) {
TOKENEXTRA *p = *tp;
while (p < stop && p->token != EOSB_TOKEN) {
const int t = p->token;
- const struct vp9_token *const a = vp9_coef_encodings + t;
- const vp9_extra_bit *const b = vp9_extra_bits + t;
+ const struct vp9_token *const a = &vp9_coef_encodings[t];
+ const vp9_extra_bit *const b = &vp9_extra_bits[t];
int i = 0;
- const vp9_prob *pp;
int v = a->value;
int n = a->len;
- vp9_prob probs[ENTROPY_NODES];
-
- if (t >= TWO_TOKEN) {
- vp9_model_to_full_probs(p->context_tree, probs);
- pp = probs;
- } else {
- pp = p->context_tree;
- }
- assert(pp != 0);
/* skip one or two nodes */
if (p->skip_eob_node) {
@@ -301,11 +168,24 @@ static void pack_mb_tokens(vp9_writer* const bc,
i = 2 * p->skip_eob_node;
}
- do {
- const int bb = (v >> --n) & 1;
- vp9_write(bc, bb, pp[i >> 1]);
- i = vp9_coef_tree[i + bb];
- } while (n);
+ // TODO(jbb): expanding this can lead to big gains. It allows
+ // much better branch prediction and would enable us to avoid numerous
+ // lookups and compares.
+
+ // If we have a token that's in the constrained set, the coefficient tree
+ // is split into two treed writes. The first treed write takes care of the
+ // unconstrained nodes. The second treed write takes care of the
+ // constrained nodes.
+ if (t >= TWO_TOKEN && t < EOB_TOKEN) {
+ int len = UNCONSTRAINED_NODES - p->skip_eob_node;
+ int bits = v >> (n - len);
+ vp9_write_tree(w, vp9_coef_tree, p->context_tree, bits, len, i);
+ vp9_write_tree(w, vp9_coef_con_tree,
+ vp9_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
+ v, n - len, 0);
+ } else {
+ vp9_write_tree(w, vp9_coef_tree, p->context_tree, v, n, i);
+ }
if (b->base_val) {
const int e = p->extra, l = b->len;
@@ -318,12 +198,12 @@ static void pack_mb_tokens(vp9_writer* const bc,
do {
const int bb = (v >> --n) & 1;
- vp9_write(bc, bb, pb[i >> 1]);
+ vp9_write(w, bb, pb[i >> 1]);
i = b->tree[i + bb];
} while (n);
}
- vp9_write_bit(bc, e & 1);
+ vp9_write_bit(w, e & 1);
}
++p;
}
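The two-stage token write above splits each coefficient tree code into an unconstrained prefix (model nodes) and a constrained suffix coded with Pareto-derived probabilities. A minimal sketch of one such call pair, assuming UNCONSTRAINED_NODES == 3 and skip_eob_node == 0 (illustrative values, not part of the diff):

    /* token whose tree code is n = 5 bits with value v = 22 (binary 10110) */
    int len  = 3;                 /* UNCONSTRAINED_NODES - p->skip_eob_node */
    int bits = 22 >> (5 - len);   /* == 5 (binary 101): the high 'len' bits */
    vp9_write_tree(w, vp9_coef_tree, p->context_tree, bits, len, 0);
    vp9_write_tree(w, vp9_coef_con_tree,
                   vp9_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
                   22, 5 - len, 0);   /* only the low 2 bits (10) are used */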
@@ -331,18 +211,10 @@ static void pack_mb_tokens(vp9_writer* const bc,
*tp = p + (p->token == EOSB_TOKEN);
}
-static void write_sb_mv_ref(vp9_writer *w, MB_PREDICTION_MODE mode,
- const vp9_prob *p) {
- assert(is_inter_mode(mode));
- write_token(w, vp9_inter_mode_tree, p,
- &vp9_inter_mode_encodings[inter_mode_offset(mode)]);
-}
-
-
static void write_segment_id(vp9_writer *w, const struct segmentation *seg,
int segment_id) {
if (seg->enabled && seg->update_map)
- treed_write(w, vp9_segment_tree, seg->tree_probs, segment_id, 3);
+ vp9_write_tree(w, vp9_segment_tree, seg->tree_probs, segment_id, 3, 0);
}
// This function encodes the reference frame
@@ -359,12 +231,12 @@ static void encode_ref_frame(VP9_COMP *cpi, vp9_writer *bc) {
if (!seg_ref_active) {
// does the feature use compound prediction or not
// (if not specified at the frame/segment level)
- if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
vp9_write(bc, mi->ref_frame[1] > INTRA_FRAME,
- vp9_get_pred_prob_comp_inter_inter(cm, xd));
+ vp9_get_reference_mode_prob(cm, xd));
} else {
assert((mi->ref_frame[1] <= INTRA_FRAME) ==
- (cm->comp_pred_mode == SINGLE_PREDICTION_ONLY));
+ (cm->reference_mode == SINGLE_REFERENCE));
}
if (mi->ref_frame[1] > INTRA_FRAME) {
@@ -420,8 +292,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
skip_coeff = write_skip_coeff(cpi, segment_id, m, bc);
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
- vp9_write(bc, rf != INTRA_FRAME,
- vp9_get_pred_prob_intra_inter(cm, xd));
+ vp9_write(bc, rf != INTRA_FRAME, vp9_get_intra_inter_prob(cm, xd));
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
!(rf != INTRA_FRAME &&
@@ -460,17 +331,16 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
// If segment skip is not enabled code the mode.
if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
if (bsize >= BLOCK_8X8) {
- write_sb_mv_ref(bc, mode, mv_ref_p);
- ++cm->counts.inter_mode[mi->mode_context[rf]]
- [inter_mode_offset(mode)];
+ write_inter_mode(bc, mode, mv_ref_p);
+ ++cm->counts.inter_mode[mi->mode_context[rf]][INTER_OFFSET(mode)];
}
}
if (cm->mcomp_filter_type == SWITCHABLE) {
const int ctx = vp9_get_pred_context_switchable_interp(xd);
- write_token(bc, vp9_switchable_interp_tree,
- cm->fc.switchable_interp_prob[ctx],
- &vp9_switchable_interp_encodings[mi->interp_filter]);
+ vp9_write_token(bc, vp9_switchable_interp_tree,
+ cm->fc.switchable_interp_prob[ctx],
+ &switchable_interp_encodings[mi->interp_filter]);
} else {
assert(mi->interp_filter == cm->mcomp_filter_type);
}
@@ -483,9 +353,9 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) {
for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
const int j = idy * 2 + idx;
const MB_PREDICTION_MODE blockmode = m->bmi[j].as_mode;
- write_sb_mv_ref(bc, blockmode, mv_ref_p);
+ write_inter_mode(bc, blockmode, mv_ref_p);
++cm->counts.inter_mode[mi->mode_context[rf]]
- [inter_mode_offset(blockmode)];
+ [INTER_OFFSET(blockmode)];
if (blockmode == NEWMV) {
#ifdef ENTROPY_STATS
@@ -559,119 +429,103 @@ static void write_mb_modes_kf(const VP9_COMP *cpi, MODE_INFO **mi_8x8,
}
static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile,
- MODE_INFO **mi_8x8, vp9_writer *bc,
- TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col, int index) {
+ vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+ int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
- MODE_INFO *m = mi_8x8[0];
+ MODE_INFO *m;
- if (m->mbmi.sb_type < BLOCK_8X8)
- if (index > 0)
- return;
-
- xd->mi_8x8 = mi_8x8;
+ xd->mi_8x8 = cm->mi_grid_visible + (mi_row * cm->mode_info_stride + mi_col);
+ m = xd->mi_8x8[0];
set_mi_row_col(xd, tile,
mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
cm->mi_rows, cm->mi_cols);
if (frame_is_intra_only(cm)) {
- write_mb_modes_kf(cpi, mi_8x8, bc);
+ write_mb_modes_kf(cpi, xd->mi_8x8, w);
#ifdef ENTROPY_STATS
active_section = 8;
#endif
} else {
- pack_inter_mode_mvs(cpi, m, bc);
+ pack_inter_mode_mvs(cpi, m, w);
#ifdef ENTROPY_STATS
active_section = 1;
#endif
}
assert(*tok < tok_end);
- pack_mb_tokens(bc, tok, tok_end);
+ pack_mb_tokens(w, tok, tok_end);
}
-static void write_partition(PARTITION_TYPE partition,
- int hbs, int mi_rows, int mi_cols,
- int mi_row, int mi_col,
- vp9_prob probs[PARTITION_TYPES - 1],
- vp9_writer *w) {
- const int has_rows = (mi_row + hbs) < mi_rows;
- const int has_cols = (mi_col + hbs) < mi_cols;
+static void write_partition(VP9_COMP *cpi, int hbs, int mi_row, int mi_col,
+ PARTITION_TYPE p, BLOCK_SIZE bsize, vp9_writer *w) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int ctx = partition_plane_context(cpi->above_seg_context,
+ cpi->left_seg_context,
+ mi_row, mi_col, bsize);
+ const vp9_prob *const probs = get_partition_probs(cm, ctx);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
if (has_rows && has_cols) {
- write_token(w, vp9_partition_tree, probs,
- &vp9_partition_encodings[partition]);
+ vp9_write_token(w, vp9_partition_tree, probs, &partition_encodings[p]);
} else if (!has_rows && has_cols) {
- assert(partition == PARTITION_SPLIT || partition == PARTITION_HORZ);
- vp9_write(w, partition == PARTITION_SPLIT, probs[1]);
+ assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+ vp9_write(w, p == PARTITION_SPLIT, probs[1]);
} else if (has_rows && !has_cols) {
- assert(partition == PARTITION_SPLIT || partition == PARTITION_VERT);
- vp9_write(w, partition == PARTITION_SPLIT, probs[2]);
+ assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+ vp9_write(w, p == PARTITION_SPLIT, probs[2]);
} else {
- assert(partition == PARTITION_SPLIT);
+ assert(p == PARTITION_SPLIT);
}
}
static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
- MODE_INFO **mi_8x8, vp9_writer *bc,
- TOKENEXTRA **tok, TOKENEXTRA *tok_end,
- int mi_row, int mi_col, BLOCK_SIZE bsize,
- int index) {
+ vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end,
+ int mi_row, int mi_col, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
- const int mis = cm->mode_info_stride;
- int bsl = b_width_log2(bsize);
- int bs = (1 << bsl) / 4; // mode_info step for subsize
- int n;
- PARTITION_TYPE partition = PARTITION_NONE;
+ const int bsl = b_width_log2(bsize);
+ const int bs = (1 << bsl) / 4;
+ PARTITION_TYPE partition;
BLOCK_SIZE subsize;
- MODE_INFO *m = mi_8x8[0];
+ MODE_INFO *m = cm->mi_grid_visible[mi_row * cm->mode_info_stride + mi_col];
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
partition = partition_lookup[bsl][m->mbmi.sb_type];
-
- if (bsize < BLOCK_8X8) {
- if (index > 0)
- return;
- } else {
- const int ctx = partition_plane_context(cpi->above_seg_context,
- cpi->left_seg_context,
- mi_row, mi_col, bsize);
- write_partition(partition, bs, cm->mi_rows, cm->mi_cols, mi_row, mi_col,
- cm->fc.partition_prob[cm->frame_type][ctx], bc);
- }
-
+ write_partition(cpi, bs, mi_row, mi_col, partition, bsize, w);
subsize = get_subsize(bsize, partition);
-
- switch (partition) {
- case PARTITION_NONE:
- write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
- break;
- case PARTITION_HORZ:
- write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
- if ((mi_row + bs) < cm->mi_rows)
- write_modes_b(cpi, tile, mi_8x8 + bs * mis, bc, tok, tok_end,
- mi_row + bs, mi_col, 1);
- break;
- case PARTITION_VERT:
- write_modes_b(cpi, tile, mi_8x8, bc, tok, tok_end, mi_row, mi_col, 0);
- if ((mi_col + bs) < cm->mi_cols)
- write_modes_b(cpi, tile, mi_8x8 + bs, bc, tok, tok_end,
- mi_row, mi_col + bs, 1);
- break;
- case PARTITION_SPLIT:
- for (n = 0; n < 4; n++) {
- const int j = n >> 1, i = n & 1;
- write_modes_sb(cpi, tile, mi_8x8 + j * bs * mis + i * bs, bc,
- tok, tok_end,
- mi_row + j * bs, mi_col + i * bs, subsize, n);
- }
- break;
- default:
- assert(0);
+ if (subsize < BLOCK_8X8) {
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ } else {
+ switch (partition) {
+ case PARTITION_NONE:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_row + bs < cm->mi_rows)
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
+ break;
+ case PARTITION_VERT:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_col + bs < cm->mi_cols)
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
+ break;
+ case PARTITION_SPLIT:
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
+ subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
+ subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
+ subsize);
+ break;
+ default:
+ assert(0);
+ }
}
// update partition context
@@ -682,61 +536,46 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile,
}
static void write_modes(VP9_COMP *cpi, const TileInfo *const tile,
- vp9_writer* const bc,
- TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
- VP9_COMMON *const cm = &cpi->common;
- const int mis = cm->mode_info_stride;
+ vp9_writer *w, TOKENEXTRA **tok, TOKENEXTRA *tok_end) {
int mi_row, mi_col;
- MODE_INFO **mi_8x8 = cm->mi_grid_visible;
- MODE_INFO **m_8x8;
-
- mi_8x8 += tile->mi_col_start + tile->mi_row_start * mis;
for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
- mi_row += 8, mi_8x8 += 8 * mis) {
- m_8x8 = mi_8x8;
- vp9_zero(cpi->left_seg_context);
+ mi_row += MI_BLOCK_SIZE) {
+ vp9_zero(cpi->left_seg_context);
for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
- mi_col += MI_BLOCK_SIZE, m_8x8 += MI_BLOCK_SIZE) {
- write_modes_sb(cpi, tile, m_8x8, bc, tok, tok_end, mi_row, mi_col,
- BLOCK_64X64, 0);
- }
+ mi_col += MI_BLOCK_SIZE)
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64);
}
}
static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
vp9_coeff_probs_model *coef_probs = cpi->frame_coef_probs[tx_size];
vp9_coeff_count *coef_counts = cpi->coef_counts[tx_size];
- unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] =
+ unsigned int (*eob_branch_ct)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
cpi->common.counts.eob_branch[tx_size];
vp9_coeff_stats *coef_branch_ct = cpi->frame_branch_ct[tx_size];
- vp9_prob full_probs[ENTROPY_NODES];
- int i, j, k, l;
+ int i, j, k, l, m;
- for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (i = 0; i < PLANE_TYPES; ++i) {
for (j = 0; j < REF_TYPES; ++j) {
for (k = 0; k < COEF_BANDS; ++k) {
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
- if (l >= 3 && k == 0)
- continue;
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
vp9_tree_probs_from_distribution(vp9_coef_tree,
- full_probs,
coef_branch_ct[i][j][k][l],
- coef_counts[i][j][k][l], 0);
- vpx_memcpy(coef_probs[i][j][k][l], full_probs,
- sizeof(vp9_prob) * UNCONSTRAINED_NODES);
+ coef_counts[i][j][k][l]);
coef_branch_ct[i][j][k][l][0][1] = eob_branch_ct[i][j][k][l] -
coef_branch_ct[i][j][k][l][0][0];
- coef_probs[i][j][k][l][0] =
- get_binary_prob(coef_branch_ct[i][j][k][l][0][0],
- coef_branch_ct[i][j][k][l][0][1]);
+ for (m = 0; m < UNCONSTRAINED_NODES; ++m)
+ coef_probs[i][j][k][l][m] = get_binary_prob(
+ coef_branch_ct[i][j][k][l][m][0],
+ coef_branch_ct[i][j][k][l][m][1]);
#ifdef ENTROPY_STATS
if (!cpi->dummy_packing) {
int t;
- for (t = 0; t < MAX_ENTROPY_TOKENS; ++t)
+ for (t = 0; t < ENTROPY_TOKENS; ++t)
context_counters[tx_size][i][j][k][l][t] +=
coef_counts[i][j][k][l][t];
- context_counters[tx_size][i][j][k][l][MAX_ENTROPY_TOKENS] +=
+ context_counters[tx_size][i][j][k][l][ENTROPY_TOKENS] +=
eob_branch_ct[i][j][k][l];
}
#endif
@@ -746,12 +585,6 @@ static void build_tree_distribution(VP9_COMP *cpi, TX_SIZE tx_size) {
}
}
-static void build_coeff_contexts(VP9_COMP *cpi) {
- TX_SIZE t;
- for (t = TX_4X4; t <= TX_32X32; t++)
- build_tree_distribution(cpi, t);
-}
-
static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
TX_SIZE tx_size) {
vp9_coeff_probs_model *new_frame_coef_probs = cpi->frame_coef_probs[tx_size];
@@ -766,18 +599,15 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
  /* dry run to see if there is any update at all needed */
int savings = 0;
int update[2] = {0, 0};
- for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (i = 0; i < PLANE_TYPES; ++i) {
for (j = 0; j < REF_TYPES; ++j) {
for (k = 0; k < COEF_BANDS; ++k) {
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
for (t = 0; t < entropy_nodes_update; ++t) {
vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
const vp9_prob oldp = old_frame_coef_probs[i][j][k][l][t];
int s;
int u = 0;
-
- if (l >= 3 && k == 0)
- continue;
if (t == PIVOT_NODE)
s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
@@ -805,10 +635,10 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
return;
}
vp9_write_bit(bc, 1);
- for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (i = 0; i < PLANE_TYPES; ++i) {
for (j = 0; j < REF_TYPES; ++j) {
for (k = 0; k < COEF_BANDS; ++k) {
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
// calc probs and branch cts for this frame only
for (t = 0; t < entropy_nodes_update; ++t) {
vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
@@ -816,8 +646,6 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
const vp9_prob upd = DIFF_UPDATE_PROB;
int s;
int u = 0;
- if (l >= 3 && k == 0)
- continue;
if (t == PIVOT_NODE)
s = vp9_prob_diff_update_savings_search_model(
frame_branch_ct[i][j][k][l][0],
@@ -849,25 +677,23 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
case 1:
case 2: {
const int prev_coef_contexts_to_update =
- (cpi->sf.use_fast_coef_updates == 2 ?
- PREV_COEF_CONTEXTS >> 1 : PREV_COEF_CONTEXTS);
+ cpi->sf.use_fast_coef_updates == 2 ? COEFF_CONTEXTS >> 1
+ : COEFF_CONTEXTS;
const int coef_band_to_update =
- (cpi->sf.use_fast_coef_updates == 2 ?
- COEF_BANDS >> 1 : COEF_BANDS);
+ cpi->sf.use_fast_coef_updates == 2 ? COEF_BANDS >> 1
+ : COEF_BANDS;
int updates = 0;
int noupdates_before_first = 0;
- for (i = 0; i < BLOCK_TYPES; ++i) {
+ for (i = 0; i < PLANE_TYPES; ++i) {
for (j = 0; j < REF_TYPES; ++j) {
for (k = 0; k < COEF_BANDS; ++k) {
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
// calc probs and branch cts for this frame only
for (t = 0; t < entropy_nodes_update; ++t) {
vp9_prob newp = new_frame_coef_probs[i][j][k][l][t];
vp9_prob *oldp = old_frame_coef_probs[i][j][k][l] + t;
int s;
int u = 0;
- if (l >= 3 && k == 0)
- continue;
if (l >= prev_coef_contexts_to_update ||
k >= coef_band_to_update) {
u = 0;
@@ -925,25 +751,17 @@ static void update_coef_probs_common(vp9_writer* const bc, VP9_COMP *cpi,
}
}
-static void update_coef_probs(VP9_COMP* const cpi, vp9_writer* const bc) {
+static void update_coef_probs(VP9_COMP* cpi, vp9_writer* w) {
const TX_MODE tx_mode = cpi->common.tx_mode;
-
+ const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ TX_SIZE tx_size;
vp9_clear_system_state();
- // Build the cofficient contexts based on counts collected in encode loop
- build_coeff_contexts(cpi);
+ for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size)
+ build_tree_distribution(cpi, tx_size);
- update_coef_probs_common(bc, cpi, TX_4X4);
-
- // do not do this if not even allowed
- if (tx_mode > ONLY_4X4)
- update_coef_probs_common(bc, cpi, TX_8X8);
-
- if (tx_mode > ALLOW_8X8)
- update_coef_probs_common(bc, cpi, TX_16X16);
-
- if (tx_mode > ALLOW_16X16)
- update_coef_probs_common(bc, cpi, TX_32X32);
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ update_coef_probs_common(w, cpi, tx_size);
}
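update_coef_probs() now derives the largest transform to update from tx_mode through tx_mode_to_biggest_tx_size[] instead of the removed if-chain. Judging from those removed branches, the table presumably maps:

    /* ONLY_4X4       -> TX_4X4
       ALLOW_8X8      -> TX_8X8
       ALLOW_16X16    -> TX_16X16
       ALLOW_32X32    -> TX_32X32
       TX_MODE_SELECT -> TX_32X32   (the table itself lives in common code) */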
static void encode_loopfilter(struct loopfilter *lf,
@@ -1237,7 +1055,7 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) {
for (tile_col = 0; tile_col < tile_cols; tile_col++) {
TileInfo tile;
- vp9_tile_init(&tile, cm, 0, tile_col);
+ vp9_tile_init(&tile, cm, tile_row, tile_col);
tok_end = tok[tile_row][tile_col] + cpi->tok_count[tile_row][tile_col];
if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
@@ -1285,11 +1103,11 @@ static void write_frame_size(VP9_COMP *cpi,
static void write_frame_size_with_refs(VP9_COMP *cpi,
struct vp9_write_bit_buffer *wb) {
VP9_COMMON *const cm = &cpi->common;
- int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
- cpi->alt_fb_idx};
+ int refs[REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
+ cpi->alt_fb_idx};
int i, found = 0;
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
+ for (i = 0; i < REFS_PER_FRAME; ++i) {
YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[cm->ref_frame_map[refs[i]]];
found = cm->width == cfg->y_crop_width &&
cm->height == cfg->y_crop_height;
@@ -1354,8 +1172,8 @@ static void write_uncompressed_header(VP9_COMP *cpi,
write_frame_size(cpi, wb);
} else {
- const int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
- cpi->alt_fb_idx};
+ const int refs[REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
+ cpi->alt_fb_idx};
if (!cm->show_frame)
vp9_wb_write_bit(wb, cm->intra_only);
@@ -1365,13 +1183,13 @@ static void write_uncompressed_header(VP9_COMP *cpi,
if (cm->intra_only) {
write_sync_code(wb);
- vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES);
+ vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
write_frame_size(cpi, wb);
} else {
int i;
- vp9_wb_write_literal(wb, get_refresh_mask(cpi), NUM_REF_FRAMES);
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i) {
- vp9_wb_write_literal(wb, refs[i], NUM_REF_FRAMES_LOG2);
+ vp9_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+ for (i = 0; i < REFS_PER_FRAME; ++i) {
+ vp9_wb_write_literal(wb, refs[i], REF_FRAMES_LOG2);
vp9_wb_write_bit(wb, cm->ref_frame_sign_bias[LAST_FRAME + i]);
}
@@ -1389,7 +1207,7 @@ static void write_uncompressed_header(VP9_COMP *cpi,
vp9_wb_write_bit(wb, cm->frame_parallel_decoding_mode);
}
- vp9_wb_write_literal(wb, cm->frame_context_idx, NUM_FRAME_CONTEXTS_LOG2);
+ vp9_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
encode_loopfilter(&cm->lf, wb);
encode_quantization(cm, wb);
@@ -1425,7 +1243,10 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
active_section = 1;
#endif
- update_inter_mode_probs(cm, &header_bc);
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ prob_diff_update(vp9_inter_mode_tree, cm->fc.inter_mode_probs[i],
+ cm->counts.inter_mode[i], INTER_MODES, &header_bc);
+
vp9_zero(cm->counts.inter_mode);
if (cm->mcomp_filter_type == SWITCHABLE)
@@ -1436,9 +1257,9 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
cpi->intra_inter_count[i]);
if (cm->allow_comp_inter_inter) {
- const int comp_pred_mode = cpi->common.comp_pred_mode;
- const int use_compound_pred = comp_pred_mode != SINGLE_PREDICTION_ONLY;
- const int use_hybrid_pred = comp_pred_mode == HYBRID_PREDICTION;
+ const int reference_mode = cpi->common.reference_mode;
+ const int use_compound_pred = reference_mode != SINGLE_REFERENCE;
+ const int use_hybrid_pred = reference_mode == REFERENCE_MODE_SELECT;
vp9_write_bit(&header_bc, use_compound_pred);
if (use_compound_pred) {
@@ -1450,7 +1271,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
}
}
- if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
for (i = 0; i < REF_CONTEXTS; i++) {
vp9_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
cpi->single_ref_count[i][0]);
@@ -1459,21 +1280,19 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
}
}
- if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY)
+ if (cm->reference_mode != SINGLE_REFERENCE)
for (i = 0; i < REF_CONTEXTS; i++)
vp9_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
cpi->comp_ref_count[i]);
- update_mbintra_mode_probs(cpi, &header_bc);
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ prob_diff_update(vp9_intra_mode_tree, cm->fc.y_mode_prob[i],
+ (unsigned int *)cpi->y_mode_count[i], INTRA_MODES,
+ &header_bc);
- for (i = 0; i < PARTITION_CONTEXTS; ++i) {
- vp9_prob pnew[PARTITION_TYPES - 1];
- unsigned int bct[PARTITION_TYPES - 1][2];
- update_mode(&header_bc, PARTITION_TYPES,
- vp9_partition_tree, pnew,
- fc->partition_prob[cm->frame_type][i], bct,
- (unsigned int *)cpi->partition_count[i]);
- }
+ for (i = 0; i < PARTITION_CONTEXTS; ++i)
+ prob_diff_update(vp9_partition_tree, fc->partition_prob[i],
+ cm->counts.partition[i], PARTITION_TYPES, &header_bc);
vp9_write_nmv_probs(cpi, cm->allow_high_precision_mv, &header_bc);
}
@@ -1484,7 +1303,7 @@ static size_t write_compressed_header(VP9_COMP *cpi, uint8_t *data) {
return header_bc.pos;
}
-void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) {
uint8_t *data = dest;
size_t first_part_size;
struct vp9_write_bit_buffer wb = {data, 0};
@@ -1516,53 +1335,3 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, unsigned long *size) {
*size = data - dest;
}
-#ifdef ENTROPY_STATS
-static void print_tree_update_for_type(FILE *f,
- vp9_coeff_stats *tree_update_hist,
- int block_types, const char *header) {
- int i, j, k, l, m;
-
- fprintf(f, "const vp9_coeff_prob %s = {\n", header);
- for (i = 0; i < block_types; i++) {
- fprintf(f, " { \n");
- for (j = 0; j < REF_TYPES; j++) {
- fprintf(f, " { \n");
- for (k = 0; k < COEF_BANDS; k++) {
- fprintf(f, " {\n");
- for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
- fprintf(f, " {");
- for (m = 0; m < ENTROPY_NODES; m++) {
- fprintf(f, "%3d, ",
- get_binary_prob(tree_update_hist[i][j][k][l][m][0],
- tree_update_hist[i][j][k][l][m][1]));
- }
- fprintf(f, "},\n");
- }
- fprintf(f, "},\n");
- }
- fprintf(f, " },\n");
- }
- fprintf(f, " },\n");
- }
- fprintf(f, "};\n");
-}
-
-void print_tree_update_probs() {
- FILE *f = fopen("coefupdprob.h", "w");
- fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
-
- print_tree_update_for_type(f, tree_update_hist[TX_4X4], BLOCK_TYPES,
- "vp9_coef_update_probs_4x4[BLOCK_TYPES]");
- print_tree_update_for_type(f, tree_update_hist[TX_8X8], BLOCK_TYPES,
- "vp9_coef_update_probs_8x8[BLOCK_TYPES]");
- print_tree_update_for_type(f, tree_update_hist[TX_16X16], BLOCK_TYPES,
- "vp9_coef_update_probs_16x16[BLOCK_TYPES]");
- print_tree_update_for_type(f, tree_update_hist[TX_32X32], BLOCK_TYPES,
- "vp9_coef_update_probs_32x32[BLOCK_TYPES]");
-
- fclose(f);
- f = fopen("treeupdate.bin", "wb");
- fwrite(tree_update_hist, sizeof(tree_update_hist), 1, f);
- fclose(f);
-}
-#endif
diff --git a/source/libvpx/vp9/encoder/vp9_block.h b/source/libvpx/vp9/encoder/vp9_block.h
index 583c6c8..737fad4 100644
--- a/source/libvpx/vp9/encoder/vp9_block.h
+++ b/source/libvpx/vp9/encoder/vp9_block.h
@@ -27,6 +27,18 @@ typedef struct {
typedef struct {
MODE_INFO mic;
uint8_t *zcoeff_blk;
+ int16_t *coeff[MAX_MB_PLANE][3];
+ int16_t *qcoeff[MAX_MB_PLANE][3];
+ int16_t *dqcoeff[MAX_MB_PLANE][3];
+ uint16_t *eobs[MAX_MB_PLANE][3];
+
+ // dual buffer pointers, 0: in use, 1: best in store
+ int16_t *coeff_pbuf[MAX_MB_PLANE][3];
+ int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+ int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+ uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
+
+ int is_coded;
int num_4x4_blk;
int skip;
int_mv best_ref_mv;
@@ -47,6 +59,7 @@ typedef struct {
// motion vector cache for adaptive motion search control in partition
// search loop
int_mv pred_mv[MAX_REF_FRAMES];
+ int pred_filter_type;
// Bit flag for each mode whether it has high error in comparison to others.
unsigned int modes_with_high_error;
@@ -57,7 +70,9 @@ typedef struct {
struct macroblock_plane {
DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
- DECLARE_ALIGNED(16, int16_t, coeff[64 * 64]);
+ int16_t *qcoeff;
+ int16_t *coeff;
+ uint16_t *eobs;
struct buf_2d src;
  // Quantizer settings
@@ -72,8 +87,8 @@ struct macroblock_plane {
/* The [2] dimension is for whether we skip the EOB node (i.e. if previous
* coefficient in this block was zero) or not. */
-typedef unsigned int vp9_coeff_cost[BLOCK_TYPES][REF_TYPES][COEF_BANDS][2]
- [PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
+typedef unsigned int vp9_coeff_cost[PLANE_TYPES][REF_TYPES][COEF_BANDS][2]
+ [COEFF_CONTEXTS][ENTROPY_TOKENS];
typedef struct macroblock MACROBLOCK;
struct macroblock {
@@ -81,6 +96,10 @@ struct macroblock {
MACROBLOCKD e_mbd;
int skip_block;
+ int select_txfm_size;
+ int skip_recode;
+ int skip_optimize;
+ int q_index;
search_site *ss;
int ss_count;
@@ -120,6 +139,11 @@ struct macroblock {
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+ unsigned char sb_index; // index of 32x32 block inside the 64x64 block
+ unsigned char mb_index; // index of 16x16 block inside the 32x32 block
+ unsigned char b_index; // index of 8x8 block inside the 16x16 block
+ unsigned char ab_index; // index of 4x4 block inside the 8x8 block
+
// These define limits to motion vector components to prevent them
// from extending outside the UMV borders
int mv_col_min;
@@ -136,7 +160,7 @@ struct macroblock {
// note that token_costs is the cost when eob node is skipped
vp9_coeff_cost token_costs[TX_SIZES];
- uint8_t token_cache[1024];
+ DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
int optimize;
@@ -179,35 +203,33 @@ struct macroblock {
// refactoring on organizing the temporary buffers, when recursive
// partition down to 4x4 block size is enabled.
static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE bsize) {
- MACROBLOCKD *const xd = &x->e_mbd;
-
switch (bsize) {
case BLOCK_64X64:
return &x->sb64_context;
case BLOCK_64X32:
- return &x->sb64x32_context[xd->sb_index];
+ return &x->sb64x32_context[x->sb_index];
case BLOCK_32X64:
- return &x->sb32x64_context[xd->sb_index];
+ return &x->sb32x64_context[x->sb_index];
case BLOCK_32X32:
- return &x->sb32_context[xd->sb_index];
+ return &x->sb32_context[x->sb_index];
case BLOCK_32X16:
- return &x->sb32x16_context[xd->sb_index][xd->mb_index];
+ return &x->sb32x16_context[x->sb_index][x->mb_index];
case BLOCK_16X32:
- return &x->sb16x32_context[xd->sb_index][xd->mb_index];
+ return &x->sb16x32_context[x->sb_index][x->mb_index];
case BLOCK_16X16:
- return &x->mb_context[xd->sb_index][xd->mb_index];
+ return &x->mb_context[x->sb_index][x->mb_index];
case BLOCK_16X8:
- return &x->sb16x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->sb16x8_context[x->sb_index][x->mb_index][x->b_index];
case BLOCK_8X16:
- return &x->sb8x16_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->sb8x16_context[x->sb_index][x->mb_index][x->b_index];
case BLOCK_8X8:
- return &x->sb8x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->sb8x8_context[x->sb_index][x->mb_index][x->b_index];
case BLOCK_8X4:
- return &x->sb8x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->sb8x4_context[x->sb_index][x->mb_index][x->b_index];
case BLOCK_4X8:
- return &x->sb4x8_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->sb4x8_context[x->sb_index][x->mb_index][x->b_index];
case BLOCK_4X4:
- return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->ab4x4_context[x->sb_index][x->mb_index][x->b_index];
default:
assert(0);
return NULL;
diff --git a/source/libvpx/vp9/encoder/vp9_boolhuff.h b/source/libvpx/vp9/encoder/vp9_boolhuff.h
index c3f340d..a0fff38 100644
--- a/source/libvpx/vp9/encoder/vp9_boolhuff.h
+++ b/source/libvpx/vp9/encoder/vp9_boolhuff.h
@@ -111,5 +111,6 @@ static void vp9_write_literal(vp9_writer *w, int data, int bits) {
vp9_write_bit(w, 1 & (data >> bit));
}
+#define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8)
#endif // VP9_ENCODER_VP9_BOOLHUFF_H_
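The new vp9_write_prob() macro is a thin alias for an 8-bit literal write; a hypothetical usage sketch (the writer variable is illustrative):

    vp9_writer w;
    /* ... writer initialised elsewhere ... */
    vp9_write_prob(&w, 128);   /* expands to vp9_write_literal(&w, 128, 8) */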
diff --git a/source/libvpx/vp9/encoder/vp9_dct.c b/source/libvpx/vp9/encoder/vp9_dct.c
index 065992a..0f4a6bb 100644
--- a/source/libvpx/vp9/encoder/vp9_dct.c
+++ b/source/libvpx/vp9/encoder/vp9_dct.c
@@ -20,6 +20,12 @@
#include "vp9/encoder/vp9_dct.h"
+static INLINE int fdct_round_shift(int input) {
+ int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ assert(INT16_MIN <= rv && rv <= INT16_MAX);
+ return rv;
+}
+
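The switch from dct_const_round_shift() to this forward-transform-specific helper is mechanical; assuming the usual libvpx definitions (DCT_CONST_BITS == 14 and ROUND_POWER_OF_TWO(v, n) == ((v) + (1 << ((n) - 1))) >> (n)), it performs the same rounding plus an explicit range check:

    /* int rv = (input + (1 << 13)) >> 14;          round to nearest
       assert(INT16_MIN <= rv && rv <= INT16_MAX);  must fit in int16_t */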
static void fdct4(const int16_t *input, int16_t *output) {
int16_t step[4];
int temp1, temp2;
@@ -31,12 +37,12 @@ static void fdct4(const int16_t *input, int16_t *output) {
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
- output[0] = dct_const_round_shift(temp1);
- output[2] = dct_const_round_shift(temp2);
+ output[0] = fdct_round_shift(temp1);
+ output[2] = fdct_round_shift(temp2);
temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
- output[1] = dct_const_round_shift(temp1);
- output[3] = dct_const_round_shift(temp2);
+ output[1] = fdct_round_shift(temp1);
+ output[3] = fdct_round_shift(temp2);
}
void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
@@ -80,12 +86,12 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
step[3] = input[0] - input[3];
temp1 = (step[0] + step[1]) * cospi_16_64;
temp2 = (step[0] - step[1]) * cospi_16_64;
- out[0] = dct_const_round_shift(temp1);
- out[2] = dct_const_round_shift(temp2);
+ out[0] = fdct_round_shift(temp1);
+ out[2] = fdct_round_shift(temp2);
temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
- out[1] = dct_const_round_shift(temp1);
- out[3] = dct_const_round_shift(temp2);
+ out[1] = fdct_round_shift(temp1);
+ out[3] = fdct_round_shift(temp2);
// Do next column (which is a transposed row in second/horizontal pass)
in++;
out += 4;
@@ -138,10 +144,10 @@ static void fadst4(const int16_t *input, int16_t *output) {
s3 = x2 - x0 + x3;
// 1-D transform scaling factor is sqrt(2).
- output[0] = dct_const_round_shift(s0);
- output[1] = dct_const_round_shift(s1);
- output[2] = dct_const_round_shift(s2);
- output[3] = dct_const_round_shift(s3);
+ output[0] = fdct_round_shift(s0);
+ output[1] = fdct_round_shift(s1);
+ output[2] = fdct_round_shift(s2);
+ output[3] = fdct_round_shift(s3);
}
static const transform_2d FHT_4[] = {
@@ -204,16 +210,16 @@ static void fdct8(const int16_t *input, int16_t *output) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- output[0] = dct_const_round_shift(t0);
- output[2] = dct_const_round_shift(t2);
- output[4] = dct_const_round_shift(t1);
- output[6] = dct_const_round_shift(t3);
+ output[0] = fdct_round_shift(t0);
+ output[2] = fdct_round_shift(t2);
+ output[4] = fdct_round_shift(t1);
+ output[6] = fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
- t2 = dct_const_round_shift(t0);
- t3 = dct_const_round_shift(t1);
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
@@ -226,10 +232,10 @@ static void fdct8(const int16_t *input, int16_t *output) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- output[1] = dct_const_round_shift(t0);
- output[3] = dct_const_round_shift(t2);
- output[5] = dct_const_round_shift(t1);
- output[7] = dct_const_round_shift(t3);
+ output[1] = fdct_round_shift(t0);
+ output[3] = fdct_round_shift(t2);
+ output[5] = fdct_round_shift(t1);
+ output[7] = fdct_round_shift(t3);
}
void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
@@ -264,16 +270,16 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- output[0 * 8] = dct_const_round_shift(t0);
- output[2 * 8] = dct_const_round_shift(t2);
- output[4 * 8] = dct_const_round_shift(t1);
- output[6 * 8] = dct_const_round_shift(t3);
+ output[0 * 8] = fdct_round_shift(t0);
+ output[2 * 8] = fdct_round_shift(t2);
+ output[4 * 8] = fdct_round_shift(t1);
+ output[6 * 8] = fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
- t2 = dct_const_round_shift(t0);
- t3 = dct_const_round_shift(t1);
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
@@ -286,10 +292,10 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- output[1 * 8] = dct_const_round_shift(t0);
- output[3 * 8] = dct_const_round_shift(t2);
- output[5 * 8] = dct_const_round_shift(t1);
- output[7 * 8] = dct_const_round_shift(t3);
+ output[1 * 8] = fdct_round_shift(t0);
+ output[3 * 8] = fdct_round_shift(t2);
+ output[5 * 8] = fdct_round_shift(t1);
+ output[7 * 8] = fdct_round_shift(t3);
input++;
output++;
}
@@ -388,16 +394,16 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
- out[0] = dct_const_round_shift(t0);
- out[4] = dct_const_round_shift(t2);
- out[8] = dct_const_round_shift(t1);
- out[12] = dct_const_round_shift(t3);
+ out[0] = fdct_round_shift(t0);
+ out[4] = fdct_round_shift(t2);
+ out[8] = fdct_round_shift(t1);
+ out[12] = fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
- t2 = dct_const_round_shift(t0);
- t3 = dct_const_round_shift(t1);
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
@@ -410,22 +416,22 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- out[2] = dct_const_round_shift(t0);
- out[6] = dct_const_round_shift(t2);
- out[10] = dct_const_round_shift(t1);
- out[14] = dct_const_round_shift(t3);
+ out[2] = fdct_round_shift(t0);
+ out[6] = fdct_round_shift(t2);
+ out[10] = fdct_round_shift(t1);
+ out[14] = fdct_round_shift(t3);
}
// Work on the next eight values; step1 -> odd_results
{
// step 2
temp1 = (step1[5] - step1[2]) * cospi_16_64;
temp2 = (step1[4] - step1[3]) * cospi_16_64;
- step2[2] = dct_const_round_shift(temp1);
- step2[3] = dct_const_round_shift(temp2);
+ step2[2] = fdct_round_shift(temp1);
+ step2[3] = fdct_round_shift(temp2);
temp1 = (step1[4] + step1[3]) * cospi_16_64;
temp2 = (step1[5] + step1[2]) * cospi_16_64;
- step2[4] = dct_const_round_shift(temp1);
- step2[5] = dct_const_round_shift(temp2);
+ step2[4] = fdct_round_shift(temp1);
+ step2[5] = fdct_round_shift(temp2);
// step 3
step3[0] = step1[0] + step2[3];
step3[1] = step1[1] + step2[2];
@@ -438,12 +444,12 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
// step 4
temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
- step2[1] = dct_const_round_shift(temp1);
- step2[2] = dct_const_round_shift(temp2);
+ step2[1] = fdct_round_shift(temp1);
+ step2[2] = fdct_round_shift(temp2);
temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
- step2[5] = dct_const_round_shift(temp1);
- step2[6] = dct_const_round_shift(temp2);
+ step2[5] = fdct_round_shift(temp1);
+ step2[6] = fdct_round_shift(temp2);
// step 5
step1[0] = step3[0] + step2[1];
step1[1] = step3[0] - step2[1];
@@ -456,20 +462,20 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
// step 6
temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- out[1] = dct_const_round_shift(temp1);
- out[9] = dct_const_round_shift(temp2);
+ out[1] = fdct_round_shift(temp1);
+ out[9] = fdct_round_shift(temp2);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- out[5] = dct_const_round_shift(temp1);
- out[13] = dct_const_round_shift(temp2);
+ out[5] = fdct_round_shift(temp1);
+ out[13] = fdct_round_shift(temp2);
temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- out[3] = dct_const_round_shift(temp1);
- out[11] = dct_const_round_shift(temp2);
+ out[3] = fdct_round_shift(temp1);
+ out[11] = fdct_round_shift(temp2);
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- out[7] = dct_const_round_shift(temp1);
- out[15] = dct_const_round_shift(temp2);
+ out[7] = fdct_round_shift(temp1);
+ out[15] = fdct_round_shift(temp2);
}
// Do next column (which is a transposed row in second/horizontal pass)
in++;
@@ -503,14 +509,14 @@ static void fadst8(const int16_t *input, int16_t *output) {
s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- x0 = dct_const_round_shift(s0 + s4);
- x1 = dct_const_round_shift(s1 + s5);
- x2 = dct_const_round_shift(s2 + s6);
- x3 = dct_const_round_shift(s3 + s7);
- x4 = dct_const_round_shift(s0 - s4);
- x5 = dct_const_round_shift(s1 - s5);
- x6 = dct_const_round_shift(s2 - s6);
- x7 = dct_const_round_shift(s3 - s7);
+ x0 = fdct_round_shift(s0 + s4);
+ x1 = fdct_round_shift(s1 + s5);
+ x2 = fdct_round_shift(s2 + s6);
+ x3 = fdct_round_shift(s3 + s7);
+ x4 = fdct_round_shift(s0 - s4);
+ x5 = fdct_round_shift(s1 - s5);
+ x6 = fdct_round_shift(s2 - s6);
+ x7 = fdct_round_shift(s3 - s7);
// stage 2
s0 = x0;
@@ -526,10 +532,10 @@ static void fadst8(const int16_t *input, int16_t *output) {
x1 = s1 + s3;
x2 = s0 - s2;
x3 = s1 - s3;
- x4 = dct_const_round_shift(s4 + s6);
- x5 = dct_const_round_shift(s5 + s7);
- x6 = dct_const_round_shift(s4 - s6);
- x7 = dct_const_round_shift(s5 - s7);
+ x4 = fdct_round_shift(s4 + s6);
+ x5 = fdct_round_shift(s5 + s7);
+ x6 = fdct_round_shift(s4 - s6);
+ x7 = fdct_round_shift(s5 - s7);
// stage 3
s2 = cospi_16_64 * (x2 + x3);
@@ -537,10 +543,10 @@ static void fadst8(const int16_t *input, int16_t *output) {
s6 = cospi_16_64 * (x6 + x7);
s7 = cospi_16_64 * (x6 - x7);
- x2 = dct_const_round_shift(s2);
- x3 = dct_const_round_shift(s3);
- x6 = dct_const_round_shift(s6);
- x7 = dct_const_round_shift(s7);
+ x2 = fdct_round_shift(s2);
+ x3 = fdct_round_shift(s3);
+ x6 = fdct_round_shift(s6);
+ x7 = fdct_round_shift(s7);
output[0] = x0;
output[1] = - x4;
@@ -693,16 +699,16 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
t1 = (x0 - x1) * cospi_16_64;
t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
- out[0] = dct_const_round_shift(t0);
- out[4] = dct_const_round_shift(t2);
- out[8] = dct_const_round_shift(t1);
- out[12] = dct_const_round_shift(t3);
+ out[0] = fdct_round_shift(t0);
+ out[4] = fdct_round_shift(t2);
+ out[8] = fdct_round_shift(t1);
+ out[12] = fdct_round_shift(t3);
// Stage 2
t0 = (s6 - s5) * cospi_16_64;
t1 = (s6 + s5) * cospi_16_64;
- t2 = dct_const_round_shift(t0);
- t3 = dct_const_round_shift(t1);
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
// Stage 3
x0 = s4 + t2;
@@ -715,21 +721,21 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- out[2] = dct_const_round_shift(t0);
- out[6] = dct_const_round_shift(t2);
- out[10] = dct_const_round_shift(t1);
- out[14] = dct_const_round_shift(t3);
+ out[2] = fdct_round_shift(t0);
+ out[6] = fdct_round_shift(t2);
+ out[10] = fdct_round_shift(t1);
+ out[14] = fdct_round_shift(t3);
}
// step 2
temp1 = (step1[5] - step1[2]) * cospi_16_64;
temp2 = (step1[4] - step1[3]) * cospi_16_64;
- step2[2] = dct_const_round_shift(temp1);
- step2[3] = dct_const_round_shift(temp2);
+ step2[2] = fdct_round_shift(temp1);
+ step2[3] = fdct_round_shift(temp2);
temp1 = (step1[4] + step1[3]) * cospi_16_64;
temp2 = (step1[5] + step1[2]) * cospi_16_64;
- step2[4] = dct_const_round_shift(temp1);
- step2[5] = dct_const_round_shift(temp2);
+ step2[4] = fdct_round_shift(temp1);
+ step2[5] = fdct_round_shift(temp2);
// step 3
step3[0] = step1[0] + step2[3];
@@ -744,12 +750,12 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
// step 4
temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
- step2[1] = dct_const_round_shift(temp1);
- step2[2] = dct_const_round_shift(temp2);
+ step2[1] = fdct_round_shift(temp1);
+ step2[2] = fdct_round_shift(temp2);
temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
- step2[5] = dct_const_round_shift(temp1);
- step2[6] = dct_const_round_shift(temp2);
+ step2[5] = fdct_round_shift(temp1);
+ step2[6] = fdct_round_shift(temp2);
// step 5
step1[0] = step3[0] + step2[1];
@@ -764,23 +770,23 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
// step 6
temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- out[1] = dct_const_round_shift(temp1);
- out[9] = dct_const_round_shift(temp2);
+ out[1] = fdct_round_shift(temp1);
+ out[9] = fdct_round_shift(temp2);
temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- out[5] = dct_const_round_shift(temp1);
- out[13] = dct_const_round_shift(temp2);
+ out[5] = fdct_round_shift(temp1);
+ out[13] = fdct_round_shift(temp2);
temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- out[3] = dct_const_round_shift(temp1);
- out[11] = dct_const_round_shift(temp2);
+ out[3] = fdct_round_shift(temp1);
+ out[11] = fdct_round_shift(temp2);
temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- out[7] = dct_const_round_shift(temp1);
- out[15] = dct_const_round_shift(temp2);
+ out[7] = fdct_round_shift(temp1);
+ out[15] = fdct_round_shift(temp2);
}
static void fadst16(const int16_t *input, int16_t *output) {
@@ -821,22 +827,22 @@ static void fadst16(const int16_t *input, int16_t *output) {
s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- x0 = dct_const_round_shift(s0 + s8);
- x1 = dct_const_round_shift(s1 + s9);
- x2 = dct_const_round_shift(s2 + s10);
- x3 = dct_const_round_shift(s3 + s11);
- x4 = dct_const_round_shift(s4 + s12);
- x5 = dct_const_round_shift(s5 + s13);
- x6 = dct_const_round_shift(s6 + s14);
- x7 = dct_const_round_shift(s7 + s15);
- x8 = dct_const_round_shift(s0 - s8);
- x9 = dct_const_round_shift(s1 - s9);
- x10 = dct_const_round_shift(s2 - s10);
- x11 = dct_const_round_shift(s3 - s11);
- x12 = dct_const_round_shift(s4 - s12);
- x13 = dct_const_round_shift(s5 - s13);
- x14 = dct_const_round_shift(s6 - s14);
- x15 = dct_const_round_shift(s7 - s15);
+ x0 = fdct_round_shift(s0 + s8);
+ x1 = fdct_round_shift(s1 + s9);
+ x2 = fdct_round_shift(s2 + s10);
+ x3 = fdct_round_shift(s3 + s11);
+ x4 = fdct_round_shift(s4 + s12);
+ x5 = fdct_round_shift(s5 + s13);
+ x6 = fdct_round_shift(s6 + s14);
+ x7 = fdct_round_shift(s7 + s15);
+ x8 = fdct_round_shift(s0 - s8);
+ x9 = fdct_round_shift(s1 - s9);
+ x10 = fdct_round_shift(s2 - s10);
+ x11 = fdct_round_shift(s3 - s11);
+ x12 = fdct_round_shift(s4 - s12);
+ x13 = fdct_round_shift(s5 - s13);
+ x14 = fdct_round_shift(s6 - s14);
+ x15 = fdct_round_shift(s7 - s15);
// stage 2
s0 = x0;
@@ -864,14 +870,14 @@ static void fadst16(const int16_t *input, int16_t *output) {
x5 = s1 - s5;
x6 = s2 - s6;
x7 = s3 - s7;
- x8 = dct_const_round_shift(s8 + s12);
- x9 = dct_const_round_shift(s9 + s13);
- x10 = dct_const_round_shift(s10 + s14);
- x11 = dct_const_round_shift(s11 + s15);
- x12 = dct_const_round_shift(s8 - s12);
- x13 = dct_const_round_shift(s9 - s13);
- x14 = dct_const_round_shift(s10 - s14);
- x15 = dct_const_round_shift(s11 - s15);
+ x8 = fdct_round_shift(s8 + s12);
+ x9 = fdct_round_shift(s9 + s13);
+ x10 = fdct_round_shift(s10 + s14);
+ x11 = fdct_round_shift(s11 + s15);
+ x12 = fdct_round_shift(s8 - s12);
+ x13 = fdct_round_shift(s9 - s13);
+ x14 = fdct_round_shift(s10 - s14);
+ x15 = fdct_round_shift(s11 - s15);
// stage 3
s0 = x0;
@@ -895,18 +901,18 @@ static void fadst16(const int16_t *input, int16_t *output) {
x1 = s1 + s3;
x2 = s0 - s2;
x3 = s1 - s3;
- x4 = dct_const_round_shift(s4 + s6);
- x5 = dct_const_round_shift(s5 + s7);
- x6 = dct_const_round_shift(s4 - s6);
- x7 = dct_const_round_shift(s5 - s7);
+ x4 = fdct_round_shift(s4 + s6);
+ x5 = fdct_round_shift(s5 + s7);
+ x6 = fdct_round_shift(s4 - s6);
+ x7 = fdct_round_shift(s5 - s7);
x8 = s8 + s10;
x9 = s9 + s11;
x10 = s8 - s10;
x11 = s9 - s11;
- x12 = dct_const_round_shift(s12 + s14);
- x13 = dct_const_round_shift(s13 + s15);
- x14 = dct_const_round_shift(s12 - s14);
- x15 = dct_const_round_shift(s13 - s15);
+ x12 = fdct_round_shift(s12 + s14);
+ x13 = fdct_round_shift(s13 + s15);
+ x14 = fdct_round_shift(s12 - s14);
+ x15 = fdct_round_shift(s13 - s15);
// stage 4
s2 = (- cospi_16_64) * (x2 + x3);
@@ -918,14 +924,14 @@ static void fadst16(const int16_t *input, int16_t *output) {
s14 = (- cospi_16_64) * (x14 + x15);
s15 = cospi_16_64 * (x14 - x15);
- x2 = dct_const_round_shift(s2);
- x3 = dct_const_round_shift(s3);
- x6 = dct_const_round_shift(s6);
- x7 = dct_const_round_shift(s7);
- x10 = dct_const_round_shift(s10);
- x11 = dct_const_round_shift(s11);
- x14 = dct_const_round_shift(s14);
- x15 = dct_const_round_shift(s15);
+ x2 = fdct_round_shift(s2);
+ x3 = fdct_round_shift(s3);
+ x6 = fdct_round_shift(s6);
+ x7 = fdct_round_shift(s7);
+ x10 = fdct_round_shift(s10);
+ x11 = fdct_round_shift(s11);
+ x14 = fdct_round_shift(s14);
+ x15 = fdct_round_shift(s15);
output[0] = x0;
output[1] = - x8;
diff --git a/source/libvpx/vp9/encoder/vp9_encodeframe.c b/source/libvpx/vp9/encoder/vp9_encodeframe.c
index 44ade18..5f9d0c9 100644
--- a/source/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/source/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -20,7 +20,6 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_extend.h"
#include "vp9/common/vp9_findnearmv.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_mvref_common.h"
@@ -31,9 +30,9 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_tile_common.h"
#include "vp9/encoder/vp9_encodeframe.h"
-#include "vp9/encoder/vp9_encodeintra.h"
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_extend.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_segmentation.h"
@@ -50,25 +49,25 @@
int enc_debug = 0;
#endif
-static INLINE uint8_t *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE subsize) {
+static INLINE uint8_t *get_sb_index(MACROBLOCK *x, BLOCK_SIZE subsize) {
switch (subsize) {
case BLOCK_64X64:
case BLOCK_64X32:
case BLOCK_32X64:
case BLOCK_32X32:
- return &xd->sb_index;
+ return &x->sb_index;
case BLOCK_32X16:
case BLOCK_16X32:
case BLOCK_16X16:
- return &xd->mb_index;
+ return &x->mb_index;
case BLOCK_16X8:
case BLOCK_8X16:
case BLOCK_8X8:
- return &xd->b_index;
+ return &x->b_index;
case BLOCK_8X4:
case BLOCK_4X8:
case BLOCK_4X4:
- return &xd->ab_index;
+ return &x->ab_index;
default:
assert(0);
return NULL;
@@ -361,12 +360,60 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
adjust_act_zbin(cpi, x);
}
+// Select a segment for the current SB64
+static void select_in_frame_q_segment(VP9_COMP *cpi,
+ int mi_row, int mi_col,
+ int output_enabled, int projected_rate) {
+ VP9_COMMON * const cm = &cpi->common;
+ int target_rate = cpi->rc.sb64_target_rate << 8; // convert to bits << 8
+
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = 1 << mi_width_log2(BLOCK_64X64);
+ const int bh = 1 << mi_height_log2(BLOCK_64X64);
+ const int xmis = MIN(cm->mi_cols - mi_col, bw);
+ const int ymis = MIN(cm->mi_rows - mi_row, bh);
+ int complexity_metric = 64;
+ int x, y;
+
+ unsigned char segment;
+
+ if (!output_enabled) {
+ segment = 0;
+ } else {
+ // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
+ // It is converted to bits * 256 units
+ target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh);
+
+ if (projected_rate < (target_rate / 4)) {
+ segment = 2;
+ } else if (projected_rate < (target_rate / 2)) {
+ segment = 1;
+ } else {
+ segment = 0;
+ }
+
+ complexity_metric =
+ clamp((int)((projected_rate * 64) / target_rate), 16, 255);
+ }
+
+  // Fill in the entries in the segment map corresponding to this SB64
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+ cpi->complexity_map[mi_offset + y * cm->mi_cols + x] =
+ (unsigned char)complexity_metric;
+ }
+ }
+}
+
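select_in_frame_q_segment() buckets each SB64 into one of three segments based on its projected rate. A worked example with made-up numbers: when the SB64 lies fully inside the frame (xmis == bw, ymis == bh) the target collapses to sb64_target_rate << 8; if that target is 1024 and projected_rate is 200, then 200 < 1024 / 4 selects segment 2, and complexity_metric = clamp(200 * 64 / 1024, 16, 255) = 16.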
static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
BLOCK_SIZE bsize, int output_enabled) {
int i, x_idx, y;
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
MODE_INFO *mi = &ctx->mic;
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
MODE_INFO *mi_addr = xd->mi_8x8[0];
@@ -375,23 +422,46 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
const int mis = cm->mode_info_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ int max_plane;
assert(mi->mbmi.mode < MB_MODE_COUNT);
assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
assert(mi->mbmi.ref_frame[1] < MAX_REF_FRAMES);
assert(mi->mbmi.sb_type == bsize);
+  // For in-frame adaptive Q, copy over the chosen segment id into the
+  // mode info context for the chosen mode / partition.
+ if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && output_enabled)
+ mi->mbmi.segment_id = xd->mi_8x8[0]->mbmi.segment_id;
+
*mi_addr = *mi;
+ max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
+ for (i = 0; i < max_plane; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][1];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+ p[i].eobs = ctx->eobs_pbuf[i][1];
+ }
+
+ for (i = max_plane; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][2];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+ p[i].eobs = ctx->eobs_pbuf[i][2];
+ }
+
  // Restore the coding context of the MB to that which was in place
// when the mode was picked for it
for (y = 0; y < mi_height; y++)
for (x_idx = 0; x_idx < mi_width; x_idx++)
if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
- && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y)
+ && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
xd->mi_8x8[x_idx + y * mis] = mi_addr;
+ }
- if (cpi->sf.variance_adaptive_quantization) {
+ if ((cpi->oxcf.aq_mode == VARIANCE_AQ) ||
+ (cpi->oxcf.aq_mode == COMPLEXITY_AQ)) {
vp9_mb_init_quantizer(cpi, x);
}
@@ -461,9 +531,9 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
++cm->counts.switchable_interp[ctx][mbmi->interp_filter];
}
- cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
- cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
- cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
+ cpi->rd_comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ cpi->rd_comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ cpi->rd_comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
cpi->rd_filter_diff[i] += ctx->best_filter_diff[i];
@@ -540,7 +610,7 @@ static void set_offsets(VP9_COMP *cpi, const TileInfo *const tile,
/* segment ID */
if (seg->enabled) {
- if (!cpi->sf.variance_adaptive_quantization) {
+ if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
uint8_t *map = seg->update_map ? cpi->segmentation_map
: cm->last_frame_seg_map;
mbmi->segment_id = vp9_get_segment_id(cm, map, bsize, mi_row, mi_col);
@@ -578,6 +648,9 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ int i;
int orig_rdmult = x->rdmult;
double rdmult_ratio;
@@ -590,7 +663,7 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
- if (xd->ab_index != 0) {
+ if (x->ab_index != 0) {
*totalrate = 0;
*totaldist = 0;
return;
@@ -600,12 +673,21 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
set_offsets(cpi, tile, mi_row, mi_col, bsize);
xd->mi_8x8[0]->mbmi.sb_type = bsize;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][0];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][0];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][0];
+ p[i].eobs = ctx->eobs_pbuf[i][0];
+ }
+ ctx->is_coded = 0;
+ x->skip_recode = 0;
+
// Set to zero to make sure we do not use the previous encoded frame stats
xd->mi_8x8[0]->mbmi.skip_coeff = 0;
x->source_variance = get_sby_perpixel_variance(cpi, x, bsize);
- if (cpi->sf.variance_adaptive_quantization) {
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
int energy;
if (bsize <= BLOCK_16X16) {
energy = x->mb_energy;
@@ -621,9 +703,17 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
vp9_activity_masking(cpi, x);
- if (cpi->sf.variance_adaptive_quantization) {
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state(); // __asm emms;
x->rdmult = round(x->rdmult * rdmult_ratio);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ unsigned char complexity = cpi->complexity_map[mi_offset];
+ const int is_edge = (mi_row == 0) || (mi_row == (cm->mi_rows - 1)) ||
+ (mi_col == 0) || (mi_col == (cm->mi_cols - 1));
+
+ if (!is_edge && (complexity > 128))
+ x->rdmult = x->rdmult + ((x->rdmult * (complexity - 128)) / 256);
}
// Find best coding mode & reconstruct the MB so it is available
@@ -640,7 +730,7 @@ static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile,
totaldist, bsize, ctx, best_rd);
}
- if (cpi->sf.variance_adaptive_quantization) {
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
x->rdmult = orig_rdmult;
if (*totalrate != INT_MAX) {
vp9_clear_system_state(); // __asm emms;
@@ -661,15 +751,15 @@ static void update_stats(VP9_COMP *cpi) {
SEG_LVL_REF_FRAME);
if (!seg_ref_active)
- cpi->intra_inter_count[vp9_get_pred_context_intra_inter(xd)]
+ cpi->intra_inter_count[vp9_get_intra_inter_context(xd)]
[is_inter_block(mbmi)]++;
// If the segment reference feature is enabled we have only a single
// reference frame allowed for the segment so exclude it from
// the reference frame counts used to work out probabilities.
if (is_inter_block(mbmi) && !seg_ref_active) {
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
- cpi->comp_inter_count[vp9_get_pred_context_comp_inter_inter(cm, xd)]
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
+ cpi->comp_inter_count[vp9_get_reference_mode_context(cm, xd)]
[has_second_ref(mbmi)]++;
if (has_second_ref(mbmi)) {
@@ -683,24 +773,19 @@ static void update_stats(VP9_COMP *cpi) {
[mbmi->ref_frame[0] != GOLDEN_FRAME]++;
}
}
-
- // Count of last ref frame 0,0 usage
- if (mbmi->mode == ZEROMV && mbmi->ref_frame[0] == LAST_FRAME)
- cpi->inter_zz_count++;
}
}
static BLOCK_SIZE *get_sb_partitioning(MACROBLOCK *x, BLOCK_SIZE bsize) {
- MACROBLOCKD *const xd = &x->e_mbd;
switch (bsize) {
case BLOCK_64X64:
return &x->sb64_partitioning;
case BLOCK_32X32:
- return &x->sb_partitioning[xd->sb_index];
+ return &x->sb_partitioning[x->sb_index];
case BLOCK_16X16:
- return &x->mb_partitioning[xd->sb_index][xd->mb_index];
+ return &x->mb_partitioning[x->sb_index][x->mb_index];
case BLOCK_8X8:
- return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
+ return &x->b_partitioning[x->sb_index][x->mb_index][x->b_index];
default:
assert(0);
return NULL;
@@ -773,20 +858,19 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize, int sub_index) {
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD * const xd = &x->e_mbd;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
if (sub_index != -1)
- *get_sb_index(xd, bsize) = sub_index;
+ *get_sb_index(x, bsize) = sub_index;
if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
- if (xd->ab_index > 0)
+ if (x->ab_index > 0)
return;
}
set_offsets(cpi, tile, mi_row, mi_col, bsize);
@@ -804,9 +888,8 @@ static void encode_b(VP9_COMP *cpi, const TileInfo *const tile,
static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row, int mi_col,
int output_enabled, BLOCK_SIZE bsize) {
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD * const xd = &x->e_mbd;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
BLOCK_SIZE c1 = BLOCK_8X8;
const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
int pl = 0;
@@ -828,18 +911,18 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
switch (partition) {
case PARTITION_NONE:
if (output_enabled && bsize >= BLOCK_8X8)
- cpi->partition_count[pl][PARTITION_NONE]++;
+ cm->counts.partition[pl][PARTITION_NONE]++;
encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, -1);
break;
case PARTITION_VERT:
if (output_enabled)
- cpi->partition_count[pl][PARTITION_VERT]++;
+ cm->counts.partition[pl][PARTITION_VERT]++;
encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0);
encode_b(cpi, tile, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
break;
case PARTITION_HORZ:
if (output_enabled)
- cpi->partition_count[pl][PARTITION_HORZ]++;
+ cm->counts.partition[pl][PARTITION_HORZ]++;
encode_b(cpi, tile, tp, mi_row, mi_col, output_enabled, c1, 0);
encode_b(cpi, tile, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
break;
@@ -847,12 +930,12 @@ static void encode_sb(VP9_COMP *cpi, const TileInfo *const tile,
subsize = get_subsize(bsize, PARTITION_SPLIT);
if (output_enabled)
- cpi->partition_count[pl][PARTITION_SPLIT]++;
+ cm->counts.partition[pl][PARTITION_SPLIT]++;
for (i = 0; i < 4; i++) {
const int x_idx = i & 1, y_idx = i >> 1;
- *get_sb_index(xd, subsize) = i;
+ *get_sb_index(x, subsize) = i;
encode_sb(cpi, tile, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
output_enabled, subsize);
}
@@ -979,9 +1062,8 @@ static void rd_use_partition(VP9_COMP *cpi,
TOKENEXTRA **tp, int mi_row, int mi_col,
BLOCK_SIZE bsize, int *rate, int64_t *dist,
int do_recon) {
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD *xd = &cpi->mb.e_mbd;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
const int mis = cm->mode_info_stride;
int bsl = b_width_log2(bsize);
const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
@@ -1016,7 +1098,7 @@ static void rd_use_partition(VP9_COMP *cpi,
if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
- if (xd->ab_index != 0) {
+ if (x->ab_index != 0) {
*rate = 0;
*dist = 0;
return;
@@ -1074,7 +1156,7 @@ static void rd_use_partition(VP9_COMP *cpi,
bsize, get_block_context(x, bsize), INT64_MAX);
break;
case PARTITION_HORZ:
- *get_sb_index(xd, subsize) = 0;
+ *get_sb_index(x, subsize) = 0;
pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
@@ -1083,7 +1165,7 @@ static void rd_use_partition(VP9_COMP *cpi,
int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *get_sb_index(xd, subsize) = 1;
+ *get_sb_index(x, subsize) = 1;
pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
@@ -1097,7 +1179,7 @@ static void rd_use_partition(VP9_COMP *cpi,
}
break;
case PARTITION_VERT:
- *get_sb_index(xd, subsize) = 0;
+ *get_sb_index(x, subsize) = 0;
pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist,
subsize, get_block_context(x, subsize), INT64_MAX);
if (last_part_rate != INT_MAX &&
@@ -1106,7 +1188,7 @@ static void rd_use_partition(VP9_COMP *cpi,
int64_t dt = 0;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *get_sb_index(xd, subsize) = 1;
+ *get_sb_index(x, subsize) = 1;
pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize,
get_block_context(x, subsize), INT64_MAX);
if (rt == INT_MAX || dt == INT_MAX) {
@@ -1132,7 +1214,7 @@ static void rd_use_partition(VP9_COMP *cpi,
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
- *get_sb_index(xd, subsize) = i;
+ *get_sb_index(x, subsize) = i;
rd_use_partition(cpi, tile, mi_8x8 + jj * bss * mis + ii * bss, tp,
mi_row + y_idx, mi_col + x_idx, subsize, &rt, &dt,
@@ -1173,11 +1255,10 @@ static void rd_use_partition(VP9_COMP *cpi,
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
- if ((mi_row + y_idx >= cm->mi_rows)
- || (mi_col + x_idx >= cm->mi_cols))
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
- *get_sb_index(xd, split_subsize) = i;
+ *get_sb_index(x, split_subsize) = i;
*get_sb_partitioning(x, bsize) = split_subsize;
*get_sb_partitioning(x, split_subsize) = split_subsize;
@@ -1203,7 +1284,8 @@ static void rd_use_partition(VP9_COMP *cpi,
split_dist += dt;
pl = partition_plane_context(cpi->above_seg_context,
cpi->left_seg_context,
- mi_row + y_idx, mi_col + x_idx, bsize);
+ mi_row + y_idx, mi_col + x_idx,
+ split_subsize);
split_rate += x->partition_cost[pl][PARTITION_NONE];
}
pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context,
@@ -1241,8 +1323,19 @@ static void rd_use_partition(VP9_COMP *cpi,
if (bsize == BLOCK_64X64)
assert(chosen_rate < INT_MAX && chosen_dist < INT_MAX);
- if (do_recon)
- encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+ if (do_recon) {
+ int output_enabled = (bsize == BLOCK_64X64);
+
+ // Check the projected output rate for this SB against its target
+ // and if necessary apply a Q delta using segmentation to get
+ // closer to the target.
+ if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
+ select_in_frame_q_segment(cpi, mi_row, mi_col,
+ output_enabled, chosen_rate);
+ }
+
+ encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
+ }
*rate = chosen_rate;
*dist = chosen_dist;
@@ -1357,7 +1450,6 @@ static void rd_auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
// Only use 8x8 result for non HD videos.
// int use_8x8 = (MIN(cpi->common.width, cpi->common.height) < 720) ? 1 : 0;
@@ -1370,9 +1462,9 @@ static void compute_fast_motion_search_level(VP9_COMP *cpi, BLOCK_SIZE bsize) {
PICK_MODE_CONTEXT *block_context = NULL;
if (bsize == BLOCK_16X16) {
- block_context = x->sb8x8_context[xd->sb_index][xd->mb_index];
+ block_context = x->sb8x8_context[x->sb_index][x->mb_index];
} else if (bsize == BLOCK_32X32) {
- block_context = x->mb_context[xd->sb_index];
+ block_context = x->mb_context[x->sb_index];
} else if (bsize == BLOCK_64X64) {
block_context = x->sb32_context;
}
@@ -1460,9 +1552,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row,
int mi_col, BLOCK_SIZE bsize, int *rate,
int64_t *dist, int do_recon, int64_t best_rd) {
- VP9_COMMON * const cm = &cpi->common;
- MACROBLOCK * const x = &cpi->mb;
- MACROBLOCKD * const xd = &x->e_mbd;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
const int ms = num_8x8_blocks_wide_lookup[bsize] / 2;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
@@ -1477,10 +1568,14 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
// Override skipping rectangular partition operations for edge blocks
const int force_horz_split = (mi_row + ms >= cm->mi_rows);
const int force_vert_split = (mi_col + ms >= cm->mi_cols);
+ const int xss = x->e_mbd.plane[1].subsampling_x;
+ const int yss = x->e_mbd.plane[1].subsampling_y;
int partition_none_allowed = !force_horz_split && !force_vert_split;
- int partition_horz_allowed = !force_vert_split && bsize >= BLOCK_8X8;
- int partition_vert_allowed = !force_horz_split && bsize >= BLOCK_8X8;
+ int partition_horz_allowed = !force_vert_split && yss <= xss &&
+ bsize >= BLOCK_8X8;
+ int partition_vert_allowed = !force_horz_split && xss <= yss &&
+ bsize >= BLOCK_8X8;
int partition_split_done = 0;
(void) *tp_orig;
@@ -1488,13 +1583,14 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
// there is nothing to be done.
- if (xd->ab_index != 0) {
+ if (x->ab_index != 0) {
*rate = 0;
*dist = 0;
return;
}
}
- assert(mi_height_log2(bsize) == mi_width_log2(bsize));
+ assert(num_8x8_blocks_wide_lookup[bsize] ==
+ num_8x8_blocks_high_lookup[bsize]);
if (bsize == BLOCK_16X16) {
set_offsets(cpi, tile, mi_row, mi_col, bsize);
@@ -1545,7 +1641,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
}
sum_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_dist);
if (sum_rd < best_rd) {
- int64_t stop_thresh = 2048;
+ int64_t stop_thresh = 4096;
+ int64_t stop_thresh_rd;
best_rate = this_rate;
best_dist = this_dist;
@@ -1557,9 +1654,10 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
stop_thresh >>= 8 - (b_width_log2_lookup[bsize] +
b_height_log2_lookup[bsize]);
+ stop_thresh_rd = RDCOST(x->rdmult, x->rddiv, 0, stop_thresh);
// If obtained distortion is very small, choose current partition
// and stop splitting.
- if (this_dist < stop_thresh) {
+ if (!x->e_mbd.lossless && best_rd < stop_thresh_rd) {
do_split = 0;
do_rect = 0;
}
@@ -1585,9 +1683,13 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
continue;
- *get_sb_index(xd, subsize) = i;
+ *get_sb_index(x, subsize) = i;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
+ if (cpi->sf.adaptive_pred_filter_type && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ get_block_context(x, subsize)->pred_filter_type =
+ get_block_context(x, bsize)->mic.mbmi.interp_filter;
rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, subsize,
&this_rate, &this_dist, i != 3, best_rd - sum_rd);
@@ -1632,9 +1734,13 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
// PARTITION_HORZ
if (partition_horz_allowed && do_rect) {
subsize = get_subsize(bsize, PARTITION_HORZ);
- *get_sb_index(xd, subsize) = 0;
+ *get_sb_index(x, subsize) = 0;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
+ if (cpi->sf.adaptive_pred_filter_type && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ get_block_context(x, subsize)->pred_filter_type =
+ get_block_context(x, bsize)->mic.mbmi.interp_filter;
pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
get_block_context(x, subsize), best_rd);
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
@@ -1643,9 +1749,13 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *get_sb_index(xd, subsize) = 1;
+ *get_sb_index(x, subsize) = 1;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
+ if (cpi->sf.adaptive_pred_filter_type && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ get_block_context(x, subsize)->pred_filter_type =
+ get_block_context(x, bsize)->mic.mbmi.interp_filter;
pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate,
&this_dist, subsize, get_block_context(x, subsize),
best_rd - sum_rd);
@@ -1677,9 +1787,13 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (partition_vert_allowed && do_rect) {
subsize = get_subsize(bsize, PARTITION_VERT);
- *get_sb_index(xd, subsize) = 0;
+ *get_sb_index(x, subsize) = 0;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
+ if (cpi->sf.adaptive_pred_filter_type && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ get_block_context(x, subsize)->pred_filter_type =
+ get_block_context(x, bsize)->mic.mbmi.interp_filter;
pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize,
get_block_context(x, subsize), best_rd);
sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
@@ -1687,9 +1801,13 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- *get_sb_index(xd, subsize) = 1;
+ *get_sb_index(x, subsize) = 1;
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, get_block_context(x, bsize));
+ if (cpi->sf.adaptive_pred_filter_type && bsize == BLOCK_8X8 &&
+ partition_none_allowed)
+ get_block_context(x, subsize)->pred_filter_type =
+ get_block_context(x, bsize)->mic.mbmi.interp_filter;
pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate,
&this_dist, subsize, get_block_context(x, subsize),
best_rd - sum_rd);
@@ -1721,8 +1839,17 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
*rate = best_rate;
*dist = best_dist;
- if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon)
- encode_sb(cpi, tile, tp, mi_row, mi_col, bsize == BLOCK_64X64, bsize);
+ if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+ int output_enabled = (bsize == BLOCK_64X64);
+
+ // Check the projected output rate for this SB against its target
+ // and if necessary apply a Q delta using segmentation to get
+ // closer to the target.
+ if ((cpi->oxcf.aq_mode == COMPLEXITY_AQ) && cm->seg.update_map) {
+ select_in_frame_q_segment(cpi, mi_row, mi_col, output_enabled, best_rate);
+ }
+ encode_sb(cpi, tile, tp, mi_row, mi_col, output_enabled, bsize);
+ }
if (bsize == BLOCK_64X64) {
assert(tp_orig < *tp);
assert(best_rate < INT_MAX);
@@ -1768,7 +1895,7 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile,
}
static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
- int mi_row, TOKENEXTRA **tp, int *totalrate) {
+ int mi_row, TOKENEXTRA **tp) {
VP9_COMMON * const cm = &cpi->common;
int mi_col;
@@ -1782,6 +1909,18 @@ static void encode_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
int dummy_rate;
int64_t dummy_dist;
+ BLOCK_SIZE i;
+ MACROBLOCK *x = &cpi->mb;
+ for (i = BLOCK_4X4; i < BLOCK_8X8; ++i) {
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[i];
+ const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
+ for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index)
+ for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index)
+ for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index)
+ get_block_context(x, i)->pred_filter_type = SWITCHABLE;
+ }
+
vp9_zero(cpi->mb.pred_mv);
if (cpi->sf.reference_masking)
@@ -1849,10 +1988,6 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
xd->mode_info_stride = cm->mode_info_stride;
- // reset intra mode contexts
- if (frame_is_intra_only(cm))
- vp9_init_mbmode_probs(cm);
-
// Copy data over into macro block data structures.
vp9_setup_src_planes(x, cpi->Source, 0, 0);
@@ -1869,7 +2004,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
vp9_zero(cpi->y_mode_count);
vp9_zero(cpi->y_uv_mode_count);
vp9_zero(cm->counts.inter_mode);
- vp9_zero(cpi->partition_count);
+ vp9_zero(cm->counts.partition);
vp9_zero(cpi->intra_inter_count);
vp9_zero(cpi->comp_inter_count);
vp9_zero(cpi->single_ref_count);
@@ -1913,7 +2048,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
MACROBLOCK * const x = &cpi->mb;
VP9_COMMON * const cm = &cpi->common;
MACROBLOCKD * const xd = &x->e_mbd;
- int totalrate;
// fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
// cpi->common.current_video_frame, cpi->common.show_frame,
@@ -1929,11 +2063,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
}
#endif
- totalrate = 0;
-
- // Reset frame count of inter 0,0 motion vector usage.
- cpi->inter_zz_count = 0;
-
vp9_zero(cm->counts.switchable_interp);
vp9_zero(cpi->tx_stepdown_count);
@@ -1995,7 +2124,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_tile_init(&tile, cm, tile_row, tile_col);
for (mi_row = tile.mi_row_start;
mi_row < tile.mi_row_end; mi_row += 8)
- encode_sb_row(cpi, &tile, mi_row, &tp, &totalrate);
+ encode_sb_row(cpi, &tile, mi_row, &tp);
cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -2021,10 +2150,6 @@ static void encode_frame_internal(VP9_COMP *cpi) {
cpi->sf.skip_encode_frame = 0;
}
- // 256 rate units to the bit,
- // projected_frame_size in units of BYTES
- cpi->projected_frame_size = totalrate >> 8;
-
#if 0
// Keep record of the total distortion this time around for future use
cpi->last_frame_distortion = cpi->frame_distortion;
@@ -2225,18 +2350,18 @@ void vp9_encode_frame(VP9_COMP *cpi) {
/* prediction (compound, single or hybrid) mode selection */
if (frame_type == 3 || !cm->allow_comp_inter_inter)
- pred_type = SINGLE_PREDICTION_ONLY;
+ pred_type = SINGLE_REFERENCE;
else if (cpi->rd_prediction_type_threshes[frame_type][1]
> cpi->rd_prediction_type_threshes[frame_type][0]
&& cpi->rd_prediction_type_threshes[frame_type][1]
> cpi->rd_prediction_type_threshes[frame_type][2]
&& check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
- pred_type = COMP_PREDICTION_ONLY;
+ pred_type = COMPOUND_REFERENCE;
else if (cpi->rd_prediction_type_threshes[frame_type][0]
> cpi->rd_prediction_type_threshes[frame_type][2])
- pred_type = SINGLE_PREDICTION_ONLY;
+ pred_type = SINGLE_REFERENCE;
else
- pred_type = HYBRID_PREDICTION;
+ pred_type = REFERENCE_MODE_SELECT;
/* filter type selection */
// FIXME(rbultje) for some odd reason, we often select smooth_filter
@@ -2269,11 +2394,11 @@ void vp9_encode_frame(VP9_COMP *cpi) {
/* transform size selection (4x4, 8x8, 16x16 or select-per-mb) */
select_tx_mode(cpi);
- cpi->common.comp_pred_mode = pred_type;
+ cpi->common.reference_mode = pred_type;
cpi->common.mcomp_filter_type = filter_type;
encode_frame_internal(cpi);
- for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+ for (i = 0; i < REFERENCE_MODES; ++i) {
const int diff = (int) (cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
cpi->rd_prediction_type_threshes[frame_type][i] += diff;
cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
@@ -2296,7 +2421,7 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cpi->rd_tx_select_threshes[frame_type][i] /= 2;
}
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ if (cpi->common.reference_mode == REFERENCE_MODE_SELECT) {
int single_count_zero = 0;
int comp_count_zero = 0;
@@ -2306,10 +2431,10 @@ void vp9_encode_frame(VP9_COMP *cpi) {
}
if (comp_count_zero == 0) {
- cpi->common.comp_pred_mode = SINGLE_PREDICTION_ONLY;
+ cpi->common.reference_mode = SINGLE_REFERENCE;
vp9_zero(cpi->comp_inter_count);
} else if (single_count_zero == 0) {
- cpi->common.comp_pred_mode = COMP_PREDICTION_ONLY;
+ cpi->common.reference_mode = COMPOUND_REFERENCE;
vp9_zero(cpi->comp_inter_count);
}
}
@@ -2401,13 +2526,18 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
MODE_INFO **mi_8x8 = xd->mi_8x8;
MODE_INFO *mi = mi_8x8[0];
MB_MODE_INFO *mbmi = &mi->mbmi;
+ PICK_MODE_CONTEXT *ctx = get_block_context(x, bsize);
unsigned int segment_id = mbmi->segment_id;
const int mis = cm->mode_info_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
+ x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8 &&
+ (cpi->oxcf.aq_mode != COMPLEXITY_AQ);
+ x->skip_optimize = ctx->is_coded;
+ ctx->is_coded = 1;
x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
- xd->q_index < QIDX_SKIP_THRESH);
+ x->q_index < QIDX_SKIP_THRESH);
if (x->skip_encode)
return;
@@ -2492,31 +2622,23 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
!(is_inter_block(mbmi) &&
(mbmi->skip_coeff ||
vp9_segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)))) {
- const uint8_t context = vp9_get_pred_context_tx_size(xd);
- ++get_tx_counts(bsize, context, &cm->counts.tx)[mbmi->tx_size];
+ ++get_tx_counts(max_txsize_lookup[bsize], vp9_get_tx_size_context(xd),
+ &cm->counts.tx)[mbmi->tx_size];
} else {
int x, y;
- TX_SIZE sz = tx_mode_to_biggest_tx_size[cm->tx_mode];
- assert(sizeof(tx_mode_to_biggest_tx_size) /
- sizeof(tx_mode_to_biggest_tx_size[0]) == TX_MODES);
+ TX_SIZE tx_size;
// The new intra coding scheme requires no change of transform size
if (is_inter_block(&mi->mbmi)) {
- if (sz == TX_32X32 && bsize < BLOCK_32X32)
- sz = TX_16X16;
- if (sz == TX_16X16 && bsize < BLOCK_16X16)
- sz = TX_8X8;
- if (sz == TX_8X8 && bsize < BLOCK_8X8)
- sz = TX_4X4;
- } else if (bsize >= BLOCK_8X8) {
- sz = mbmi->tx_size;
+ tx_size = MIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+ max_txsize_lookup[bsize]);
} else {
- sz = TX_4X4;
+ tx_size = (bsize >= BLOCK_8X8) ? mbmi->tx_size : TX_4X4;
}
for (y = 0; y < mi_height; y++)
for (x = 0; x < mi_width; x++)
if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
- mi_8x8[mis * y + x]->mbmi.tx_size = sz;
+ mi_8x8[mis * y + x]->mbmi.tx_size = tx_size;
}
}
}
diff --git a/source/libvpx/vp9/encoder/vp9_encodeintra.c b/source/libvpx/vp9/encoder/vp9_encodeintra.c
deleted file mode 100644
index 32b4593..0000000
--- a/source/libvpx/vp9/encoder/vp9_encodeintra.c
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_reconintra.h"
-#include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_encodeintra.h"
-
-int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) {
- MB_MODE_INFO * mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
- x->skip_encode = 0;
- mbmi->mode = DC_PRED;
- mbmi->ref_frame[0] = INTRA_FRAME;
- mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
- : TX_8X8)
- : TX_4X4;
- vp9_encode_intra_block_y(x, mbmi->sb_type);
- return vp9_get_mb_ss(x->plane[0].src_diff);
-}
diff --git a/source/libvpx/vp9/encoder/vp9_encodeintra.h b/source/libvpx/vp9/encoder/vp9_encodeintra.h
deleted file mode 100644
index e217924..0000000
--- a/source/libvpx/vp9/encoder/vp9_encodeintra.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_VP9_ENCODEINTRA_H_
-#define VP9_ENCODER_VP9_ENCODEINTRA_H_
-
-#include "vp9/encoder/vp9_onyx_int.h"
-
-int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg);
-
-#endif // VP9_ENCODER_VP9_ENCODEINTRA_H_
diff --git a/source/libvpx/vp9/encoder/vp9_encodemb.c b/source/libvpx/vp9/encoder/vp9_encodemb.c
index e52e8ec..e05ba1b 100644
--- a/source/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/source/libvpx/vp9/encoder/vp9_encodemb.c
@@ -25,6 +25,26 @@
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_tokenize.h"
+void vp9_setup_interp_filters(MACROBLOCKD *xd,
+ INTERPOLATION_TYPE mcomp_filter_type,
+ VP9_COMMON *cm) {
+ if (xd->mi_8x8 && xd->mi_8x8[0]) {
+ MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+
+ set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME,
+ mbmi->ref_frame[1] - LAST_FRAME,
+ cm->active_ref_scale);
+ } else {
+ set_scale_factors(xd, -1, -1, cm->active_ref_scale);
+ }
+
+ xd->subpix.filter_x = xd->subpix.filter_y =
+ vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ?
+ EIGHTTAP : mcomp_filter_type);
+
+ assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
+}
+
void vp9_subtract_block_c(int rows, int cols,
int16_t *diff_ptr, ptrdiff_t diff_stride,
const uint8_t *src_ptr, ptrdiff_t src_stride,
@@ -43,13 +63,12 @@ void vp9_subtract_block_c(int rows, int cols,
static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
struct macroblock_plane *const p = &x->plane[plane];
- const MACROBLOCKD *const xd = &x->e_mbd;
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int bw = plane_block_width(bsize, pd);
- const int bh = plane_block_height(bsize, pd);
+ const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+ const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
- vp9_subtract_block(bh, bw, p->src_diff, bw,
- p->src.buf, p->src.stride,
+ vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
pd->dst.buf, pd->dst.stride);
}
@@ -117,6 +136,7 @@ static void optimize_b(MACROBLOCK *mb,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
TX_SIZE tx_size) {
MACROBLOCKD *const xd = &mb->e_mbd;
+ struct macroblock_plane *p = &mb->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
vp9_token_state tokens[1025][2];
@@ -124,7 +144,7 @@ static void optimize_b(MACROBLOCK *mb,
const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block);
int16_t *qcoeff_ptr;
int16_t *dqcoeff_ptr;
- int eob = pd->eobs[block], final_eob, sz = 0;
+ int eob = p->eobs[block], final_eob, sz = 0;
const int i0 = 0;
int rc, x, next, i;
int64_t rdmult, rddiv, rd_cost0, rd_cost1;
@@ -133,29 +153,30 @@ static void optimize_b(MACROBLOCK *mb,
PLANE_TYPE type = pd->plane_type;
int err_mult = plane_rd_mult[type];
const int default_eob = 16 << (tx_size << 1);
- const int16_t *scan, *nb;
+
const int mul = 1 + (tx_size == TX_32X32);
uint8_t token_cache[1024];
- const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block);
const int16_t *dequant_ptr = pd->dequant;
const uint8_t *const band_translate = get_band_translate(tx_size);
+ const scan_order *so = get_scan(xd, tx_size, type, block);
+ const int16_t *scan = so->scan;
+ const int16_t *nb = so->neighbors;
assert((!type && !plane) || (type && plane));
dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
- qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
- get_scan(xd, tx_size, type, ib, &scan, &nb);
+ qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
assert(eob <= default_eob);
/* Now set up a Viterbi trellis to evaluate alternative roundings. */
rdmult = mb->rdmult * err_mult;
- if (mb->e_mbd.mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME)
+ if (!is_inter_block(&mb->e_mbd.mi_8x8[0]->mbmi))
rdmult = (rdmult * 9) >> 4;
rddiv = mb->rddiv;
/* Initialize the sentinel node of the trellis. */
tokens[eob][0].rate = 0;
tokens[eob][0].error = 0;
tokens[eob][0].next = default_eob;
- tokens[eob][0].token = DCT_EOB_TOKEN;
+ tokens[eob][0].token = EOB_TOKEN;
tokens[eob][0].qc = 0;
*(tokens[eob] + 1) = *(tokens[eob] + 0);
next = eob;
@@ -179,7 +200,7 @@ static void optimize_b(MACROBLOCK *mb,
t0 = (vp9_dct_value_tokens_ptr + x)->token;
/* Consider both possible successor states. */
if (next < default_eob) {
- band = get_coef_band(band_translate, i + 1);
+ band = band_translate[i + 1];
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 +=
mb->token_costs[tx_size][type][ref][band][0][pt]
@@ -222,21 +243,19 @@ static void optimize_b(MACROBLOCK *mb,
/* If we reduced this coefficient to zero, check to see if
* we need to move the EOB back here.
*/
- t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
- DCT_EOB_TOKEN : ZERO_TOKEN;
- t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
- DCT_EOB_TOKEN : ZERO_TOKEN;
+ t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
+ t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
} else {
t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
}
if (next < default_eob) {
- band = get_coef_band(band_translate, i + 1);
- if (t0 != DCT_EOB_TOKEN) {
+ band = band_translate[i + 1];
+ if (t0 != EOB_TOKEN) {
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
[tokens[next][0].token];
}
- if (t1 != DCT_EOB_TOKEN) {
+ if (t1 != EOB_TOKEN) {
pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
[tokens[next][1].token];
@@ -264,16 +283,16 @@ static void optimize_b(MACROBLOCK *mb,
/* There's no choice to make for a zero coefficient, so we don't
* add a new trellis node, but we do need to update the costs.
*/
- band = get_coef_band(band_translate, i + 1);
+ band = band_translate[i + 1];
t0 = tokens[next][0].token;
t1 = tokens[next][1].token;
/* Update the cost of each path if we're past the EOB token. */
- if (t0 != DCT_EOB_TOKEN) {
+ if (t0 != EOB_TOKEN) {
tokens[next][0].rate +=
mb->token_costs[tx_size][type][ref][band][1][0][t0];
tokens[next][0].token = ZERO_TOKEN;
}
- if (t1 != DCT_EOB_TOKEN) {
+ if (t1 != EOB_TOKEN) {
tokens[next][1].rate +=
mb->token_costs[tx_size][type][ref][band][1][0][t1];
tokens[next][1].token = ZERO_TOKEN;
@@ -284,7 +303,7 @@ static void optimize_b(MACROBLOCK *mb,
}
/* Now pick the best path through the whole trellis. */
- band = get_coef_band(band_translate, i + 1);
+ band = band_translate[i + 1];
pt = combine_entropy_contexts(*a, *l);
rate0 = tokens[next][0].rate;
rate1 = tokens[next][1].rate;
@@ -313,7 +332,7 @@ static void optimize_b(MACROBLOCK *mb,
}
final_eob++;
- xd->plane[plane].eobs[block] = final_eob;
+ mb->plane[plane].eobs[block] = final_eob;
*a = *l = (final_eob > 0);
}
@@ -348,65 +367,51 @@ void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+ int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- const int16_t *scan, *iscan;
- uint16_t *eob = &pd->eobs[block];
- const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
- const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
- int xoff, yoff;
+ const scan_order *scan_order;
+ uint16_t *eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ int i, j;
int16_t *src_diff;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+ src_diff = &p->src_diff[4 * (j * diff_stride + i)];
switch (tx_size) {
case TX_32X32:
- scan = vp9_default_scan_32x32;
- iscan = vp9_default_iscan_32x32;
- block >>= 6;
- xoff = 32 * (block & twmask);
- yoff = 32 * (block >> twl);
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
+ scan_order = &vp9_default_scan_orders[TX_32X32];
if (x->use_lp32x32fdct)
- vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
+ vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
else
- vp9_fdct32x32(src_diff, coeff, bw * 4);
+ vp9_fdct32x32(src_diff, coeff, diff_stride);
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+ pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ scan_order->iscan);
break;
case TX_16X16:
- scan = vp9_default_scan_16x16;
- iscan = vp9_default_iscan_16x16;
- block >>= 4;
- xoff = 16 * (block & twmask);
- yoff = 16 * (block >> twl);
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
- vp9_fdct16x16(src_diff, coeff, bw * 4);
+ scan_order = &vp9_default_scan_orders[TX_16X16];
+ vp9_fdct16x16(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
break;
case TX_8X8:
- scan = vp9_default_scan_8x8;
- iscan = vp9_default_iscan_8x8;
- block >>= 2;
- xoff = 8 * (block & twmask);
- yoff = 8 * (block >> twl);
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
- vp9_fdct8x8(src_diff, coeff, bw * 4);
+ scan_order = &vp9_default_scan_orders[TX_8X8];
+ vp9_fdct8x8(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
break;
case TX_4X4:
- scan = vp9_default_scan_4x4;
- iscan = vp9_default_iscan_4x4;
- xoff = 4 * (block & twmask);
- yoff = 4 * (block >> twl);
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
- x->fwd_txm4x4(src_diff, coeff, bw * 4);
+ scan_order = &vp9_default_scan_orders[TX_4X4];
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+ pd->dequant, p->zbin_extra, eob,
+ scan_order->scan, scan_order->iscan);
break;
default:
assert(0);
@@ -419,51 +424,54 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
struct optimize_ctx *const ctx = args->ctx;
+ struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
- block);
-
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
- pd->dst.buf, pd->dst.stride);
+ int i, j;
+ uint8_t *dst;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+ dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
// TODO(jingning): per transformed block zero forcing only enabled for
// luma component. will integrate chroma components as well.
if (x->zcoeff_blk[tx_size][block] && plane == 0) {
- int x, y;
- pd->eobs[block] = 0;
- txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
- ctx->ta[plane][x] = 0;
- ctx->tl[plane][y] = 0;
+ p->eobs[block] = 0;
+ ctx->ta[plane][i] = 0;
+ ctx->tl[plane][j] = 0;
return;
}
- vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
+ if (!x->skip_recode)
+ vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
- if (x->optimize)
+ if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
+ } else {
+ ctx->ta[plane][i] = p->eobs[block] > 0;
+ ctx->tl[plane][j] = p->eobs[block] > 0;
+ }
- if (x->skip_encode || pd->eobs[block] == 0)
+ if (x->skip_encode || p->eobs[block] == 0)
return;
switch (tx_size) {
case TX_32X32:
- vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+ vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
break;
case TX_16X16:
- vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+ vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
break;
case TX_8X8:
- vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+ vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
break;
case TX_4X4:
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
- xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+ xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
break;
default:
- assert(!"Invalid transform size");
+ assert(0 && "Invalid transform size");
}
}
@@ -472,20 +480,20 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
struct encode_b_args *const args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
- const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
- block);
-
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
- pd->dst.buf, pd->dst.stride);
+ int i, j;
+ uint8_t *dst;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+ dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
- if (pd->eobs[block] == 0)
+ if (p->eobs[block] == 0)
return;
- xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
+ xd->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
}
void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
@@ -505,9 +513,10 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
struct optimize_ctx ctx;
struct encode_b_args arg = {x, &ctx};
- vp9_subtract_sb(x, bsize);
+ if (!x->skip_recode)
+ vp9_subtract_sb(x, bsize);
- if (x->optimize) {
+ if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
int i;
for (i = 0; i < MAX_MB_PLANE; ++i)
optimize_init_b(i, bsize, &arg);
@@ -525,118 +534,117 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
struct macroblock_plane *const p = &x->plane[plane];
struct macroblockd_plane *const pd = &xd->plane[plane];
int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
- int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
+ int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- const int16_t *scan, *iscan;
+ const scan_order *scan_order;
TX_TYPE tx_type;
MB_PREDICTION_MODE mode;
- const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
- const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
- int xoff, yoff;
+ const int bwl = b_width_log2(plane_bsize);
+ const int diff_stride = 4 * (1 << bwl);
uint8_t *src, *dst;
int16_t *src_diff;
- uint16_t *eob = &pd->eobs[block];
+ uint16_t *eob = &p->eobs[block];
+ int i, j;
+ txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+ dst = &pd->dst.buf[4 * (j * pd->dst.stride + i)];
+ src = &p->src.buf[4 * (j * p->src.stride + i)];
+ src_diff = &p->src_diff[4 * (j * diff_stride + i)];
if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0)
- extend_for_intra(xd, plane_bsize, plane, block, tx_size);
+ extend_for_intra(xd, plane_bsize, plane, i, j);
// if (x->optimize)
// vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
switch (tx_size) {
case TX_32X32:
- scan = vp9_default_scan_32x32;
- iscan = vp9_default_iscan_32x32;
+ scan_order = &vp9_default_scan_orders[TX_32X32];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
- block >>= 6;
- xoff = 32 * (block & twmask);
- yoff = 32 * (block >> twl);
- dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
- src = p->src.buf + yoff * p->src.stride + xoff;
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
- vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode,
- dst, pd->dst.stride, dst, pd->dst.stride);
- vp9_subtract_block(32, 32, src_diff, bw * 4,
- src, p->src.stride, dst, pd->dst.stride);
- if (x->use_lp32x32fdct)
- vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
- else
- vp9_fdct32x32(src_diff, coeff, bw * 4);
- vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+ vp9_predict_intra_block(xd, block >> 6, bwl, TX_32X32, mode,
+ x->skip_encode ? src : dst,
+ x->skip_encode ? p->src.stride : pd->dst.stride,
+ dst, pd->dst.stride);
+ if (!x->skip_recode) {
+ vp9_subtract_block(32, 32, src_diff, diff_stride,
+ src, p->src.stride, dst, pd->dst.stride);
+ if (x->use_lp32x32fdct)
+ vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
+ else
+ vp9_fdct32x32(src_diff, coeff, diff_stride);
+ vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ scan_order->iscan);
+ }
if (!x->skip_encode && *eob)
vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob);
break;
case TX_16X16:
tx_type = get_tx_type_16x16(pd->plane_type, xd);
- scan = get_scan_16x16(tx_type);
- iscan = get_iscan_16x16(tx_type);
+ scan_order = &vp9_scan_orders[TX_16X16][tx_type];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
- block >>= 4;
- xoff = 16 * (block & twmask);
- yoff = 16 * (block >> twl);
- dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
- src = p->src.buf + yoff * p->src.stride + xoff;
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
- vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode,
- dst, pd->dst.stride, dst, pd->dst.stride);
- vp9_subtract_block(16, 16, src_diff, bw * 4,
- src, p->src.stride, dst, pd->dst.stride);
- vp9_fht16x16(tx_type, src_diff, coeff, bw * 4);
- vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+ vp9_predict_intra_block(xd, block >> 4, bwl, TX_16X16, mode,
+ x->skip_encode ? src : dst,
+ x->skip_encode ? p->src.stride : pd->dst.stride,
+ dst, pd->dst.stride);
+ if (!x->skip_recode) {
+ vp9_subtract_block(16, 16, src_diff, diff_stride,
+ src, p->src.stride, dst, pd->dst.stride);
+ vp9_fht16x16(tx_type, src_diff, coeff, diff_stride);
+ vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+ p->quant, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ scan_order->iscan);
+ }
if (!x->skip_encode && *eob)
vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
case TX_8X8:
tx_type = get_tx_type_8x8(pd->plane_type, xd);
- scan = get_scan_8x8(tx_type);
- iscan = get_iscan_8x8(tx_type);
+ scan_order = &vp9_scan_orders[TX_8X8][tx_type];
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
- block >>= 2;
- xoff = 8 * (block & twmask);
- yoff = 8 * (block >> twl);
- dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
- src = p->src.buf + yoff * p->src.stride + xoff;
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
- vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode,
- dst, pd->dst.stride, dst, pd->dst.stride);
- vp9_subtract_block(8, 8, src_diff, bw * 4,
- src, p->src.stride, dst, pd->dst.stride);
- vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);
- vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+ vp9_predict_intra_block(xd, block >> 2, bwl, TX_8X8, mode,
+ x->skip_encode ? src : dst,
+ x->skip_encode ? p->src.stride : pd->dst.stride,
+ dst, pd->dst.stride);
+ if (!x->skip_recode) {
+ vp9_subtract_block(8, 8, src_diff, diff_stride,
+ src, p->src.stride, dst, pd->dst.stride);
+ vp9_fht8x8(tx_type, src_diff, coeff, diff_stride);
+ vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ scan_order->iscan);
+ }
if (!x->skip_encode && *eob)
vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
break;
case TX_4X4:
tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
- scan = get_scan_4x4(tx_type);
- iscan = get_iscan_4x4(tx_type);
+ scan_order = &vp9_scan_orders[TX_4X4][tx_type];
if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
mode = xd->mi_8x8[0]->bmi[block].as_mode;
else
mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
- xoff = 4 * (block & twmask);
- yoff = 4 * (block >> twl);
- dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
- src = p->src.buf + yoff * p->src.stride + xoff;
- src_diff = p->src_diff + 4 * bw * yoff + xoff;
vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
- dst, pd->dst.stride, dst, pd->dst.stride);
- vp9_subtract_block(4, 4, src_diff, bw * 4,
- src, p->src.stride, dst, pd->dst.stride);
- if (tx_type != DCT_DCT)
- vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
- else
- x->fwd_txm4x4(src_diff, coeff, bw * 4);
- vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
- p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, p->zbin_extra, eob, scan, iscan);
+ x->skip_encode ? src : dst,
+ x->skip_encode ? p->src.stride : pd->dst.stride,
+ dst, pd->dst.stride);
+
+ if (!x->skip_recode) {
+ vp9_subtract_block(4, 4, src_diff, diff_stride,
+ src, p->src.stride, dst, pd->dst.stride);
+ if (tx_type != DCT_DCT)
+ vp9_short_fht4x4(src_diff, coeff, diff_stride, tx_type);
+ else
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, p->zbin_extra, eob, scan_order->scan,
+ scan_order->iscan);
+ }
+
if (!x->skip_encode && *eob) {
if (tx_type == DCT_DCT)
// this is like vp9_short_idct4x4 but has a special case around eob<=1
@@ -667,3 +675,14 @@ void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) {
foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg);
}
+int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) {
+ MB_MODE_INFO * mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
+ x->skip_encode = 0;
+ mbmi->mode = DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
+ : TX_8X8)
+ : TX_4X4;
+ vp9_encode_intra_block_y(x, mbmi->sb_type);
+ return vp9_get_mb_ss(x->plane[0].src_diff);
+}
diff --git a/source/libvpx/vp9/encoder/vp9_encodemb.h b/source/libvpx/vp9/encoder/vp9_encodemb.h
index 61dd735..7be6621 100644
--- a/source/libvpx/vp9/encoder/vp9_encodemb.h
+++ b/source/libvpx/vp9/encoder/vp9_encodemb.h
@@ -47,8 +47,14 @@ void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
+
void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize);
-
+int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred);
+void vp9_setup_interp_filters(MACROBLOCKD *xd,
+ INTERPOLATION_TYPE mcomp_filter_type,
+ VP9_COMMON *cm);
#endif // VP9_ENCODER_VP9_ENCODEMB_H_
diff --git a/source/libvpx/vp9/encoder/vp9_encodemv.c b/source/libvpx/vp9/encoder/vp9_encodemv.c
index 9ebcc49..9af28f9 100644
--- a/source/libvpx/vp9/encoder/vp9_encodemv.c
+++ b/source/libvpx/vp9/encoder/vp9_encodemv.c
@@ -15,11 +15,22 @@
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_encodemv.h"
-
#ifdef ENTROPY_STATS
extern unsigned int active_section;
#endif
+static struct vp9_token mv_joint_encodings[MV_JOINTS];
+static struct vp9_token mv_class_encodings[MV_CLASSES];
+static struct vp9_token mv_fp_encodings[MV_FP_SIZE];
+static struct vp9_token mv_class0_encodings[CLASS0_SIZE];
+
+void vp9_entropy_mv_init() {
+ vp9_tokens_from_tree(mv_joint_encodings, vp9_mv_joint_tree);
+ vp9_tokens_from_tree(mv_class_encodings, vp9_mv_class_tree);
+ vp9_tokens_from_tree(mv_class0_encodings, vp9_mv_class0_tree);
+ vp9_tokens_from_tree(mv_fp_encodings, vp9_mv_fp_tree);
+}
+
static void encode_mv_component(vp9_writer* w, int comp,
const nmv_component* mvcomp, int usehp) {
int offset;
@@ -36,13 +47,13 @@ static void encode_mv_component(vp9_writer* w, int comp,
vp9_write(w, sign, mvcomp->sign);
// Class
- write_token(w, vp9_mv_class_tree, mvcomp->classes,
- &vp9_mv_class_encodings[mv_class]);
+ vp9_write_token(w, vp9_mv_class_tree, mvcomp->classes,
+ &mv_class_encodings[mv_class]);
// Integer bits
if (mv_class == MV_CLASS_0) {
- write_token(w, vp9_mv_class0_tree, mvcomp->class0,
- &vp9_mv_class0_encodings[d]);
+ vp9_write_token(w, vp9_mv_class0_tree, mvcomp->class0,
+ &mv_class0_encodings[d]);
} else {
int i;
const int n = mv_class + CLASS0_BITS - 1; // number of bits
@@ -51,9 +62,9 @@ static void encode_mv_component(vp9_writer* w, int comp,
}
// Fractional bits
- write_token(w, vp9_mv_fp_tree,
- mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp,
- &vp9_mv_fp_encodings[fr]);
+ vp9_write_token(w, vp9_mv_fp_tree,
+ mv_class == MV_CLASS_0 ? mvcomp->class0_fp[d] : mvcomp->fp,
+ &mv_fp_encodings[fr]);
// High precision bit
if (usehp)
@@ -68,7 +79,7 @@ static void build_nmv_component_cost_table(int *mvcost,
int i, v;
int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
int bits_cost[MV_OFFSET_BITS][2];
- int class0_fp_cost[CLASS0_SIZE][4], fp_cost[4];
+ int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
int class0_hp_cost[2], hp_cost[2];
sign_cost[0] = vp9_cost_zero(mvcomp->sign);
@@ -124,155 +135,68 @@ static void build_nmv_component_cost_table(int *mvcost,
}
}
-static int update_mv(vp9_writer *w, const unsigned int ct[2],
- vp9_prob *cur_p, vp9_prob new_p, vp9_prob upd_p) {
- vp9_prob mod_p = new_p | 1;
- const int cur_b = cost_branch256(ct, *cur_p);
- const int mod_b = cost_branch256(ct, mod_p);
- const int cost = 7 * 256 + (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
- if (cur_b - mod_b > cost) {
- *cur_p = mod_p;
- vp9_write(w, 1, upd_p);
- vp9_write_literal(w, mod_p >> 1, 7);
- return 1;
- } else {
- vp9_write(w, 0, upd_p);
- return 0;
+static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
+ vp9_prob upd_p) {
+ const vp9_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
+ const int update = cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) >
+ cost_branch256(ct, new_p) + vp9_cost_one(upd_p) + 7 * 256;
+ vp9_write(w, update, upd_p);
+ if (update) {
+ *cur_p = new_p;
+ vp9_write_literal(w, new_p >> 1, 7);
}
+ return update;
}
-static void counts_to_nmv_context(
- nmv_context_counts *nmv_count,
- nmv_context *prob,
- int usehp,
- unsigned int (*branch_ct_joint)[2],
- unsigned int (*branch_ct_sign)[2],
- unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2],
- unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2],
- unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2],
- unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2],
- unsigned int (*branch_ct_fp)[4 - 1][2],
- unsigned int (*branch_ct_class0_hp)[2],
- unsigned int (*branch_ct_hp)[2]) {
- int i, j, k;
- vp9_tree_probs_from_distribution(vp9_mv_joint_tree,
- prob->joints,
- branch_ct_joint,
- nmv_count->joints, 0);
- for (i = 0; i < 2; ++i) {
- const uint32_t s0 = nmv_count->comps[i].sign[0];
- const uint32_t s1 = nmv_count->comps[i].sign[1];
-
- prob->comps[i].sign = get_binary_prob(s0, s1);
- branch_ct_sign[i][0] = s0;
- branch_ct_sign[i][1] = s1;
- vp9_tree_probs_from_distribution(vp9_mv_class_tree,
- prob->comps[i].classes,
- branch_ct_classes[i],
- nmv_count->comps[i].classes, 0);
- vp9_tree_probs_from_distribution(vp9_mv_class0_tree,
- prob->comps[i].class0,
- branch_ct_class0[i],
- nmv_count->comps[i].class0, 0);
- for (j = 0; j < MV_OFFSET_BITS; ++j) {
- const uint32_t b0 = nmv_count->comps[i].bits[j][0];
- const uint32_t b1 = nmv_count->comps[i].bits[j][1];
-
- prob->comps[i].bits[j] = get_binary_prob(b0, b1);
- branch_ct_bits[i][j][0] = b0;
- branch_ct_bits[i][j][1] = b1;
- }
- }
- for (i = 0; i < 2; ++i) {
- for (k = 0; k < CLASS0_SIZE; ++k) {
- vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].class0_fp[k],
- branch_ct_class0_fp[i][k],
- nmv_count->comps[i].class0_fp[k], 0);
- }
- vp9_tree_probs_from_distribution(vp9_mv_fp_tree,
- prob->comps[i].fp,
- branch_ct_fp[i],
- nmv_count->comps[i].fp, 0);
- }
- if (usehp) {
- for (i = 0; i < 2; ++i) {
- const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0];
- const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1];
- const uint32_t hp0 = nmv_count->comps[i].hp[0];
- const uint32_t hp1 = nmv_count->comps[i].hp[1];
-
- prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1);
- branch_ct_class0_hp[i][0] = c0_hp0;
- branch_ct_class0_hp[i][1] = c0_hp1;
-
- prob->comps[i].hp = get_binary_prob(hp0, hp1);
- branch_ct_hp[i][0] = hp0;
- branch_ct_hp[i][1] = hp1;
- }
- }
+static void write_mv_update(const vp9_tree_index *tree,
+ vp9_prob probs[/*n - 1*/],
+ const unsigned int counts[/*n - 1*/],
+ int n, vp9_writer *w) {
+ int i;
+ unsigned int branch_ct[32][2];
+
+ // Assuming max number of probabilities <= 32
+ assert(n <= 32);
+
+ vp9_tree_probs_from_distribution(tree, branch_ct, counts);
+ for (i = 0; i < n - 1; ++i)
+ update_mv(w, branch_ct[i], &probs[i], NMV_UPDATE_PROB);
}
-void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer* const bc) {
+void vp9_write_nmv_probs(VP9_COMP* const cpi, int usehp, vp9_writer *w) {
int i, j;
- nmv_context prob;
- unsigned int branch_ct_joint[MV_JOINTS - 1][2];
- unsigned int branch_ct_sign[2][2];
- unsigned int branch_ct_classes[2][MV_CLASSES - 1][2];
- unsigned int branch_ct_class0[2][CLASS0_SIZE - 1][2];
- unsigned int branch_ct_bits[2][MV_OFFSET_BITS][2];
- unsigned int branch_ct_class0_fp[2][CLASS0_SIZE][4 - 1][2];
- unsigned int branch_ct_fp[2][4 - 1][2];
- unsigned int branch_ct_class0_hp[2][2];
- unsigned int branch_ct_hp[2][2];
nmv_context *mvc = &cpi->common.fc.nmvc;
+ nmv_context_counts *counts = &cpi->NMVcount;
- counts_to_nmv_context(&cpi->NMVcount, &prob, usehp,
- branch_ct_joint, branch_ct_sign, branch_ct_classes,
- branch_ct_class0, branch_ct_bits,
- branch_ct_class0_fp, branch_ct_fp,
- branch_ct_class0_hp, branch_ct_hp);
-
- for (j = 0; j < MV_JOINTS - 1; ++j)
- update_mv(bc, branch_ct_joint[j], &mvc->joints[j], prob.joints[j],
- NMV_UPDATE_PROB);
+ write_mv_update(vp9_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
for (i = 0; i < 2; ++i) {
- update_mv(bc, branch_ct_sign[i], &mvc->comps[i].sign,
- prob.comps[i].sign, NMV_UPDATE_PROB);
- for (j = 0; j < MV_CLASSES - 1; ++j)
- update_mv(bc, branch_ct_classes[i][j], &mvc->comps[i].classes[j],
- prob.comps[i].classes[j], NMV_UPDATE_PROB);
-
- for (j = 0; j < CLASS0_SIZE - 1; ++j)
- update_mv(bc, branch_ct_class0[i][j], &mvc->comps[i].class0[j],
- prob.comps[i].class0[j], NMV_UPDATE_PROB);
-
+ nmv_component *comp = &mvc->comps[i];
+ nmv_component_counts *comp_counts = &counts->comps[i];
+
+ update_mv(w, comp_counts->sign, &comp->sign, NMV_UPDATE_PROB);
+ write_mv_update(vp9_mv_class_tree, comp->classes, comp_counts->classes,
+ MV_CLASSES, w);
+ write_mv_update(vp9_mv_class0_tree, comp->class0, comp_counts->class0,
+ CLASS0_SIZE, w);
for (j = 0; j < MV_OFFSET_BITS; ++j)
- update_mv(bc, branch_ct_bits[i][j], &mvc->comps[i].bits[j],
- prob.comps[i].bits[j], NMV_UPDATE_PROB);
+ update_mv(w, comp_counts->bits[j], &comp->bits[j], NMV_UPDATE_PROB);
}
for (i = 0; i < 2; ++i) {
- for (j = 0; j < CLASS0_SIZE; ++j) {
- int k;
- for (k = 0; k < 3; ++k)
- update_mv(bc, branch_ct_class0_fp[i][j][k],
- &mvc->comps[i].class0_fp[j][k],
- prob.comps[i].class0_fp[j][k], NMV_UPDATE_PROB);
- }
+ for (j = 0; j < CLASS0_SIZE; ++j)
+ write_mv_update(vp9_mv_fp_tree, mvc->comps[i].class0_fp[j],
+ counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
- for (j = 0; j < 3; ++j)
- update_mv(bc, branch_ct_fp[i][j], &mvc->comps[i].fp[j],
- prob.comps[i].fp[j], NMV_UPDATE_PROB);
+ write_mv_update(vp9_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+ MV_FP_SIZE, w);
}
if (usehp) {
for (i = 0; i < 2; ++i) {
- update_mv(bc, branch_ct_class0_hp[i], &mvc->comps[i].class0_hp,
- prob.comps[i].class0_hp, NMV_UPDATE_PROB);
- update_mv(bc, branch_ct_hp[i], &mvc->comps[i].hp,
- prob.comps[i].hp, NMV_UPDATE_PROB);
+ update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
+ NMV_UPDATE_PROB);
+ update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, NMV_UPDATE_PROB);
}
}
}
@@ -285,7 +209,7 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w,
const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
usehp = usehp && vp9_use_mv_hp(ref);
- write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]);
+ vp9_write_token(w, vp9_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
if (mv_joint_vertical(j))
encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
@@ -345,3 +269,4 @@ void vp9_update_mv_count(VP9_COMP *cpi, MACROBLOCK *x, int_mv best_ref_mv[2]) {
inc_mvs(mbmi->mv, best_ref_mv, is_compound, &cpi->NMVcount);
}
}
+
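Note on the vp9_encodemv.c rewrite above: update_mv() only transmits a new probability when it pays for itself — the bits needed to code the observed branch counts with the current probability (plus the "no update" flag) are compared against the cost with the candidate probability, the "update" flag, and the 7 literal bits used to send it. The standalone sketch below models that decision with an idealized -log2 cost in 1/256-bit units; it approximates what cost_branch256()/vp9_cost_zero()/vp9_cost_one() compute rather than using the libvpx entropy-coder tables, and all names and numbers in it are invented for the illustration.

/* savings_model.c - simplified model of the update_mv() decision.
 * Build with: cc savings_model.c -lm */
#include <math.h>
#include <stdio.h>

/* Cost of one binary symbol with probability p/256, in 1/256-bit units. */
static int cost_bit(int p, int bit) {
  const double prob = (bit ? 256 - p : p) / 256.0;
  return (int)(-256.0 * log2(prob) + 0.5);
}

/* Total cost of ct[0] zeros and ct[1] ones coded with probability p. */
static long branch_cost(const unsigned int ct[2], int p) {
  return (long)ct[0] * cost_bit(p, 0) + (long)ct[1] * cost_bit(p, 1);
}

/* Mirror of the update test: keep-cost plus "no update" flag versus
 * new-cost plus "update" flag plus 7 literal bits (7 * 256 units). */
static int should_update(const unsigned int ct[2], int old_p, int new_p,
                         int upd_p) {
  const long keep = branch_cost(ct, old_p) + cost_bit(upd_p, 0);
  const long send = branch_cost(ct, new_p) + cost_bit(upd_p, 1) + 7 * 256;
  return keep > send;
}

int main(void) {
  const unsigned int ct[2] = { 900, 100 };  /* observed branch counts */
  /* Heavily skewed counts against a 0.5 prior: the update is worth sending. */
  printf("update: %d\n", should_update(ct, 128, 224, 252));
  return 0;
}

The same trade-off explains write_mv_update(): deriving the branch counts once per tree via vp9_tree_probs_from_distribution() and looping the per-node test replaces the large counts_to_nmv_context() scratch arrays that the hunk removes.
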
diff --git a/source/libvpx/vp9/encoder/vp9_encodemv.h b/source/libvpx/vp9/encoder/vp9_encodemv.h
index 6331778..4cc10da 100644
--- a/source/libvpx/vp9/encoder/vp9_encodemv.h
+++ b/source/libvpx/vp9/encoder/vp9_encodemv.h
@@ -14,6 +14,8 @@
#include "vp9/encoder/vp9_onyx_int.h"
+void vp9_entropy_mv_init();
+
void vp9_write_nmv_probs(VP9_COMP* const, int usehp, vp9_writer* const);
void vp9_encode_mv(VP9_COMP *cpi, vp9_writer* w, const MV* mv, const MV* ref,
diff --git a/source/libvpx/vp9/common/vp9_extend.c b/source/libvpx/vp9/encoder/vp9_extend.c
index 07c68c8..dcbb5ac 100644
--- a/source/libvpx/vp9/common/vp9_extend.c
+++ b/source/libvpx/vp9/encoder/vp9_extend.c
@@ -11,7 +11,7 @@
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_extend.h"
+#include "vp9/encoder/vp9_extend.h"
static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
uint8_t *dst, int dst_pitch,
@@ -62,7 +62,7 @@ void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
const int et_y = 16;
const int el_y = 16;
// Motion estimation may use src block variance with the block size up
- // to 64x64, so the right and bottom need to be extended to 64 mulitple
+ // to 64x64, so the right and bottom need to be extended to 64 multiple
// or up to 16, whichever is greater.
const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width,
16);
diff --git a/source/libvpx/vp9/common/vp9_extend.h b/source/libvpx/vp9/encoder/vp9_extend.h
index 7ff79b7..7ff79b7 100644
--- a/source/libvpx/vp9/common/vp9_extend.h
+++ b/source/libvpx/vp9/encoder/vp9_extend.h
diff --git a/source/libvpx/vp9/encoder/vp9_firstpass.c b/source/libvpx/vp9/encoder/vp9_firstpass.c
index c83954e..cd6831a 100644
--- a/source/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/source/libvpx/vp9/encoder/vp9_firstpass.c
@@ -11,17 +11,16 @@
#include <math.h>
#include <limits.h>
#include <stdio.h>
+#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_encodeframe.h"
+#include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_encodeintra.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_firstpass.h"
#include "vpx_scale/vpx_scale.h"
-#include "vp9/encoder/vp9_encodeframe.h"
-#include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/common/vp9_systemdependent.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"
#include "vp9/encoder/vp9_quantize.h"
@@ -77,6 +76,19 @@ static int select_cq_level(int qindex) {
return ret_val;
}
+static int gfboost_qadjust(int qindex) {
+ const double q = vp9_convert_qindex_to_q(qindex);
+ return (int)((0.00000828 * q * q * q) +
+ (-0.0055 * q * q) +
+ (1.32 * q) + 79.3);
+}
+
+static int kfboost_qadjust(int qindex) {
+ const double q = vp9_convert_qindex_to_q(qindex);
+ return (int)((0.00000973 * q * q * q) +
+ (-0.00613 * q * q) +
+ (1.316 * q) + 121.2);
+}
// Resets the first pass file to the given position using a relative seek from
// the current position.
@@ -337,9 +349,11 @@ static int frame_max_bits(VP9_COMP *cpi) {
const double max_bits = (1.0 * cpi->twopass.bits_left /
(cpi->twopass.total_stats.count - cpi->common.current_video_frame)) *
(cpi->oxcf.two_pass_vbrmax_section / 100.0);
-
- // Trap case where we are out of bits.
- return MAX((int)max_bits, 0);
+ if (max_bits < 0)
+ return 0;
+ if (max_bits >= INT_MAX)
+ return INT_MAX;
+ return (int)max_bits;
}
void vp9_init_first_pass(VP9_COMP *cpi) {
@@ -350,36 +364,32 @@ void vp9_end_first_pass(VP9_COMP *cpi) {
output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.total_stats);
}
-static void zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
- YV12_BUFFER_CONFIG *recon_buffer,
- int *best_motion_err, int recon_yoffset) {
+static unsigned int zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
+ YV12_BUFFER_CONFIG *recon_buffer,
+ int recon_yoffset) {
MACROBLOCKD *const xd = &x->e_mbd;
+ const uint8_t *const src = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *const ref = xd->plane[0].pre[0].buf
+ = recon_buffer->y_buffer + recon_yoffset;
+ const int ref_stride = xd->plane[0].pre[0].stride;
- // Set up pointers for this macro block recon buffer
- xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset;
-
+ unsigned int sse;
switch (xd->mi_8x8[0]->mbmi.sb_type) {
case BLOCK_8X8:
- vp9_mse8x8(x->plane[0].src.buf, x->plane[0].src.stride,
- xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
- (unsigned int *)(best_motion_err));
+ vp9_mse8x8(src, src_stride, ref, ref_stride, &sse);
break;
case BLOCK_16X8:
- vp9_mse16x8(x->plane[0].src.buf, x->plane[0].src.stride,
- xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
- (unsigned int *)(best_motion_err));
+ vp9_mse16x8(src, src_stride, ref, ref_stride, &sse);
break;
case BLOCK_8X16:
- vp9_mse8x16(x->plane[0].src.buf, x->plane[0].src.stride,
- xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
- (unsigned int *)(best_motion_err));
+ vp9_mse8x16(src, src_stride, ref, ref_stride, &sse);
break;
default:
- vp9_mse16x16(x->plane[0].src.buf, x->plane[0].src.stride,
- xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride,
- (unsigned int *)(best_motion_err));
+ vp9_mse16x16(src, src_stride, ref, ref_stride, &sse);
break;
}
+ return sse;
}
static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
@@ -396,8 +406,7 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int step_param = 3;
int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
int n;
- vp9_variance_fn_ptr_t v_fn_ptr =
- cpi->fn_ptr[xd->mi_8x8[0]->mbmi.sb_type];
+ vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[xd->mi_8x8[0]->mbmi.sb_type];
int new_mv_mode_penalty = 256;
int sr = 0;
@@ -407,8 +416,6 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
// for first pass test
while ((quart_frm << sr) < MAX_FULL_PEL_VAL)
sr++;
- if (sr)
- sr--;
step_param += sr;
further_steps -= sr;
@@ -436,10 +443,11 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
tmp_mv.as_int = 0;
ref_mv_full.as_mv.col = ref_mv->as_mv.col >> 3;
ref_mv_full.as_mv.row = ref_mv->as_mv.row >> 3;
- tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, step_param,
+ tmp_err = cpi->diamond_search_sad(x, &ref_mv_full.as_mv, &tmp_mv.as_mv,
+ step_param,
x->sadperbit16, &num00, &v_fn_ptr,
x->nmvjointcost,
- x->mvcost, ref_mv);
+ x->mvcost, &ref_mv->as_mv);
if (tmp_err < INT_MAX - new_mv_mode_penalty)
tmp_err += new_mv_mode_penalty;
@@ -459,11 +467,11 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
if (num00) {
num00--;
} else {
- tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv,
+ tmp_err = cpi->diamond_search_sad(x, &ref_mv_full.as_mv, &tmp_mv.as_mv,
step_param + n, x->sadperbit16,
&num00, &v_fn_ptr,
x->nmvjointcost,
- x->mvcost, ref_mv);
+ x->mvcost, &ref_mv->as_mv);
if (tmp_err < INT_MAX - new_mv_mode_penalty)
tmp_err += new_mv_mode_penalty;
@@ -482,6 +490,10 @@ void vp9_first_pass(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
TileInfo tile;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ PICK_MODE_CONTEXT *ctx = &x->sb64_context;
+ int i;
int recon_yoffset, recon_uvoffset;
const int lst_yv12_idx = cm->ref_frame_map[cpi->lst_fb_idx];
@@ -525,6 +537,15 @@ void vp9_first_pass(VP9_COMP *cpi) {
vp9_frame_init_quantizer(cpi);
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][1];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+ p[i].eobs = ctx->eobs_pbuf[i][1];
+ }
+ x->skip_recode = 0;
+
+
// Initialise the MV cost table to the defaults
// if( cm->current_video_frame == 0)
// if ( 0 )
@@ -558,10 +579,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
int this_error;
int gf_motion_error = INT_MAX;
int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
- double error_weight;
+ double error_weight = 1.0;
vp9_clear_system_state(); // __asm emms;
- error_weight = 1.0; // avoid uninitialized warnings
xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
@@ -584,19 +604,19 @@ void vp9_first_pass(VP9_COMP *cpi) {
xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
set_mi_row_col(xd, &tile,
mb_row << 1,
- 1 << mi_height_log2(xd->mi_8x8[0]->mbmi.sb_type),
+ num_8x8_blocks_high_lookup[xd->mi_8x8[0]->mbmi.sb_type],
mb_col << 1,
- 1 << mi_width_log2(xd->mi_8x8[0]->mbmi.sb_type),
+ num_8x8_blocks_wide_lookup[xd->mi_8x8[0]->mbmi.sb_type],
cm->mi_rows, cm->mi_cols);
- if (cpi->sf.variance_adaptive_quantization) {
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
int energy = vp9_block_energy(cpi, x, xd->mi_8x8[0]->mbmi.sb_type);
error_weight = vp9_vaq_inv_q_ratio(energy);
}
// do intra 16x16 prediction
this_error = vp9_encode_intra(x, use_dc_pred);
- if (cpi->sf.variance_adaptive_quantization) {
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state(); // __asm emms;
this_error *= error_weight;
}
@@ -622,11 +642,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
// Other than for the first frame do a motion search
if (cm->current_video_frame > 0) {
int tmp_err;
- int motion_error = INT_MAX;
+ int motion_error = zz_motion_search(cpi, x, lst_yv12, recon_yoffset);
int_mv mv, tmp_mv;
-
// Simple 0,0 motion with no mv overhead
- zz_motion_search(cpi, x, lst_yv12, &motion_error, recon_yoffset);
mv.as_int = tmp_mv.as_int = 0;
// Test last reference frame using the previous best mv as the
@@ -634,7 +652,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
first_pass_motion_search(cpi, x, &best_ref_mv,
&mv.as_mv, lst_yv12,
&motion_error, recon_yoffset);
- if (cpi->sf.variance_adaptive_quantization) {
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state(); // __asm emms;
motion_error *= error_weight;
}
@@ -645,7 +663,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
tmp_err = INT_MAX;
first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv.as_mv,
lst_yv12, &tmp_err, recon_yoffset);
- if (cpi->sf.variance_adaptive_quantization) {
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state(); // __asm emms;
tmp_err *= error_weight;
}
@@ -659,13 +677,12 @@ void vp9_first_pass(VP9_COMP *cpi) {
// Experimental search in an older reference frame
if (cm->current_video_frame > 1) {
// Simple 0,0 motion with no mv overhead
- zz_motion_search(cpi, x, gld_yv12,
- &gf_motion_error, recon_yoffset);
+ gf_motion_error = zz_motion_search(cpi, x, gld_yv12, recon_yoffset);
first_pass_motion_search(cpi, x, &zero_ref_mv,
&tmp_mv.as_mv, gld_yv12,
&gf_motion_error, recon_yoffset);
- if (cpi->sf.variance_adaptive_quantization) {
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_clear_system_state(); // __asm emms;
gf_motion_error *= error_weight;
}
@@ -699,11 +716,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
// very close and very low. This helps with scene cut
// detection for example in cropped clips with black bars
// at the sides or top and bottom.
- if ((((this_error - intrapenalty) * 9) <=
- (motion_error * 10)) &&
- (this_error < (2 * intrapenalty))) {
+ if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+ this_error < 2 * intrapenalty)
neutral_count++;
- }
mv.as_mv.row *= 8;
mv.as_mv.col *= 8;
@@ -712,8 +727,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
xd->mi_8x8[0]->mbmi.ref_frame[0] = LAST_FRAME;
xd->mi_8x8[0]->mbmi.ref_frame[1] = NONE;
- vp9_build_inter_predictors_sby(xd, mb_row << 1,
- mb_col << 1,
+ vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1,
xd->mi_8x8[0]->mbmi.sb_type);
vp9_encode_sby(x, xd->mi_8x8[0]->mbmi.sb_type);
sum_mvr += mv.as_mv.row;
@@ -916,11 +930,11 @@ static int64_t estimate_modemvcost(VP9_COMP *cpi,
intra_cost = bitcost(av_intra);
// Estimate of extra bits per mv overhead for mbs
- // << 9 is the normalization to the (bits * 512) used in vp9_bits_per_mb
+ // << 9 is the normalization to the (bits * 512) used in vp9_rc_bits_per_mb
mv_cost = ((int)(fpstats->new_mv_count / fpstats->count) * 8) << 9;
// Crude estimate of overhead cost from modes
- // << 9 is the normalization to (bits * 512) used in vp9_bits_per_mb
+ // << 9 is the normalization to (bits * 512) used in vp9_rc_bits_per_mb
mode_cost =
(int)((((av_pct_inter - av_pct_motion) * zz_cost) +
(av_pct_motion * motion_cost) +
@@ -956,19 +970,19 @@ static double calc_correction_factor(double err_per_mb,
// (now uses the actual quantizer) but has not been tuned.
static void adjust_maxq_qrange(VP9_COMP *cpi) {
int i;
- // Set the max corresponding to cpi->avg_q * 2.0
- double q = cpi->avg_q * 2.0;
- cpi->twopass.maxq_max_limit = cpi->worst_quality;
- for (i = cpi->best_quality; i <= cpi->worst_quality; i++) {
+ // Set the max corresponding to cpi->rc.avg_q * 2.0
+ double q = cpi->rc.avg_q * 2.0;
+ cpi->twopass.maxq_max_limit = cpi->rc.worst_quality;
+ for (i = cpi->rc.best_quality; i <= cpi->rc.worst_quality; i++) {
cpi->twopass.maxq_max_limit = i;
if (vp9_convert_qindex_to_q(i) >= q)
break;
}
- // Set the min corresponding to cpi->avg_q * 0.5
- q = cpi->avg_q * 0.5;
- cpi->twopass.maxq_min_limit = cpi->best_quality;
- for (i = cpi->worst_quality; i >= cpi->best_quality; i--) {
+ // Set the min corresponding to cpi->rc.avg_q * 0.5
+ q = cpi->rc.avg_q * 0.5;
+ cpi->twopass.maxq_min_limit = cpi->rc.best_quality;
+ for (i = cpi->rc.worst_quality; i >= cpi->rc.best_quality; i--) {
cpi->twopass.maxq_min_limit = i;
if (vp9_convert_qindex_to_q(i) <= q)
break;
@@ -983,10 +997,8 @@ static int estimate_max_q(VP9_COMP *cpi,
int target_norm_bits_per_mb;
double section_err = fpstats->coded_error / fpstats->count;
- double sr_correction;
double err_per_mb = section_err / num_mbs;
double err_correction_factor;
- double speed_correction = 1.0;
if (section_target_bandwitdh <= 0)
return cpi->twopass.maxq_max_limit; // Highest value allowed
@@ -995,40 +1007,6 @@ static int estimate_max_q(VP9_COMP *cpi,
? (512 * section_target_bandwitdh) / num_mbs
: 512 * (section_target_bandwitdh / num_mbs);
- // Look at the drop in prediction quality between the last frame
- // and the GF buffer (which contained an older frame).
- if (fpstats->sr_coded_error > fpstats->coded_error) {
- double sr_err_diff = (fpstats->sr_coded_error - fpstats->coded_error) /
- (fpstats->count * cpi->common.MBs);
- sr_correction = fclamp(pow(sr_err_diff / 32.0, 0.25), 0.75, 1.25);
- } else {
- sr_correction = 0.75;
- }
-
- // Calculate a corrective factor based on a rolling ratio of bits spent
- // vs target bits
- if (cpi->rolling_target_bits > 0 &&
- cpi->active_worst_quality < cpi->worst_quality) {
- double rolling_ratio = (double)cpi->rolling_actual_bits /
- (double)cpi->rolling_target_bits;
-
- if (rolling_ratio < 0.95)
- cpi->twopass.est_max_qcorrection_factor -= 0.005;
- else if (rolling_ratio > 1.05)
- cpi->twopass.est_max_qcorrection_factor += 0.005;
-
- cpi->twopass.est_max_qcorrection_factor = fclamp(
- cpi->twopass.est_max_qcorrection_factor, 0.1, 10.0);
- }
-
- // Corrections for higher compression speed settings
- // (reduced compression expected)
- // FIXME(jimbankoski): Once we settle on vp9 speed features we need to
- // change this code.
- if (cpi->compressor_speed == 1)
- speed_correction = cpi->oxcf.cpu_used <= 5 ?
- 1.04 + (/*cpi->oxcf.cpu_used*/0 * 0.04) :
- 1.25;
// Try and pick a max Q that will be high enough to encode the
// content at the given rate.
@@ -1036,12 +1014,10 @@ static int estimate_max_q(VP9_COMP *cpi,
int bits_per_mb_at_this_q;
err_correction_factor = calc_correction_factor(err_per_mb,
- ERR_DIVISOR, 0.4, 0.90, q) *
- sr_correction * speed_correction *
- cpi->twopass.est_max_qcorrection_factor;
+ ERR_DIVISOR, 0.4, 0.90, q);
- bits_per_mb_at_this_q = vp9_bits_per_mb(INTER_FRAME, q,
- err_correction_factor);
+ bits_per_mb_at_this_q = vp9_rc_bits_per_mb(INTER_FRAME, q,
+ err_correction_factor);
if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
break;
@@ -1052,14 +1028,6 @@ static int estimate_max_q(VP9_COMP *cpi,
q < cpi->cq_target_quality)
q = cpi->cq_target_quality;
- // Adjust maxq_min_limit and maxq_max_limit limits based on
- // average q observed in clip for non kf/gf/arf frames
- // Give average a chance to settle though.
- // PGW TODO.. This code is broken for the extended Q range
- if (cpi->ni_frames > ((int)cpi->twopass.total_stats.count >> 8) &&
- cpi->ni_frames > 25)
- adjust_maxq_qrange(cpi);
-
return q;
}
@@ -1075,9 +1043,6 @@ static int estimate_cq(VP9_COMP *cpi,
double section_err = (fpstats->coded_error / fpstats->count);
double err_per_mb = section_err / num_mbs;
double err_correction_factor;
- double sr_err_diff;
- double sr_correction;
- double speed_correction = 1.0;
double clip_iiratio;
double clip_iifactor;
@@ -1086,31 +1051,6 @@ static int estimate_cq(VP9_COMP *cpi,
: 512 * (section_target_bandwitdh / num_mbs);
- // Corrections for higher compression speed settings
- // (reduced compression expected)
- if (cpi->compressor_speed == 1) {
- if (cpi->oxcf.cpu_used <= 5)
- speed_correction = 1.04 + (/*cpi->oxcf.cpu_used*/ 0 * 0.04);
- else
- speed_correction = 1.25;
- }
-
- // Look at the drop in prediction quality between the last frame
- // and the GF buffer (which contained an older frame).
- if (fpstats->sr_coded_error > fpstats->coded_error) {
- sr_err_diff =
- (fpstats->sr_coded_error - fpstats->coded_error) /
- (fpstats->count * cpi->common.MBs);
- sr_correction = (sr_err_diff / 32.0);
- sr_correction = pow(sr_correction, 0.25);
- if (sr_correction < 0.75)
- sr_correction = 0.75;
- else if (sr_correction > 1.25)
- sr_correction = 1.25;
- } else {
- sr_correction = 0.75;
- }
-
// II ratio correction factor for clip as a whole
clip_iiratio = cpi->twopass.total_stats.intra_error /
DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.coded_error);
@@ -1124,11 +1064,10 @@ static int estimate_cq(VP9_COMP *cpi,
// Error per MB based correction factor
err_correction_factor =
- calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, q) *
- sr_correction * speed_correction * clip_iifactor;
+ calc_correction_factor(err_per_mb, 100.0, 0.4, 0.90, q) * clip_iifactor;
bits_per_mb_at_this_q =
- vp9_bits_per_mb(INTER_FRAME, q, err_correction_factor);
+ vp9_rc_bits_per_mb(INTER_FRAME, q, err_correction_factor);
if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
break;
@@ -1136,10 +1075,10 @@ static int estimate_cq(VP9_COMP *cpi,
// Clip value to range "best allowed to (worst allowed - 1)"
q = select_cq_level(q);
- if (q >= cpi->worst_quality)
- q = cpi->worst_quality - 1;
- if (q < cpi->best_quality)
- q = cpi->best_quality;
+ if (q >= cpi->rc.worst_quality)
+ q = cpi->rc.worst_quality - 1;
+ if (q < cpi->rc.best_quality)
+ q = cpi->rc.best_quality;
return q;
}
@@ -1589,13 +1528,13 @@ void define_fixed_arf_period(VP9_COMP *cpi) {
if (cpi->twopass.frames_to_key <= (FIXED_ARF_GROUP_SIZE + 8)) {
// Setup a GF group close to the keyframe.
cpi->source_alt_ref_pending = 0;
- cpi->baseline_gf_interval = cpi->twopass.frames_to_key;
- schedule_frames(cpi, 0, (cpi->baseline_gf_interval - 1), 2, 0, 0);
+ cpi->rc.baseline_gf_interval = cpi->twopass.frames_to_key;
+ schedule_frames(cpi, 0, (cpi->rc.baseline_gf_interval - 1), 2, 0, 0);
} else {
// Setup a fixed period ARF group.
cpi->source_alt_ref_pending = 1;
- cpi->baseline_gf_interval = FIXED_ARF_GROUP_SIZE;
- schedule_frames(cpi, 0, -(cpi->baseline_gf_interval - 1), 2, 1, 0);
+ cpi->rc.baseline_gf_interval = FIXED_ARF_GROUP_SIZE;
+ schedule_frames(cpi, 0, -(cpi->rc.baseline_gf_interval - 1), 2, 1, 0);
}
// Replace level indicator of -1 with correct level.
@@ -1692,10 +1631,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// At high Q when there are few bits to spare we are better with a longer
// interval to spread the cost of the GF.
active_max_gf_interval =
- 12 + ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 5);
+ 12 + ((int)vp9_convert_qindex_to_q(cpi->rc.active_worst_quality) >> 5);
- if (active_max_gf_interval > cpi->max_gf_interval)
- active_max_gf_interval = cpi->max_gf_interval;
+ if (active_max_gf_interval > cpi->rc.max_gf_interval)
+ active_max_gf_interval = cpi->rc.max_gf_interval;
i = 0;
while (((i < cpi->twopass.static_scene_max_gf_interval) ||
@@ -1789,7 +1728,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
// Set the interval until the next gf or arf.
- cpi->baseline_gf_interval = i;
+ cpi->rc.baseline_gf_interval = i;
#if CONFIG_MULTIPLE_ARF
if (cpi->multi_arf_enabled) {
@@ -1815,24 +1754,25 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
(mv_in_out_accumulator > -2.0)) &&
(boost_score > 100)) {
// Alternative boost calculation for alt ref
- cpi->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+ cpi->rc.gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
&b_boost);
cpi->source_alt_ref_pending = 1;
#if CONFIG_MULTIPLE_ARF
// Set the ARF schedule.
if (cpi->multi_arf_enabled) {
- schedule_frames(cpi, 0, -(cpi->baseline_gf_interval - 1), 2, 1, 0);
+ schedule_frames(cpi, 0, -(cpi->rc.baseline_gf_interval - 1), 2, 1, 0);
}
#endif
} else {
- cpi->gfu_boost = (int)boost_score;
+ cpi->rc.gfu_boost = (int)boost_score;
cpi->source_alt_ref_pending = 0;
#if CONFIG_MULTIPLE_ARF
// Set the GF schedule.
if (cpi->multi_arf_enabled) {
- schedule_frames(cpi, 0, cpi->baseline_gf_interval - 1, 2, 0, 0);
- assert(cpi->new_frame_coding_order_period == cpi->baseline_gf_interval);
+ schedule_frames(cpi, 0, cpi->rc.baseline_gf_interval - 1, 2, 0, 0);
+ assert(cpi->new_frame_coding_order_period ==
+ cpi->rc.baseline_gf_interval);
}
#endif
}
@@ -1905,8 +1845,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Clip cpi->twopass.gf_group_bits based on user supplied data rate
// variability limit (cpi->oxcf.two_pass_vbrmax_section)
if (cpi->twopass.gf_group_bits >
- (int64_t)max_bits * cpi->baseline_gf_interval)
- cpi->twopass.gf_group_bits = (int64_t)max_bits * cpi->baseline_gf_interval;
+ (int64_t)max_bits * cpi->rc.baseline_gf_interval)
+ cpi->twopass.gf_group_bits =
+ (int64_t)max_bits * cpi->rc.baseline_gf_interval;
// Reset the file position
reset_fpf_position(cpi, start_pos);
@@ -1919,19 +1860,18 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
i <= (cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME);
++i) {
int allocation_chunks;
- int q = cpi->oxcf.fixed_q < 0 ? cpi->last_q[INTER_FRAME]
- : cpi->oxcf.fixed_q;
+ int q = cpi->rc.last_q[INTER_FRAME];
int gf_bits;
- int boost = (cpi->gfu_boost * vp9_gfboost_qadjust(q)) / 100;
+ int boost = (cpi->rc.gfu_boost * gfboost_qadjust(q)) / 100;
// Set max and minimum boost and hence minimum allocation
- boost = clamp(boost, 125, (cpi->baseline_gf_interval + 1) * 200);
+ boost = clamp(boost, 125, (cpi->rc.baseline_gf_interval + 1) * 200);
if (cpi->source_alt_ref_pending && i == 0)
- allocation_chunks = ((cpi->baseline_gf_interval + 1) * 100) + boost;
+ allocation_chunks = ((cpi->rc.baseline_gf_interval + 1) * 100) + boost;
else
- allocation_chunks = (cpi->baseline_gf_interval * 100) + (boost - 100);
+ allocation_chunks = (cpi->rc.baseline_gf_interval * 100) + (boost - 100);
// Prevent overflow
if (boost > 1023) {
@@ -1948,10 +1888,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// If the frame that is to be boosted is simpler than the average for
// the gf/arf group then use an alternative calculation
// based on the error score of the frame itself
- if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval) {
+ if (mod_frame_err < gf_group_err / (double)cpi->rc.baseline_gf_interval) {
double alt_gf_grp_bits =
(double)cpi->twopass.kf_group_bits *
- (mod_frame_err * (double)cpi->baseline_gf_interval) /
+ (mod_frame_err * (double)cpi->rc.baseline_gf_interval) /
DOUBLE_DIVIDE_CHECK(cpi->twopass.kf_group_error_left);
int alt_gf_bits = (int)((double)boost * (alt_gf_grp_bits /
@@ -1976,7 +1916,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
gf_bits = 0;
// Add in minimum for a frame
- gf_bits += cpi->min_frame_bandwidth;
+ gf_bits += cpi->rc.min_frame_bandwidth;
if (i == 0) {
cpi->twopass.gf_bits = gf_bits;
@@ -1984,7 +1924,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
if (i == 1 || (!cpi->source_alt_ref_pending
&& (cpi->common.frame_type != KEY_FRAME))) {
// Per frame bit target for this frame
- cpi->per_frame_bandwidth = gf_bits;
+ cpi->rc.per_frame_bandwidth = gf_bits;
}
}
@@ -2007,7 +1947,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->twopass.gf_group_error_left = (int64_t)gf_group_err;
cpi->twopass.gf_group_bits -= cpi->twopass.gf_bits
- - cpi->min_frame_bandwidth;
+ - cpi->rc.min_frame_bandwidth;
if (cpi->twopass.gf_group_bits < 0)
cpi->twopass.gf_group_bits = 0;
@@ -2015,8 +1955,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// This condition could fail if there are two kfs very close together
// despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the
// calculation of alt_extra_bits.
- if (cpi->baseline_gf_interval >= 3) {
- const int boost = cpi->source_alt_ref_pending ? b_boost : cpi->gfu_boost;
+ if (cpi->rc.baseline_gf_interval >= 3) {
+ const int boost = cpi->source_alt_ref_pending ?
+ b_boost : cpi->rc.gfu_boost;
if (boost >= 150) {
int alt_extra_bits;
@@ -2035,7 +1976,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
zero_stats(&sectionstats);
reset_fpf_position(cpi, start_pos);
- for (i = 0; i < cpi->baseline_gf_interval; i++) {
+ for (i = 0; i < cpi->rc.baseline_gf_interval; i++) {
input_stats(cpi, &next_frame);
accumulate_stats(&sectionstats, &next_frame);
}
@@ -2092,10 +2033,10 @@ static void assign_std_frame_bits(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->twopass.gf_group_bits = 0;
// Add in the minimum number of bits that is set aside for every frame.
- target_frame_size += cpi->min_frame_bandwidth;
+ target_frame_size += cpi->rc.min_frame_bandwidth;
// Per frame bit target for this frame.
- cpi->per_frame_bandwidth = target_frame_size;
+ cpi->rc.per_frame_bandwidth = target_frame_size;
}
// Make a damped adjustment to the active max q.
@@ -2135,74 +2076,29 @@ void vp9_second_pass(VP9_COMP *cpi) {
vp9_clear_system_state();
if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
- cpi->active_worst_quality = cpi->oxcf.cq_level;
- } else {
+ cpi->rc.active_worst_quality = cpi->oxcf.cq_level;
+ } else if (cpi->common.current_video_frame == 0) {
// Special case code for first frame.
- if (cpi->common.current_video_frame == 0) {
- int section_target_bandwidth =
- (int)(cpi->twopass.bits_left / frames_left);
- cpi->twopass.est_max_qcorrection_factor = 1.0;
-
- // Set a cq_level in constrained quality mode.
- // Commenting this code out for now since it does not seem to be
- // working well.
- /*
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- int est_cq = estimate_cq(cpi, &cpi->twopass.total_left_stats,
- section_target_bandwidth);
-
- if (est_cq > cpi->cq_target_quality)
- cpi->cq_target_quality = est_cq;
- else
- cpi->cq_target_quality = cpi->oxcf.cq_level;
- }
- */
-
- // guess at maxq needed in 2nd pass
- cpi->twopass.maxq_max_limit = cpi->worst_quality;
- cpi->twopass.maxq_min_limit = cpi->best_quality;
-
- tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
- section_target_bandwidth);
-
- cpi->active_worst_quality = tmp_q;
- cpi->ni_av_qi = tmp_q;
- cpi->avg_q = vp9_convert_qindex_to_q(tmp_q);
-
-#ifndef ONE_SHOT_Q_ESTIMATE
- // Limit the maxq value returned subsequently.
- // This increases the risk of overspend or underspend if the initial
- // estimate for the clip is bad, but helps prevent excessive
- // variation in Q, especially near the end of a clip
- // where for example a small overspend may cause Q to crash
- adjust_maxq_qrange(cpi);
-#endif
- }
+ int section_target_bandwidth =
+ (int)(cpi->twopass.bits_left / frames_left);
-#ifndef ONE_SHOT_Q_ESTIMATE
- // The last few frames of a clip almost always have to few or too many
- // bits and for the sake of over exact rate control we dont want to make
- // radical adjustments to the allowed quantizer range just to use up a
- // few surplus bits or get beneath the target rate.
- else if ((cpi->common.current_video_frame <
- (((unsigned int)cpi->twopass.total_stats.count * 255) >> 8)) &&
- ((cpi->common.current_video_frame + cpi->baseline_gf_interval) <
- (unsigned int)cpi->twopass.total_stats.count)) {
- int section_target_bandwidth =
- (int)(cpi->twopass.bits_left / frames_left);
- if (frames_left < 1)
- frames_left = 1;
-
- tmp_q = estimate_max_q(
- cpi,
- &cpi->twopass.total_left_stats,
- section_target_bandwidth);
-
- // Make a damped adjustment to active max Q
- cpi->active_worst_quality =
- adjust_active_maxq(cpi->active_worst_quality, tmp_q);
- }
-#endif
+ // guess at maxq needed in 2nd pass
+ cpi->twopass.maxq_max_limit = cpi->rc.worst_quality;
+ cpi->twopass.maxq_min_limit = cpi->rc.best_quality;
+
+ tmp_q = estimate_max_q(cpi, &cpi->twopass.total_left_stats,
+ section_target_bandwidth);
+
+ cpi->rc.active_worst_quality = tmp_q;
+ cpi->rc.ni_av_qi = tmp_q;
+ cpi->rc.avg_q = vp9_convert_qindex_to_q(tmp_q);
+
+ // Limit the maxq value returned subsequently.
+ // This increases the risk of overspend or underspend if the initial
+ // estimate for the clip is bad, but helps prevent excessive
+ // variation in Q, especially near the end of a clip
+ // where for example a small overspend may cause Q to crash
+ adjust_maxq_qrange(cpi);
}
vp9_zero(this_frame);
if (EOF == input_stats(cpi, &this_frame))
@@ -2219,7 +2115,7 @@ void vp9_second_pass(VP9_COMP *cpi) {
}
// Is this a GF / ARF (Note that a KF is always also a GF)
- if (cpi->frames_till_gf_update_due == 0) {
+ if (cpi->rc.frames_till_gf_update_due == 0) {
// Define next gf group and assign bits to it
this_frame_copy = this_frame;
@@ -2253,10 +2149,10 @@ void vp9_second_pass(VP9_COMP *cpi) {
if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME)) {
// Assign a standard frames worth of bits from those allocated
// to the GF group
- int bak = cpi->per_frame_bandwidth;
+ int bak = cpi->rc.per_frame_bandwidth;
this_frame_copy = this_frame;
assign_std_frame_bits(cpi, &this_frame_copy);
- cpi->per_frame_bandwidth = bak;
+ cpi->rc.per_frame_bandwidth = bak;
}
} else {
// Otherwise this is an ordinary frame
@@ -2277,7 +2173,7 @@ void vp9_second_pass(VP9_COMP *cpi) {
}
// Set nominal per second bandwidth for this frame
- cpi->target_bandwidth = (int)(cpi->per_frame_bandwidth
+ cpi->target_bandwidth = (int)(cpi->rc.per_frame_bandwidth
* cpi->output_framerate);
if (cpi->target_bandwidth < 0)
cpi->target_bandwidth = 0;
@@ -2410,7 +2306,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->source_alt_ref_active = 0;
// Kf is always a gf so clear frames till next gf counter
- cpi->frames_till_gf_update_due = 0;
+ cpi->rc.frames_till_gf_update_due = 0;
cpi->twopass.frames_to_key = 1;
@@ -2573,7 +2469,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
// For the first few frames collect data to decide kf boost.
- if (i <= (cpi->max_gf_interval * 2)) {
+ if (i <= (cpi->rc.max_gf_interval * 2)) {
if (next_frame.intra_error > cpi->twopass.kf_intra_err_min)
r = (IIKFACTOR2 * next_frame.intra_error /
DOUBLE_DIVIDE_CHECK(next_frame.coded_error));
@@ -2631,7 +2527,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Make a note of baseline boost and the zero motion
// accumulator value for use elsewhere.
- cpi->kf_boost = kf_boost;
+ cpi->rc.kf_boost = kf_boost;
cpi->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
// We do three calculations for kf size.
@@ -2701,10 +2597,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits;
// Add in the minimum frame allowance
- cpi->twopass.kf_bits += cpi->min_frame_bandwidth;
+ cpi->twopass.kf_bits += cpi->rc.min_frame_bandwidth;
// Peer frame bit target for this frame
- cpi->per_frame_bandwidth = cpi->twopass.kf_bits;
+ cpi->rc.per_frame_bandwidth = cpi->twopass.kf_bits;
// Convert to a per second bitrate
cpi->target_bandwidth = (int)(cpi->twopass.kf_bits *
cpi->output_framerate);
@@ -2718,3 +2614,21 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// sizes.
cpi->twopass.modified_error_left -= kf_group_err;
}
+
+void vp9_twopass_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
+#ifdef DISABLE_RC_LONG_TERM_MEM
+ cpi->twopass.bits_left -= cpi->rc.this_frame_target;
+#else
+ cpi->twopass.bits_left -= 8 * bytes_used;
+#endif
+ if (!cpi->refresh_alt_ref_frame) {
+ double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
+ double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section
+ / 100);
+ if (two_pass_min_rate < lower_bounds_min_rate)
+ two_pass_min_rate = lower_bounds_min_rate;
+ cpi->twopass.bits_left += (int64_t)(two_pass_min_rate /
+ cpi->oxcf.framerate);
+ }
+}
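Note on the vp9_firstpass.c changes above: the speed and second-reference correction factors are dropped from the max-Q estimate, and the GF/KF boost adjustment becomes a local cubic polynomial in the real quantizer value, while frame_max_bits() now clamps its floating-point budget into [0, INT_MAX] instead of only trapping the negative case. A minimal sketch of that arithmetic, assuming the caller already has the quantizer q that vp9_convert_qindex_to_q() would return (the polynomial coefficients are the ones introduced in the diff; everything else is illustrative only):

/* boost_sketch.c - GF boost adjustment and max-bits clamping */
#include <limits.h>
#include <stdio.h>

/* Cubic fit used for the golden-frame boost, as in gfboost_qadjust(). */
static int gfboost_from_q(double q) {
  return (int)(0.00000828 * q * q * q - 0.0055 * q * q + 1.32 * q + 79.3);
}

/* frame_max_bits() now clamps the per-frame budget into [0, INT_MAX]. */
static int clamp_max_bits(double max_bits) {
  if (max_bits < 0)
    return 0;
  if (max_bits >= INT_MAX)
    return INT_MAX;
  return (int)max_bits;
}

int main(void) {
  printf("boost(q=40): %d\n", gfboost_from_q(40.0));   /* prints 123 */
  printf("clamped:     %d\n", clamp_max_bits(3.5e12)); /* prints INT_MAX */
  return 0;
}

The zz_motion_search() change in the same file follows the same simplification pattern: rather than writing through an output pointer, it returns the block MSE directly, so the caller can seed motion_error in a single expression.
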
diff --git a/source/libvpx/vp9/encoder/vp9_lookahead.c b/source/libvpx/vp9/encoder/vp9_lookahead.c
index c28c868..277bd7d 100644
--- a/source/libvpx/vp9/encoder/vp9_lookahead.c
+++ b/source/libvpx/vp9/encoder/vp9_lookahead.c
@@ -12,8 +12,8 @@
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_extend.h"
#include "vp9/encoder/vp9_lookahead.h"
-#include "vp9/common/vp9_extend.h"
struct lookahead_ctx {
unsigned int max_sz; /* Absolute size of the queue */
diff --git a/source/libvpx/vp9/encoder/vp9_mbgraph.c b/source/libvpx/vp9/encoder/vp9_mbgraph.c
index 7b605b2..e2ef256 100644
--- a/source/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/source/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -11,7 +11,6 @@
#include <limits.h>
#include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_encodeintra.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_segmentation.h"
#include "vp9/encoder/vp9_mcomp.h"
@@ -43,7 +42,7 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
(cpi->speed < 8 ? (cpi->speed > 5 ? 1 : 0) : 2);
step_param = MIN(step_param, (cpi->sf.max_step_search_steps - 2));
- vp9_clamp_mv_min_max(x, &ref_mv->as_mv);
+ vp9_set_mv_search_range(x, &ref_mv->as_mv);
ref_full.as_mv.col = ref_mv->as_mv.col >> 3;
ref_full.as_mv.row = ref_mv->as_mv.row >> 3;
@@ -324,8 +323,8 @@ static void separate_arf_mbs(VP9_COMP *cpi) {
1));
// We are not interested in results beyond the alt ref itself.
- if (n_frames > cpi->frames_till_gf_update_due)
- n_frames = cpi->frames_till_gf_update_due;
+ if (n_frames > cpi->rc.frames_till_gf_update_due)
+ n_frames = cpi->rc.frames_till_gf_update_due;
// defer cost to reference frames
for (i = n_frames - 1; i >= 0; i--) {
@@ -397,7 +396,7 @@ void vp9_update_mbgraph_stats(VP9_COMP *cpi) {
// we need to look ahead beyond where the ARF transitions into
// being a GF - so exit if we don't look ahead beyond that
- if (n_frames <= cpi->frames_till_gf_update_due)
+ if (n_frames <= cpi->rc.frames_till_gf_update_due)
return;
if (n_frames > (int)cpi->frames_till_alt_ref_frame)
n_frames = cpi->frames_till_alt_ref_frame;
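Note on the vp9_mcomp.c hunk that follows: it adds vp9_full_range_search_c(), an exhaustive scan over a ±64 window around the clamped reference MV that keeps the lowest rate-adjusted SAD and, where the sdx4df function pointer is available, tests four horizontally adjacent candidates per call. The toy program below reproduces only the scan-and-keep-best skeleton on synthetic data; the rate cost (mvsad_err_cost), the clamping against the mv_row/col limits, and the vectorised four-wide fast path are all omitted, and every name and constant in it is invented for the illustration.

/* range_search_sketch.c - brute-force windowed SAD search, toy version */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum { B = 8, W = 64, H = 64, RANGE = 4 };

/* Plain 8x8 sum of absolute differences, shared stride for simplicity. */
static int sad8x8(const unsigned char *src, const unsigned char *ref,
                  int stride) {
  int r, c, sad = 0;
  for (r = 0; r < B; ++r)
    for (c = 0; c < B; ++c)
      sad += abs(src[r * stride + c] - ref[r * stride + c]);
  return sad;
}

int main(void) {
  static unsigned char src[H * W], ref[H * W];
  int tr, tc, best_tr = 0, best_tc = 0, best = INT_MAX;
  const int row = 24, col = 24;  /* search centre, well inside the frame */
  int i;

  /* Synthetic data: ref is src shifted by (+2, -1), so the true offset
   * from src to ref is (2, -1). */
  for (i = 0; i < H * W; ++i)
    src[i] = (unsigned char)(rand() & 0xff);
  for (tr = 0; tr < H - 4; ++tr)
    memcpy(&ref[(tr + 2) * W], &src[tr * W + 1], W - 1);

  /* Scan the (2*RANGE+1)^2 window and keep the best candidate. */
  for (tr = -RANGE; tr <= RANGE; ++tr) {
    for (tc = -RANGE; tc <= RANGE; ++tc) {
      const int sad = sad8x8(&src[row * W + col],
                             &ref[(row + tr) * W + (col + tc)], W);
      if (sad < best) {
        best = sad;
        best_tr = tr;
        best_tc = tc;
      }
    }
  }
  printf("best offset: (%d, %d), sad %d\n", best_tr, best_tc, best);
  return 0;
}

The same hunk also converts the diamond and full-search entry points from int_mv to the plain MV struct, which is why the firstpass and full_pixel_diamond call sites above now pass &mv.as_mv rather than the union.
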
diff --git a/source/libvpx/vp9/encoder/vp9_mcomp.c b/source/libvpx/vp9/encoder/vp9_mcomp.c
index a52f5b1..87b5988 100644
--- a/source/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/source/libvpx/vp9/encoder/vp9_mcomp.c
@@ -24,7 +24,7 @@
// #define NEW_DIAMOND_SEARCH
-void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv) {
+void vp9_set_mv_search_range(MACROBLOCK *x, MV *mv) {
const int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0);
const int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0);
const int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL;
@@ -51,9 +51,6 @@ int vp9_init_search_range(VP9_COMP *cpi, int size) {
while ((size << sr) < MAX_FULL_PEL_VAL)
sr++;
- if (sr)
- sr--;
-
sr += cpi->sf.reduce_first_step_size;
sr = MIN(sr, (cpi->sf.max_step_search_steps - 2));
return sr;
@@ -1069,11 +1066,130 @@ int vp9_square_search(MACROBLOCK *x,
#undef CHECK_POINT
#undef CHECK_BETTER
+int vp9_full_range_search_c(MACROBLOCK *x, MV *ref_mv, MV *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
+ int *mvcost[2], const MV *center_mv) {
+ const MACROBLOCKD* const xd = &x->e_mbd;
+ uint8_t *what = x->plane[0].src.buf;
+ int what_stride = x->plane[0].src.stride;
+ uint8_t *in_what;
+ int in_what_stride = xd->plane[0].pre[0].stride;
+ uint8_t *best_address;
+
+ MV this_mv;
+
+ int bestsad = INT_MAX;
+ int ref_row, ref_col;
+
+ uint8_t *check_here;
+ int thissad;
+ MV fcenter_mv;
+
+ int *mvjsadcost = x->nmvjointsadcost;
+ int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+ int tr, tc;
+ int best_tr = 0;
+ int best_tc = 0;
+ int range = 64;
+
+ int start_col, end_col;
+ int start_row, end_row;
+ int i;
+
+ fcenter_mv.row = center_mv->row >> 3;
+ fcenter_mv.col = center_mv->col >> 3;
+
+ clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ ref_row = ref_mv->row;
+ ref_col = ref_mv->col;
+ *num00 = 11;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // Work out the start point for the search
+ in_what = (uint8_t *)(xd->plane[0].pre[0].buf +
+ (ref_row * (xd->plane[0].pre[0].stride)) + ref_col);
+ best_address = in_what;
+
+ // Check the starting position
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
+ + mvsad_err_cost(best_mv, &fcenter_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
+
+ start_row = MAX(-range, x->mv_row_min - ref_row);
+ start_col = MAX(-range, x->mv_col_min - ref_col);
+ end_row = MIN(range, x->mv_row_max - ref_row);
+ end_col = MIN(range, x->mv_col_max - ref_col);
+
+ for (tr = start_row; tr <= end_row; ++tr) {
+ for (tc = start_col; tc <= end_col; tc += 4) {
+ if ((tc + 3) <= end_col) {
+ unsigned int sad_array[4];
+ unsigned char const *addr_ref[4];
+ for (i = 0; i < 4; ++i)
+ addr_ref[i] = in_what + tr * in_what_stride + tc + i;
+
+ fn_ptr->sdx4df(what, what_stride, addr_ref, in_what_stride, sad_array);
+
+ for (i = 0; i < 4; ++i) {
+ if (sad_array[i] < bestsad) {
+ this_mv.row = ref_row + tr;
+ this_mv.col = ref_col + tc + i;
+ thissad = sad_array[i] +
+ mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_tr = tr;
+ best_tc = tc + i;
+ }
+ }
+ }
+ } else {
+ for (i = 0; i < end_col - tc; ++i) {
+ check_here = in_what + tr * in_what_stride + tc + i;
+ thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
+ bestsad);
+
+ if (thissad < bestsad) {
+ this_mv.row = ref_row + tr;
+ this_mv.col = ref_col + tc + i;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
+ mvjsadcost, mvsadcost, sad_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_tr = tr;
+ best_tc = tc + i;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ best_mv->row += best_tr;
+ best_mv->col += best_tc;
+
+ this_mv.row = best_mv->row * 8;
+ this_mv.col = best_mv->col * 8;
+
+ if (bestsad == INT_MAX)
+ return INT_MAX;
+
+ return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv, center_mv,
+ mvjcost, mvcost, x->errorperbit);
+}
+
int vp9_diamond_search_sad_c(MACROBLOCK *x,
- int_mv *ref_mv, int_mv *best_mv,
+ MV *ref_mv, MV *best_mv,
int search_param, int sad_per_bit, int *num00,
vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
- int *mvcost[2], int_mv *center_mv) {
+ int *mvcost[2], const MV *center_mv) {
int i, j, step;
const MACROBLOCKD* const xd = &x->e_mbd;
@@ -1084,7 +1200,7 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
uint8_t *best_address;
int tot_steps;
- int_mv this_mv;
+ MV this_mv;
int bestsad = INT_MAX;
int best_site = 0;
@@ -1101,25 +1217,24 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+ fcenter_mv.as_mv.row = center_mv->row >> 3;
+ fcenter_mv.as_mv.col = center_mv->col >> 3;
- clamp_mv(&ref_mv->as_mv,
- x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
- ref_row = ref_mv->as_mv.row;
- ref_col = ref_mv->as_mv.col;
+ clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ ref_row = ref_mv->row;
+ ref_col = ref_mv->col;
*num00 = 0;
- best_mv->as_mv.row = ref_row;
- best_mv->as_mv.col = ref_col;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
// Work out the start point for the search
in_what = (uint8_t *)(xd->plane[0].pre[0].buf +
- (ref_row * (xd->plane[0].pre[0].stride)) + ref_col);
+ ref_row * in_what_stride + ref_col);
best_address = in_what;
// Check the starting position
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ + mvsad_err_cost(best_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
// search_param determines the length of the initial step and hence the number
@@ -1134,8 +1249,8 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
for (step = 0; step < tot_steps; step++) {
for (j = 0; j < x->searches_per_step; j++) {
// Trap illegal vectors
- this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
- this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+ this_row_offset = best_mv->row + ss[i].mv.row;
+ this_col_offset = best_mv->col + ss[i].mv.col;
if ((this_col_offset > x->mv_col_min) &&
(this_col_offset < x->mv_col_max) &&
@@ -1146,9 +1261,9 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
bestsad);
if (thissad < bestsad) {
- this_mv.as_mv.row = this_row_offset;
- this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.row = this_row_offset;
+ this_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1162,14 +1277,14 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
}
if (best_site != last_site) {
- best_mv->as_mv.row += ss[best_site].mv.row;
- best_mv->as_mv.col += ss[best_site].mv.col;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
last_site = best_site;
#if defined(NEW_DIAMOND_SEARCH)
while (1) {
- this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row;
- this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col;
+ this_row_offset = best_mv->row + ss[best_site].mv.row;
+ this_col_offset = best_mv->col + ss[best_site].mv.col;
if ((this_col_offset > x->mv_col_min) &&
(this_col_offset < x->mv_col_max) &&
(this_row_offset > x->mv_row_min) &&
@@ -1178,14 +1293,14 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
bestsad);
if (thissad < bestsad) {
- this_mv.as_mv.row = this_row_offset;
- this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.row = this_row_offset;
+ this_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
- best_mv->as_mv.row += ss[best_site].mv.row;
- best_mv->as_mv.col += ss[best_site].mv.col;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
continue;
}
@@ -1199,23 +1314,24 @@ int vp9_diamond_search_sad_c(MACROBLOCK *x,
}
}
- this_mv.as_mv.row = best_mv->as_mv.row * 8;
- this_mv.as_mv.col = best_mv->as_mv.col * 8;
+ this_mv.row = best_mv->row * 8;
+ this_mv.col = best_mv->col * 8;
if (bestsad == INT_MAX)
return INT_MAX;
return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
(unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mv_err_cost(&this_mv, center_mv,
mvjcost, mvcost, x->errorperbit);
}
int vp9_diamond_search_sadx4(MACROBLOCK *x,
- int_mv *ref_mv, int_mv *best_mv, int search_param,
+ MV *ref_mv, MV *best_mv, int search_param,
int sad_per_bit, int *num00,
vp9_variance_fn_ptr_t *fn_ptr,
- int *mvjcost, int *mvcost[2], int_mv *center_mv) {
+ int *mvjcost, int *mvcost[2],
+ const MV *center_mv) {
int i, j, step;
const MACROBLOCKD* const xd = &x->e_mbd;
@@ -1226,7 +1342,7 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
uint8_t *best_address;
int tot_steps;
- int_mv this_mv;
+ MV this_mv;
unsigned int bestsad = INT_MAX;
int best_site = 0;
@@ -1245,25 +1361,24 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+ fcenter_mv.as_mv.row = center_mv->row >> 3;
+ fcenter_mv.as_mv.col = center_mv->col >> 3;
- clamp_mv(&ref_mv->as_mv,
- x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
- ref_row = ref_mv->as_mv.row;
- ref_col = ref_mv->as_mv.col;
+ clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+ ref_row = ref_mv->row;
+ ref_col = ref_mv->col;
*num00 = 0;
- best_mv->as_mv.row = ref_row;
- best_mv->as_mv.col = ref_col;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
// Work out the start point for the search
in_what = (uint8_t *)(xd->plane[0].pre[0].buf +
- (ref_row * (xd->plane[0].pre[0].stride)) + ref_col);
+ ref_row * in_what_stride + ref_col);
best_address = in_what;
// Check the starting position
bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ + mvsad_err_cost(best_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
// search_param determines the length of the initial step and hence the number
@@ -1281,10 +1396,10 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
// All_in is true if every one of the points we are checking are within
// the bounds of the image.
- all_in &= ((best_mv->as_mv.row + ss[i].mv.row) > x->mv_row_min);
- all_in &= ((best_mv->as_mv.row + ss[i + 1].mv.row) < x->mv_row_max);
- all_in &= ((best_mv->as_mv.col + ss[i + 2].mv.col) > x->mv_col_min);
- all_in &= ((best_mv->as_mv.col + ss[i + 3].mv.col) < x->mv_col_max);
+ all_in &= ((best_mv->row + ss[i].mv.row) > x->mv_row_min);
+ all_in &= ((best_mv->row + ss[i + 1].mv.row) < x->mv_row_max);
+ all_in &= ((best_mv->col + ss[i + 2].mv.col) > x->mv_col_min);
+ all_in &= ((best_mv->col + ss[i + 3].mv.col) < x->mv_col_max);
// If all the pixels are within the bounds we don't check whether the
// search point is valid in this loop, otherwise we check each point
@@ -1303,9 +1418,9 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
for (t = 0; t < 4; t++, i++) {
if (sad_array[t] < bestsad) {
- this_mv.as_mv.row = best_mv->as_mv.row + ss[i].mv.row;
- this_mv.as_mv.col = best_mv->as_mv.col + ss[i].mv.col;
- sad_array[t] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.row = best_mv->row + ss[i].mv.row;
+ this_mv.col = best_mv->col + ss[i].mv.col;
+ sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (sad_array[t] < bestsad) {
@@ -1318,8 +1433,8 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
} else {
for (j = 0; j < x->searches_per_step; j++) {
// Trap illegal vectors
- this_row_offset = best_mv->as_mv.row + ss[i].mv.row;
- this_col_offset = best_mv->as_mv.col + ss[i].mv.col;
+ this_row_offset = best_mv->row + ss[i].mv.row;
+ this_col_offset = best_mv->col + ss[i].mv.col;
if ((this_col_offset > x->mv_col_min) &&
(this_col_offset < x->mv_col_max) &&
@@ -1330,9 +1445,9 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
bestsad);
if (thissad < bestsad) {
- this_mv.as_mv.row = this_row_offset;
- this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.row = this_row_offset;
+ this_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1345,14 +1460,14 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
}
}
if (best_site != last_site) {
- best_mv->as_mv.row += ss[best_site].mv.row;
- best_mv->as_mv.col += ss[best_site].mv.col;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
last_site = best_site;
#if defined(NEW_DIAMOND_SEARCH)
while (1) {
- this_row_offset = best_mv->as_mv.row + ss[best_site].mv.row;
- this_col_offset = best_mv->as_mv.col + ss[best_site].mv.col;
+ this_row_offset = best_mv->row + ss[best_site].mv.row;
+ this_col_offset = best_mv->col + ss[best_site].mv.col;
if ((this_col_offset > x->mv_col_min) &&
(this_col_offset < x->mv_col_max) &&
(this_row_offset > x->mv_row_min) &&
@@ -1361,14 +1476,14 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
bestsad);
if (thissad < bestsad) {
- this_mv.as_mv.row = this_row_offset;
- this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.row = this_row_offset;
+ this_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv.as_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
bestsad = thissad;
- best_mv->as_mv.row += ss[best_site].mv.row;
- best_mv->as_mv.col += ss[best_site].mv.col;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
best_address += ss[best_site].offset;
continue;
}
@@ -1382,15 +1497,15 @@ int vp9_diamond_search_sadx4(MACROBLOCK *x,
}
}
- this_mv.as_mv.row = best_mv->as_mv.row * 8;
- this_mv.as_mv.col = best_mv->as_mv.col * 8;
+ this_mv.row = best_mv->row * 8;
+ this_mv.col = best_mv->col * 8;
if (bestsad == INT_MAX)
return INT_MAX;
return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
(unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mv_err_cost(&this_mv, center_mv,
mvjcost, mvcost, x->errorperbit);
}
@@ -1405,10 +1520,10 @@ int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
int_mv *ref_mv, int_mv *dst_mv) {
int_mv temp_mv;
int thissme, n, num00;
- int bestsme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
+ int bestsme = cpi->diamond_search_sad(x, &mvp_full->as_mv, &temp_mv.as_mv,
step_param, sadpb, &num00,
fn_ptr, x->nmvjointcost,
- x->mvcost, ref_mv);
+ x->mvcost, &ref_mv->as_mv);
dst_mv->as_int = temp_mv.as_int;
n = num00;
@@ -1425,10 +1540,10 @@ int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
if (num00) {
num00--;
} else {
- thissme = cpi->diamond_search_sad(x, mvp_full, &temp_mv,
+ thissme = cpi->diamond_search_sad(x, &mvp_full->as_mv, &temp_mv.as_mv,
step_param + n, sadpb, &num00,
fn_ptr, x->nmvjointcost, x->mvcost,
- ref_mv);
+ &ref_mv->as_mv);
/* check to see if refining search is needed. */
if (num00 > (further_steps - n))
@@ -1446,9 +1561,9 @@ int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
int search_range = 8;
int_mv best_mv;
best_mv.as_int = dst_mv->as_int;
- thissme = cpi->refining_search_sad(x, &best_mv, sadpb, search_range,
+ thissme = cpi->refining_search_sad(x, &best_mv.as_mv, sadpb, search_range,
fn_ptr, x->nmvjointcost, x->mvcost,
- ref_mv);
+ &ref_mv->as_mv);
if (thissme < bestsme) {
bestsme = thissme;
@@ -1458,11 +1573,11 @@ int vp9_full_pixel_diamond(VP9_COMP *cpi, MACROBLOCK *x,
return bestsme;
}
-int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
+int vp9_full_search_sad_c(MACROBLOCK *x, MV *ref_mv,
int sad_per_bit, int distance,
vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
int *mvcost[2],
- int_mv *center_mv, int n) {
+ const MV *center_mv, int n) {
const MACROBLOCKD* const xd = &x->e_mbd;
uint8_t *what = x->plane[0].src.buf;
int what_stride = x->plane[0].src.stride;
@@ -1471,27 +1586,27 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
int mv_stride = xd->plane[0].pre[0].stride;
uint8_t *bestaddress;
int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0];
- int_mv this_mv;
+ MV this_mv;
int bestsad = INT_MAX;
int r, c;
uint8_t *check_here;
int thissad;
- int ref_row = ref_mv->as_mv.row;
- int ref_col = ref_mv->as_mv.col;
+ int ref_row = ref_mv->row;
+ int ref_col = ref_mv->col;
int row_min = ref_row - distance;
int row_max = ref_row + distance;
int col_min = ref_col - distance;
int col_max = ref_col + distance;
- int_mv fcenter_mv;
+ MV fcenter_mv;
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+ fcenter_mv.row = center_mv->row >> 3;
+ fcenter_mv.col = center_mv->col >> 3;
// Work out the mid point for the search
in_what = xd->plane[0].pre[0].buf;
@@ -1503,7 +1618,7 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride, bestaddress,
in_what_stride, 0x7fffffff)
- + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv,
mvjsadcost, mvsadcost, sad_per_bit);
// Apply further limits to prevent us looking using vectors that stretch
@@ -1514,15 +1629,15 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
row_max = MIN(row_max, x->mv_row_max);
for (r = row_min; r < row_max; r++) {
- this_mv.as_mv.row = r;
+ this_mv.row = r;
check_here = r * mv_stride + in_what + col_min;
for (c = col_min; c < col_max; c++) {
thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride,
bestsad);
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1536,22 +1651,22 @@ int vp9_full_search_sad_c(MACROBLOCK *x, int_mv *ref_mv,
}
}
- this_mv.as_mv.row = best_mv->as_mv.row * 8;
- this_mv.as_mv.col = best_mv->as_mv.col * 8;
+ this_mv.row = best_mv->as_mv.row * 8;
+ this_mv.col = best_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
(unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mv_err_cost(&this_mv, center_mv,
mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
-int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
+int vp9_full_search_sadx3(MACROBLOCK *x, MV *ref_mv,
int sad_per_bit, int distance,
vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost,
- int *mvcost[2], int_mv *center_mv, int n) {
+ int *mvcost[2], const MV *center_mv, int n) {
const MACROBLOCKD* const xd = &x->e_mbd;
uint8_t *what = x->plane[0].src.buf;
int what_stride = x->plane[0].src.stride;
@@ -1560,15 +1675,15 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
int mv_stride = xd->plane[0].pre[0].stride;
uint8_t *bestaddress;
int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0];
- int_mv this_mv;
+ MV this_mv;
unsigned int bestsad = INT_MAX;
int r, c;
uint8_t *check_here;
unsigned int thissad;
- int ref_row = ref_mv->as_mv.row;
- int ref_col = ref_mv->as_mv.col;
+ int ref_row = ref_mv->row;
+ int ref_col = ref_mv->col;
int row_min = ref_row - distance;
int row_max = ref_row + distance;
@@ -1576,13 +1691,13 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
int col_max = ref_col + distance;
unsigned int sad_array[3];
- int_mv fcenter_mv;
+ MV fcenter_mv;
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+ fcenter_mv.row = center_mv->row >> 3;
+ fcenter_mv.col = center_mv->col >> 3;
// Work out the mid point for the search
in_what = xd->plane[0].pre[0].buf;
@@ -1594,7 +1709,7 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride,
bestaddress, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv,
mvjsadcost, mvsadcost, sad_per_bit);
// Apply further limits to prevent us looking using vectors that stretch
@@ -1605,11 +1720,11 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
row_max = MIN(row_max, x->mv_row_max);
for (r = row_min; r < row_max; r++) {
- this_mv.as_mv.row = r;
+ this_mv.row = r;
check_here = r * mv_stride + in_what + col_min;
c = col_min;
- while ((c + 2) < col_max) {
+ while ((c + 2) < col_max && fn_ptr->sdx3f != NULL) {
int i;
fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
@@ -1618,8 +1733,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
thissad = sad_array[i];
if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1640,8 +1755,8 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
bestsad);
if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1657,23 +1772,23 @@ int vp9_full_search_sadx3(MACROBLOCK *x, int_mv *ref_mv,
}
}
- this_mv.as_mv.row = best_mv->as_mv.row * 8;
- this_mv.as_mv.col = best_mv->as_mv.col * 8;
+ this_mv.row = best_mv->as_mv.row * 8;
+ this_mv.col = best_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
(unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mv_err_cost(&this_mv, center_mv,
mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
-int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
+int vp9_full_search_sadx8(MACROBLOCK *x, MV *ref_mv,
int sad_per_bit, int distance,
vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
- int_mv *center_mv, int n) {
+ const MV *center_mv, int n) {
const MACROBLOCKD* const xd = &x->e_mbd;
uint8_t *what = x->plane[0].src.buf;
int what_stride = x->plane[0].src.stride;
@@ -1682,15 +1797,15 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
int mv_stride = xd->plane[0].pre[0].stride;
uint8_t *bestaddress;
int_mv *best_mv = &x->e_mbd.mi_8x8[0]->bmi[n].as_mv[0];
- int_mv this_mv;
+ MV this_mv;
unsigned int bestsad = INT_MAX;
int r, c;
uint8_t *check_here;
unsigned int thissad;
- int ref_row = ref_mv->as_mv.row;
- int ref_col = ref_mv->as_mv.col;
+ int ref_row = ref_mv->row;
+ int ref_col = ref_mv->col;
int row_min = ref_row - distance;
int row_max = ref_row + distance;
@@ -1699,13 +1814,13 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);
unsigned int sad_array[3];
- int_mv fcenter_mv;
+ MV fcenter_mv;
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+ fcenter_mv.row = center_mv->row >> 3;
+ fcenter_mv.col = center_mv->col >> 3;
// Work out the mid point for the search
in_what = xd->plane[0].pre[0].buf;
@@ -1717,7 +1832,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
// Baseline value at the centre
bestsad = fn_ptr->sdf(what, what_stride,
bestaddress, in_what_stride, 0x7fffffff)
- + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv.as_mv,
+ + mvsad_err_cost(&best_mv->as_mv, &fcenter_mv,
mvjsadcost, mvsadcost, sad_per_bit);
// Apply further limits to prevent us looking using vectors that stretch
@@ -1728,7 +1843,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
row_max = MIN(row_max, x->mv_row_max);
for (r = row_min; r < row_max; r++) {
- this_mv.as_mv.row = r;
+ this_mv.row = r;
check_here = r * mv_stride + in_what + col_min;
c = col_min;
@@ -1741,8 +1856,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
thissad = (unsigned int)sad_array8[i];
if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1767,8 +1882,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
thissad = sad_array[i];
if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1789,8 +1904,8 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
bestsad);
if (thissad < bestsad) {
- this_mv.as_mv.col = c;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
mvjsadcost, mvsadcost, sad_per_bit);
if (thissad < bestsad) {
@@ -1806,21 +1921,22 @@ int vp9_full_search_sadx8(MACROBLOCK *x, int_mv *ref_mv,
}
}
- this_mv.as_mv.row = best_mv->as_mv.row * 8;
- this_mv.as_mv.col = best_mv->as_mv.col * 8;
+ this_mv.row = best_mv->as_mv.row * 8;
+ this_mv.col = best_mv->as_mv.col * 8;
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride,
(unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mv_err_cost(&this_mv, center_mv,
mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
int vp9_refining_search_sad_c(MACROBLOCK *x,
- int_mv *ref_mv, int error_per_bit,
+ MV *ref_mv, int error_per_bit,
int search_range, vp9_variance_fn_ptr_t *fn_ptr,
- int *mvjcost, int *mvcost[2], int_mv *center_mv) {
+ int *mvjcost, int *mvcost[2],
+ const MV *center_mv) {
const MACROBLOCKD* const xd = &x->e_mbd;
MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
int i, j;
@@ -1830,31 +1946,31 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
int in_what_stride = xd->plane[0].pre[0].stride;
uint8_t *what = x->plane[0].src.buf;
uint8_t *best_address = xd->plane[0].pre[0].buf +
- (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
- ref_mv->as_mv.col;
+ (ref_mv->row * xd->plane[0].pre[0].stride) +
+ ref_mv->col;
uint8_t *check_here;
unsigned int thissad;
- int_mv this_mv;
+ MV this_mv;
unsigned int bestsad = INT_MAX;
- int_mv fcenter_mv;
+ MV fcenter_mv;
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+ fcenter_mv.row = center_mv->row >> 3;
+ fcenter_mv.col = center_mv->col >> 3;
bestsad = fn_ptr->sdf(what, what_stride, best_address,
in_what_stride, 0x7fffffff) +
- mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv,
+ mvsad_err_cost(ref_mv, &fcenter_mv,
mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
int best_site = -1;
for (j = 0; j < 4; j++) {
- this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
- this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+ this_row_offset = ref_mv->row + neighbors[j].row;
+ this_col_offset = ref_mv->col + neighbors[j].col;
if ((this_col_offset > x->mv_col_min) &&
(this_col_offset < x->mv_col_max) &&
@@ -1866,9 +1982,9 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
bestsad);
if (thissad < bestsad) {
- this_mv.as_mv.row = this_row_offset;
- this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.row = this_row_offset;
+ this_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
mvjsadcost, mvsadcost, error_per_bit);
if (thissad < bestsad) {
@@ -1882,29 +1998,30 @@ int vp9_refining_search_sad_c(MACROBLOCK *x,
if (best_site == -1) {
break;
} else {
- ref_mv->as_mv.row += neighbors[best_site].row;
- ref_mv->as_mv.col += neighbors[best_site].col;
+ ref_mv->row += neighbors[best_site].row;
+ ref_mv->col += neighbors[best_site].col;
best_address += (neighbors[best_site].row) * in_what_stride +
neighbors[best_site].col;
}
}
- this_mv.as_mv.row = ref_mv->as_mv.row * 8;
- this_mv.as_mv.col = ref_mv->as_mv.col * 8;
+ this_mv.row = ref_mv->row * 8;
+ this_mv.col = ref_mv->col * 8;
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
(unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mv_err_cost(&this_mv, center_mv,
mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
}
int vp9_refining_search_sadx4(MACROBLOCK *x,
- int_mv *ref_mv, int error_per_bit,
+ MV *ref_mv, int error_per_bit,
int search_range, vp9_variance_fn_ptr_t *fn_ptr,
- int *mvjcost, int *mvcost[2], int_mv *center_mv) {
+ int *mvjcost, int *mvcost[2],
+ const MV *center_mv) {
const MACROBLOCKD* const xd = &x->e_mbd;
MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
int i, j;
@@ -1914,31 +2031,31 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
int in_what_stride = xd->plane[0].pre[0].stride;
uint8_t *what = x->plane[0].src.buf;
uint8_t *best_address = xd->plane[0].pre[0].buf +
- (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
- ref_mv->as_mv.col;
+ (ref_mv->row * xd->plane[0].pre[0].stride) +
+ ref_mv->col;
uint8_t *check_here;
unsigned int thissad;
- int_mv this_mv;
+ MV this_mv;
unsigned int bestsad = INT_MAX;
- int_mv fcenter_mv;
+ MV fcenter_mv;
int *mvjsadcost = x->nmvjointsadcost;
int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
- fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
- fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+ fcenter_mv.row = center_mv->row >> 3;
+ fcenter_mv.col = center_mv->col >> 3;
bestsad = fn_ptr->sdf(what, what_stride, best_address,
in_what_stride, 0x7fffffff) +
- mvsad_err_cost(&ref_mv->as_mv, &fcenter_mv.as_mv,
+ mvsad_err_cost(ref_mv, &fcenter_mv,
mvjsadcost, mvsadcost, error_per_bit);
for (i = 0; i < search_range; i++) {
int best_site = -1;
- int all_in = ((ref_mv->as_mv.row - 1) > x->mv_row_min) &
- ((ref_mv->as_mv.row + 1) < x->mv_row_max) &
- ((ref_mv->as_mv.col - 1) > x->mv_col_min) &
- ((ref_mv->as_mv.col + 1) < x->mv_col_max);
+ int all_in = ((ref_mv->row - 1) > x->mv_row_min) &
+ ((ref_mv->row + 1) < x->mv_row_max) &
+ ((ref_mv->col - 1) > x->mv_col_min) &
+ ((ref_mv->col + 1) < x->mv_col_max);
if (all_in) {
unsigned int sad_array[4];
@@ -1953,9 +2070,9 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
for (j = 0; j < 4; j++) {
if (sad_array[j] < bestsad) {
- this_mv.as_mv.row = ref_mv->as_mv.row + neighbors[j].row;
- this_mv.as_mv.col = ref_mv->as_mv.col + neighbors[j].col;
- sad_array[j] += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.row = ref_mv->row + neighbors[j].row;
+ this_mv.col = ref_mv->col + neighbors[j].col;
+ sad_array[j] += mvsad_err_cost(&this_mv, &fcenter_mv,
mvjsadcost, mvsadcost, error_per_bit);
if (sad_array[j] < bestsad) {
@@ -1966,8 +2083,8 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
}
} else {
for (j = 0; j < 4; j++) {
- this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
- this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+ this_row_offset = ref_mv->row + neighbors[j].row;
+ this_col_offset = ref_mv->col + neighbors[j].col;
if ((this_col_offset > x->mv_col_min) &&
(this_col_offset < x->mv_col_max) &&
@@ -1979,9 +2096,9 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
bestsad);
if (thissad < bestsad) {
- this_mv.as_mv.row = this_row_offset;
- this_mv.as_mv.col = this_col_offset;
- thissad += mvsad_err_cost(&this_mv.as_mv, &fcenter_mv.as_mv,
+ this_mv.row = this_row_offset;
+ this_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv,
mvjsadcost, mvsadcost, error_per_bit);
if (thissad < bestsad) {
@@ -1996,20 +2113,20 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
if (best_site == -1) {
break;
} else {
- ref_mv->as_mv.row += neighbors[best_site].row;
- ref_mv->as_mv.col += neighbors[best_site].col;
+ ref_mv->row += neighbors[best_site].row;
+ ref_mv->col += neighbors[best_site].col;
best_address += (neighbors[best_site].row) * in_what_stride +
neighbors[best_site].col;
}
}
- this_mv.as_mv.row = ref_mv->as_mv.row * 8;
- this_mv.as_mv.col = ref_mv->as_mv.col * 8;
+ this_mv.row = ref_mv->row * 8;
+ this_mv.col = ref_mv->col * 8;
if (bestsad < INT_MAX)
return fn_ptr->vf(what, what_stride, best_address, in_what_stride,
(unsigned int *)(&thissad)) +
- mv_err_cost(&this_mv.as_mv, &center_mv->as_mv,
+ mv_err_cost(&this_mv, center_mv,
mvjcost, mvcost, x->errorperbit);
else
return INT_MAX;
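
The hunks above repeatedly swap int_mv parameters for plain MV pointers, which is why call sites now pass the union's as_mv member (e.g. &ref_mv->as_mv). For orientation only, a minimal sketch of the two motion-vector types as they are defined in vp9/common/vp9_mv.h (not part of this patch):

#include <stdint.h>

/* Plain motion vector: one row/col pair, in 1/8-pel units in general and in
 * full-pel units inside the search loops above (hence the "* 8" at the end
 * of each search function). */
typedef struct mv {
  int16_t row;
  int16_t col;
} MV;

/* Union wrapper kept for fast whole-vector copies and compares via as_int;
 * the patch unwraps it at the search-function boundaries. */
typedef union int_mv {
  uint32_t as_int;
  MV as_mv;
} int_mv;
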
diff --git a/source/libvpx/vp9/encoder/vp9_mcomp.h b/source/libvpx/vp9/encoder/vp9_mcomp.h
index bcab679..c574e61 100644
--- a/source/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/source/libvpx/vp9/encoder/vp9_mcomp.h
@@ -18,8 +18,9 @@
// The maximum number of steps in a step search given the largest
// allowed initial step
#define MAX_MVSEARCH_STEPS 11
-// Max full pel mv specified in 1 pel units
-#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
+// Max full pel mv specified in the unit of full pixel
+// Enable the use of motion vector in range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
// Maximum size of the first step in full pel units
#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
// Allowed motion vector pixel distance outside image border
@@ -27,7 +28,7 @@
#define BORDER_MV_PIXELS_B16 (16 + VP9_INTERP_EXTEND)
-void vp9_clamp_mv_min_max(MACROBLOCK *x, MV *mv);
+void vp9_set_mv_search_range(MACROBLOCK *x, MV *mv);
int vp9_mv_bit_cost(const MV *mv, const MV *ref,
const int *mvjcost, int *mvcost[2], int weight);
void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride);
@@ -102,25 +103,25 @@ extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_iterative;
extern fractional_mv_step_comp_fp vp9_find_best_sub_pixel_comp_tree;
typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x,
- int_mv *ref_mv, int sad_per_bit,
+ MV *ref_mv, int sad_per_bit,
int distance, vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
- int_mv *center_mv, int n);
+ const MV *center_mv, int n);
typedef int (*vp9_refining_search_fn_t)(MACROBLOCK *x,
- int_mv *ref_mv, int sad_per_bit,
+ MV *ref_mv, int sad_per_bit,
int distance,
vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
- int_mv *center_mv);
+ const MV *center_mv);
typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
- int_mv *ref_mv, int_mv *best_mv,
+ MV *ref_mv, MV *best_mv,
int search_param, int sad_per_bit,
int *num00,
vp9_variance_fn_ptr_t *fn_ptr,
int *mvjcost, int *mvcost[2],
- int_mv *center_mv);
+ const MV *center_mv);
int vp9_refining_search_8p_c(MACROBLOCK *x,
int_mv *ref_mv, int error_per_bit,
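
As a quick sanity check on the redefined search-range constant above (assuming MAX_MVSEARCH_STEPS stays at 11, as shown in the hunk), the old and new limits work out as follows; this is just the macro arithmetic spelled out, not additional patch content:

/* Old definition: (1 << 11) - 1       == 2047 full pels
 * New definition: (1 << (11 - 1)) - 1 == 1023 full pels
 * which matches the "[-1023, 1023]" range stated in the new comment and
 * leaves MAX_FIRST_STEP == (1 << 10) == 1024 unchanged. */
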
diff --git a/source/libvpx/vp9/encoder/vp9_modecosts.c b/source/libvpx/vp9/encoder/vp9_modecosts.c
deleted file mode 100644
index 7eb6592..0000000
--- a/source/libvpx/vp9/encoder/vp9_modecosts.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp9/common/vp9_blockd.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_treewriter.h"
-#include "vp9/common/vp9_entropymode.h"
-
-
-void vp9_init_mode_costs(VP9_COMP *c) {
- VP9_COMMON *const cm = &c->common;
- const vp9_tree_index *KT = vp9_intra_mode_tree;
- int i, j;
-
- for (i = 0; i < INTRA_MODES; i++) {
- for (j = 0; j < INTRA_MODES; j++) {
- vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
- KT);
- }
- }
-
- // TODO(rbultje) separate tables for superblock costing?
- vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
- vp9_intra_mode_tree);
- vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
- cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
- vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
- vp9_kf_uv_mode_prob[INTRA_MODES - 1],
- vp9_intra_mode_tree);
-
- for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
- vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
- cm->fc.switchable_interp_prob[i],
- vp9_switchable_interp_tree);
-}
diff --git a/source/libvpx/vp9/encoder/vp9_modecosts.h b/source/libvpx/vp9/encoder/vp9_modecosts.h
deleted file mode 100644
index f43033e..0000000
--- a/source/libvpx/vp9/encoder/vp9_modecosts.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_ENCODER_VP9_MODECOSTS_H_
-#define VP9_ENCODER_VP9_MODECOSTS_H_
-
-void vp9_init_mode_costs(VP9_COMP *x);
-
-#endif // VP9_ENCODER_VP9_MODECOSTS_H_
diff --git a/source/libvpx/vp9/encoder/vp9_onyx_if.c b/source/libvpx/vp9/encoder/vp9_onyx_if.c
index b664f1e..3ca8af3 100644
--- a/source/libvpx/vp9/encoder/vp9_onyx_if.c
+++ b/source/libvpx/vp9/encoder/vp9_onyx_if.c
@@ -24,6 +24,8 @@
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/common/vp9_tile_common.h"
+
+#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/encoder/vp9_mbgraph.h"
#include "vp9/encoder/vp9_onyx_int.h"
@@ -37,8 +39,8 @@
#include "vpx_ports/vpx_timer.h"
-
-extern void print_tree_update_probs();
+void vp9_entropy_mode_init();
+void vp9_coef_tree_initialize();
static void set_default_lf_deltas(struct loopfilter *lf);
@@ -112,14 +114,8 @@ extern unsigned __int64 Sectionbits[500];
extern void vp9_init_quantizer(VP9_COMP *cpi);
-// Tables relating active max Q to active min Q
-static int kf_low_motion_minq[QINDEX_RANGE];
-static int kf_high_motion_minq[QINDEX_RANGE];
-static int gf_low_motion_minq[QINDEX_RANGE];
-static int gf_high_motion_minq[QINDEX_RANGE];
-static int inter_minq[QINDEX_RANGE];
-static int afq_low_motion_minq[QINDEX_RANGE];
-static int afq_high_motion_minq[QINDEX_RANGE];
+static const double in_frame_q_adj_ratio[MAX_SEGMENTS] =
+ {1.0, 1.5, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0};
static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
switch (mode) {
@@ -147,98 +143,9 @@ static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
}
}
-// Functions to compute the active minq lookup table entries based on a
-// formulaic approach to facilitate easier adjustment of the Q tables.
-// The formulae were derived from computing a 3rd order polynomial best
-// fit to the original data (after plotting real maxq vs minq (not q index))
-static int calculate_minq_index(double maxq,
- double x3, double x2, double x1, double c) {
- int i;
- const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq + c,
- maxq);
-
- // Special case handling to deal with the step from q2.0
- // down to lossless mode represented by q 1.0.
- if (minqtarget <= 2.0)
- return 0;
-
- for (i = 0; i < QINDEX_RANGE; i++) {
- if (minqtarget <= vp9_convert_qindex_to_q(i))
- return i;
- }
-
- return QINDEX_RANGE - 1;
-}
-
-static void init_minq_luts(void) {
- int i;
-
- for (i = 0; i < QINDEX_RANGE; i++) {
- const double maxq = vp9_convert_qindex_to_q(i);
-
-
- kf_low_motion_minq[i] = calculate_minq_index(maxq,
- 0.000001,
- -0.0004,
- 0.15,
- 0.0);
- kf_high_motion_minq[i] = calculate_minq_index(maxq,
- 0.000002,
- -0.0012,
- 0.5,
- 0.0);
-
- gf_low_motion_minq[i] = calculate_minq_index(maxq,
- 0.0000015,
- -0.0009,
- 0.32,
- 0.0);
- gf_high_motion_minq[i] = calculate_minq_index(maxq,
- 0.0000021,
- -0.00125,
- 0.50,
- 0.0);
- inter_minq[i] = calculate_minq_index(maxq,
- 0.00000271,
- -0.00113,
- 0.75,
- 0.0);
- afq_low_motion_minq[i] = calculate_minq_index(maxq,
- 0.0000015,
- -0.0009,
- 0.33,
- 0.0);
- afq_high_motion_minq[i] = calculate_minq_index(maxq,
- 0.0000021,
- -0.00125,
- 0.55,
- 0.0);
- }
-}
-
-static int get_active_quality(int q,
- int gfu_boost,
- int low,
- int high,
- int *low_motion_minq,
- int *high_motion_minq) {
- int active_best_quality;
- if (gfu_boost > high) {
- active_best_quality = low_motion_minq[q];
- } else if (gfu_boost < low) {
- active_best_quality = high_motion_minq[q];
- } else {
- const int gap = high - low;
- const int offset = high - gfu_boost;
- const int qdiff = high_motion_minq[q] - low_motion_minq[q];
- const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
- active_best_quality = low_motion_minq[q] + adjustment;
- }
- return active_best_quality;
-}
-
-static void set_mvcost(VP9_COMP *cpi) {
+static void set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
MACROBLOCK *const mb = &cpi->mb;
+ cpi->common.allow_high_precision_mv = allow_high_precision_mv;
if (cpi->common.allow_high_precision_mv) {
mb->mvcost = mb->nmvcost_hp;
mb->mvsadcost = mb->nmvsadcost_hp;
@@ -253,11 +160,14 @@ void vp9_initialize_enc() {
if (!init_done) {
vp9_initialize_common();
+ vp9_coef_tree_initialize();
vp9_tokenize_initialize();
vp9_init_quant_tables();
vp9_init_me_luts();
- init_minq_luts();
+ vp9_rc_init_minq_luts();
// init_base_skip_probs();
+ vp9_entropy_mv_init();
+ vp9_entropy_mode_init();
init_done = 1;
}
}
@@ -294,6 +204,8 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vpx_free(cpi->coding_context.last_frame_seg_map_copy);
cpi->coding_context.last_frame_seg_map_copy = 0;
+ vpx_free(cpi->complexity_map);
+ cpi->complexity_map = 0;
vpx_free(cpi->active_map);
cpi->active_map = 0;
@@ -323,20 +235,20 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
// Computes a q delta (in "q index" terms) to get from a starting q value
// to a target value
// target q value
-int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
+int vp9_compute_qdelta(const VP9_COMP *cpi, double qstart, double qtarget) {
int i;
- int start_index = cpi->worst_quality;
- int target_index = cpi->worst_quality;
+ int start_index = cpi->rc.worst_quality;
+ int target_index = cpi->rc.worst_quality;
// Convert the average q value to an index.
- for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
+ for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
start_index = i;
if (vp9_convert_qindex_to_q(i) >= qstart)
break;
}
// Convert the q target to an index
- for (i = cpi->best_quality; i < cpi->worst_quality; i++) {
+ for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
target_index = i;
if (vp9_convert_qindex_to_q(i) >= qtarget)
break;
@@ -345,11 +257,84 @@ int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget) {
return target_index - start_index;
}
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+
+int vp9_compute_qdelta_by_rate(VP9_COMP *cpi,
+ double base_q_index, double rate_target_ratio) {
+ int i;
+ int base_bits_per_mb;
+ int target_bits_per_mb;
+ int target_index = cpi->rc.worst_quality;
+
+ // Make SURE use of floating point in this function is safe.
+ vp9_clear_system_state();
+
+ // Look up the current projected bits per block for the base index
+ base_bits_per_mb = vp9_rc_bits_per_mb(cpi->common.frame_type,
+ base_q_index, 1.0);
+
+ // Find the target bits per mb based on the base value and given ratio.
+ target_bits_per_mb = rate_target_ratio * base_bits_per_mb;
+
+ // Convert the q target to an index
+ for (i = cpi->rc.best_quality; i < cpi->rc.worst_quality; i++) {
+ target_index = i;
+ if (vp9_rc_bits_per_mb(cpi->common.frame_type,
+ i, 1.0) <= target_bits_per_mb )
+ break;
+ }
+
+ return target_index - base_q_index;
+}
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+static void setup_in_frame_q_adj(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ struct segmentation *seg = &cm->seg;
+ // double q_ratio;
+ int segment;
+ int qindex_delta;
+
+ // Make SURE use of floating point in this function is safe.
+ vp9_clear_system_state();
+
+ if (cm->frame_type == KEY_FRAME ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)) {
+ // Clear down the segment map
+ vpx_memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+ // Clear down the complexity map used for rd
+ vpx_memset(cpi->complexity_map, 0, cm->mi_rows * cm->mi_cols);
+
+ // Enable segmentation
+ vp9_enable_segmentation((VP9_PTR)cpi);
+ vp9_clearall_segfeatures(seg);
+
+ // Select delta coding method
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ // Segment 0 "Q" feature is disabled so it defaults to the baseline Q
+ vp9_disable_segfeature(seg, 0, SEG_LVL_ALT_Q);
+
+ // Use some of the segments for in frame Q adjustment
+ for (segment = 1; segment < 3; segment++) {
+ qindex_delta =
+ vp9_compute_qdelta_by_rate(cpi, cm->base_qindex,
+ in_frame_q_adj_ratio[segment]);
+ vp9_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+ vp9_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+ }
+ }
+}
+
static void configure_static_seg_features(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
struct segmentation *seg = &cm->seg;
- int high_q = (int)(cpi->avg_q > 48.0);
+ int high_q = (int)(cpi->rc.avg_q > 48.0);
int qi_delta;
// Disable and clear down for KF
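
For orientation (not part of the patch): the in-frame Q adjustment added above maps auxiliary segments to rate ratios and turns each ratio into a q-index delta via vp9_compute_qdelta_by_rate(). A hedged sketch of that relation, with a hypothetical bits-per-MB callback standing in for vp9_rc_bits_per_mb() and made-up q bounds:

/* Illustrative only: mirrors the scan in vp9_compute_qdelta_by_rate(). */
static int sketch_qdelta_by_rate(int base_q_index, double rate_target_ratio,
                                 int (*bits_per_mb)(int q_index),
                                 int best_q, int worst_q) {
  const int target_bits = (int)(rate_target_ratio * bits_per_mb(base_q_index));
  int i, target_index = worst_q;
  /* Bits per MB fall as the q index rises, so the first index at or below
   * the target approximates the requested rate ratio. */
  for (i = best_q; i < worst_q; i++) {
    target_index = i;
    if (bits_per_mb(i) <= target_bits)
      break;
  }
  return target_index - base_q_index;  /* ratio > 1.0 => delta <= 0 */
}

With the in_frame_q_adj_ratio table added above, segments 1 and 2 therefore aim for roughly 1.5x and 2.0x the bits per MB of the base quantizer, i.e. they are assigned lower (higher-quality) q indices.
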
@@ -387,7 +372,8 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
seg->update_map = 1;
seg->update_data = 1;
- qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q, (cpi->avg_q * 0.875));
+ qi_delta = vp9_compute_qdelta(
+ cpi, cpi->rc.avg_q, (cpi->rc.avg_q * 0.875));
vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta - 2));
vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2);
@@ -401,15 +387,15 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
// All other frames if segmentation has been enabled
// First normal frame in a valid gf or alt ref group
- if (cpi->frames_since_golden == 0) {
+ if (cpi->rc.frames_since_golden == 0) {
// Set up segment features for normal frames in an arf group
if (cpi->source_alt_ref_active) {
seg->update_map = 0;
seg->update_data = 1;
seg->abs_delta = SEGMENT_DELTADATA;
- qi_delta = vp9_compute_qdelta(cpi, cpi->avg_q,
- (cpi->avg_q * 1.125));
+ qi_delta = vp9_compute_qdelta(cpi, cpi->rc.avg_q,
+ (cpi->rc.avg_q * 1.125));
vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, (qi_delta + 2));
vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
@@ -467,69 +453,6 @@ static void configure_static_seg_features(VP9_COMP *cpi) {
}
}
-#ifdef ENTROPY_STATS
-void vp9_update_mode_context_stats(VP9_COMP *cpi) {
- VP9_COMMON *cm = &cpi->common;
- int i, j;
- unsigned int (*inter_mode_counts)[INTER_MODES - 1][2] =
- cm->fc.inter_mode_counts;
- int64_t (*mv_ref_stats)[INTER_MODES - 1][2] = cpi->mv_ref_stats;
- FILE *f;
-
- // Read the past stats counters
- f = fopen("mode_context.bin", "rb");
- if (!f) {
- vpx_memset(cpi->mv_ref_stats, 0, sizeof(cpi->mv_ref_stats));
- } else {
- fread(cpi->mv_ref_stats, sizeof(cpi->mv_ref_stats), 1, f);
- fclose(f);
- }
-
- // Add in the values for this frame
- for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
- for (j = 0; j < INTER_MODES - 1; j++) {
- mv_ref_stats[i][j][0] += (int64_t)inter_mode_counts[i][j][0];
- mv_ref_stats[i][j][1] += (int64_t)inter_mode_counts[i][j][1];
- }
- }
-
- // Write back the accumulated stats
- f = fopen("mode_context.bin", "wb");
- fwrite(cpi->mv_ref_stats, sizeof(cpi->mv_ref_stats), 1, f);
- fclose(f);
-}
-
-void print_mode_context(VP9_COMP *cpi) {
- FILE *f = fopen("vp9_modecont.c", "a");
- int i, j;
-
- fprintf(f, "#include \"vp9_entropy.h\"\n");
- fprintf(
- f,
- "const int inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1] =");
- fprintf(f, "{\n");
- for (j = 0; j < INTER_MODE_CONTEXTS; j++) {
- fprintf(f, " {/* %d */ ", j);
- fprintf(f, " ");
- for (i = 0; i < INTER_MODES - 1; i++) {
- int this_prob;
- int64_t count = cpi->mv_ref_stats[j][i][0] + cpi->mv_ref_stats[j][i][1];
- if (count)
- this_prob = ((cpi->mv_ref_stats[j][i][0] * 256) + (count >> 1)) / count;
- else
- this_prob = 128;
-
- // context probs
- fprintf(f, "%5d, ", this_prob);
- }
- fprintf(f, " },\n");
- }
-
- fprintf(f, "};\n");
- fclose(f);
-}
-#endif // ENTROPY_STATS
-
// DEBUG: Print out the segment id of each MB in the current frame.
static void print_seg_map(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
@@ -731,6 +654,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->tx_size_search_method = USE_FULL_RD;
sf->use_lp32x32fdct = 0;
sf->adaptive_motion_search = 0;
+ sf->adaptive_pred_filter_type = 0;
sf->use_avoid_tested_higherror = 0;
sf->reference_masking = 0;
sf->use_one_partition_size_always = 0;
@@ -764,10 +688,9 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->static_segmentation = 0;
#endif
- sf->variance_adaptive_quantization = 0;
-
switch (mode) {
case 0: // This is the best quality mode.
+ cpi->diamond_search_sad = vp9_full_range_search;
break;
case 1:
@@ -795,6 +718,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
+ sf->adaptive_pred_filter_type = 1;
sf->auto_mv_step_size = 1;
sf->adaptive_rd_thresh = 2;
sf->recode_loop = 2;
@@ -822,9 +746,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
+ sf->adaptive_pred_filter_type = 2;
sf->auto_mv_step_size = 1;
- sf->disable_filter_search_var_thresh = 16;
+ sf->disable_filter_search_var_thresh = 50;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->auto_min_max_partition_size = 1;
@@ -834,6 +759,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->adaptive_rd_thresh = 2;
sf->recode_loop = 2;
+ sf->use_lp32x32fdct = 1;
sf->mode_skip_start = 11;
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
@@ -856,9 +782,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
+ sf->adaptive_pred_filter_type = 2;
sf->auto_mv_step_size = 1;
- sf->disable_filter_search_var_thresh = 16;
+ sf->disable_filter_search_var_thresh = 100;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->auto_min_max_partition_size = 1;
@@ -889,9 +816,10 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
+ sf->adaptive_pred_filter_type = 2;
sf->auto_mv_step_size = 1;
- sf->disable_filter_search_var_thresh = 16;
+ sf->disable_filter_search_var_thresh = 200;
sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
sf->auto_min_max_partition_size = 1;
@@ -937,7 +865,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->search_method = HEX;
sf->subpel_iters_per_step = 1;
sf->disable_split_var_thresh = 64;
- sf->disable_filter_search_var_thresh = 96;
+ sf->disable_filter_search_var_thresh = 500;
for (i = 0; i < TX_SIZES; i++) {
sf->intra_y_mode_mask[i] = INTRA_DC_ONLY;
sf->intra_uv_mode_mask[i] = INTRA_DC_ONLY;
@@ -998,7 +926,7 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) {
if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer,
cpi->oxcf.width, cpi->oxcf.height,
cm->subsampling_x, cm->subsampling_y,
- VP9BORDERINPIXELS))
+ VP9BORDERINPIXELS, NULL, NULL, NULL))
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to allocate altref buffer");
}
@@ -1032,11 +960,6 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) {
CHECK_MEM_ERROR(cm, cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
}
- // Data used for real time vc mode to see if gf needs refreshing
- cpi->inter_zz_count = 0;
- cpi->gf_bad_count = 0;
- cpi->gf_update_recommended = 0;
-
vpx_free(cpi->mb_activity_map);
CHECK_MEM_ERROR(cm, cpi->mb_activity_map,
vpx_calloc(sizeof(unsigned int),
@@ -1071,14 +994,14 @@ static void update_frame_size(VP9_COMP *cpi) {
if (vp9_realloc_frame_buffer(&cpi->last_frame_uf,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
- VP9BORDERINPIXELS))
+ VP9BORDERINPIXELS, NULL, NULL, NULL))
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate last frame buffer");
if (vp9_realloc_frame_buffer(&cpi->scaled_source,
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
- VP9BORDERINPIXELS))
+ VP9BORDERINPIXELS, NULL, NULL, NULL))
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
"Failed to reallocate scaled source buffer");
@@ -1131,33 +1054,34 @@ void vp9_new_framerate(VP9_COMP *cpi, double framerate) {
cpi->oxcf.framerate = framerate;
cpi->output_framerate = cpi->oxcf.framerate;
- cpi->per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
- / cpi->output_framerate);
- cpi->av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
- / cpi->output_framerate);
- cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
- cpi->oxcf.two_pass_vbrmin_section / 100);
+ cpi->rc.per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
+ / cpi->output_framerate);
+ cpi->rc.av_per_frame_bandwidth = (int)(cpi->oxcf.target_bandwidth
+ / cpi->output_framerate);
+ cpi->rc.min_frame_bandwidth = (int)(cpi->rc.av_per_frame_bandwidth *
+ cpi->oxcf.two_pass_vbrmin_section / 100);
- cpi->min_frame_bandwidth = MAX(cpi->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
+ cpi->rc.min_frame_bandwidth = MAX(cpi->rc.min_frame_bandwidth,
+ FRAME_OVERHEAD_BITS);
// Set Maximum gf/arf interval
- cpi->max_gf_interval = 16;
+ cpi->rc.max_gf_interval = 16;
// Extended interval for genuinely static scenes
cpi->twopass.static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
// Special conditions when alt ref frame enabled in lagged compress mode
if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames) {
- if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
- cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+ if (cpi->rc.max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+ cpi->rc.max_gf_interval = cpi->oxcf.lag_in_frames - 1;
if (cpi->twopass.static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1)
cpi->twopass.static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1;
}
- if (cpi->max_gf_interval > cpi->twopass.static_scene_max_gf_interval)
- cpi->max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
+ if (cpi->rc.max_gf_interval > cpi->twopass.static_scene_max_gf_interval)
+ cpi->rc.max_gf_interval = cpi->twopass.static_scene_max_gf_interval;
}
static int64_t rescale(int val, int64_t num, int denom) {
@@ -1185,7 +1109,6 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
int i;
cpi->oxcf = *oxcf;
- cpi->goldfreq = 7;
cm->version = oxcf->version;
@@ -1199,21 +1122,21 @@ static void init_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
vp9_change_config(ptr, oxcf);
// Initialize active best and worst q and average q values.
- cpi->active_worst_quality = cpi->oxcf.worst_allowed_q;
- cpi->active_best_quality = cpi->oxcf.best_allowed_q;
- cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q;
+ cpi->rc.active_worst_quality = cpi->oxcf.worst_allowed_q;
+
+ cpi->rc.avg_frame_qindex = cpi->oxcf.worst_allowed_q;
// Initialise the starting buffer levels
- cpi->buffer_level = cpi->oxcf.starting_buffer_level;
- cpi->bits_off_target = cpi->oxcf.starting_buffer_level;
+ cpi->rc.buffer_level = cpi->oxcf.starting_buffer_level;
+ cpi->rc.bits_off_target = cpi->oxcf.starting_buffer_level;
- cpi->rolling_target_bits = cpi->av_per_frame_bandwidth;
- cpi->rolling_actual_bits = cpi->av_per_frame_bandwidth;
- cpi->long_rolling_target_bits = cpi->av_per_frame_bandwidth;
- cpi->long_rolling_actual_bits = cpi->av_per_frame_bandwidth;
+ cpi->rc.rolling_target_bits = cpi->rc.av_per_frame_bandwidth;
+ cpi->rc.rolling_actual_bits = cpi->rc.av_per_frame_bandwidth;
+ cpi->rc.long_rolling_target_bits = cpi->rc.av_per_frame_bandwidth;
+ cpi->rc.long_rolling_actual_bits = cpi->rc.av_per_frame_bandwidth;
- cpi->total_actual_bits = 0;
- cpi->total_target_vs_actual = 0;
+ cpi->rc.total_actual_bits = 0;
+ cpi->rc.total_target_vs_actual = 0;
cpi->static_mb_pct = 0;
@@ -1277,7 +1200,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
cpi->oxcf.lossless = oxcf->lossless;
cpi->mb.e_mbd.itxm_add = cpi->oxcf.lossless ? vp9_iwht4x4_add
: vp9_idct4x4_add;
- cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+ cpi->rc.baseline_gf_interval = DEFAULT_GF_INTERVAL;
cpi->ref_frame_flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
@@ -1289,8 +1212,7 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
cm->reset_frame_context = 0;
setup_features(cm);
- cpi->common.allow_high_precision_mv = 0; // Default mv precision
- set_mvcost(cpi);
+ set_high_precision_mv(cpi, 0);
{
int i;
@@ -1332,19 +1254,13 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
vp9_new_framerate(cpi, cpi->oxcf.framerate);
// Set absolute upper and lower quality limits
- cpi->worst_quality = cpi->oxcf.worst_allowed_q;
- cpi->best_quality = cpi->oxcf.best_allowed_q;
+ cpi->rc.worst_quality = cpi->oxcf.worst_allowed_q;
+ cpi->rc.best_quality = cpi->oxcf.best_allowed_q;
// active values should only be modified if out of new range
- cpi->active_worst_quality = clamp(cpi->active_worst_quality,
- cpi->oxcf.best_allowed_q,
- cpi->oxcf.worst_allowed_q);
-
- cpi->active_best_quality = clamp(cpi->active_best_quality,
- cpi->oxcf.best_allowed_q,
- cpi->oxcf.worst_allowed_q);
-
- cpi->buffered_mode = cpi->oxcf.optimal_buffer_level > 0;
+ cpi->rc.active_worst_quality = clamp(cpi->rc.active_worst_quality,
+ cpi->rc.best_quality,
+ cpi->rc.worst_quality);
cpi->cq_target_quality = cpi->oxcf.cq_level;
@@ -1370,9 +1286,9 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
update_frame_size(cpi);
if (cpi->oxcf.fixed_q >= 0) {
- cpi->last_q[0] = cpi->oxcf.fixed_q;
- cpi->last_q[1] = cpi->oxcf.fixed_q;
- cpi->last_boosted_qindex = cpi->oxcf.fixed_q;
+ cpi->rc.last_q[0] = cpi->oxcf.fixed_q;
+ cpi->rc.last_q[1] = cpi->oxcf.fixed_q;
+ cpi->rc.last_boosted_qindex = cpi->oxcf.fixed_q;
}
cpi->speed = cpi->oxcf.cpu_used;
@@ -1442,90 +1358,121 @@ static void cal_nmvsadcosts_hp(int *mvsadcost[2]) {
} while (++i <= MV_MAX);
}
+static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
+ PICK_MODE_CONTEXT *ctx) {
+ int num_pix = num_4x4_blk << 4;
+ int i, k;
+ ctx->num_4x4_blk = num_4x4_blk;
+ CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
+ vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ for (k = 0; k < 3; ++k) {
+ CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
+ vpx_memalign(16, num_pix * sizeof(int16_t)));
+ CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
+ vpx_memalign(16, num_pix * sizeof(int16_t)));
+ CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
+ vpx_memalign(16, num_pix * sizeof(int16_t)));
+ CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
+ vpx_memalign(16, num_pix * sizeof(uint16_t)));
+ ctx->coeff_pbuf[i][k] = ctx->coeff[i][k];
+ ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k];
+ ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
+ ctx->eobs_pbuf[i][k] = ctx->eobs[i][k];
+ }
+ }
+}
+
+static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
+ int i, k;
+ vpx_free(ctx->zcoeff_blk);
+ ctx->zcoeff_blk = 0;
+ for (i = 0; i < MAX_MB_PLANE; ++i) {
+ for (k = 0; k < 3; ++k) {
+ vpx_free(ctx->coeff[i][k]);
+ ctx->coeff[i][k] = 0;
+ vpx_free(ctx->qcoeff[i][k]);
+ ctx->qcoeff[i][k] = 0;
+ vpx_free(ctx->dqcoeff[i][k]);
+ ctx->dqcoeff[i][k] = 0;
+ vpx_free(ctx->eobs[i][k]);
+ ctx->eobs[i][k] = 0;
+ }
+ }
+}
+
static void init_pick_mode_context(VP9_COMP *cpi) {
int i;
- MACROBLOCK *x = &cpi->mb;
- MACROBLOCKD *xd = &x->e_mbd;
- VP9_COMMON *cm = &cpi->common;
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
+
for (i = 0; i < BLOCK_SIZES; ++i) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
const int num_4x4_h = num_4x4_blocks_high_lookup[i];
const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
if (i < BLOCK_16X16) {
- for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
- for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
- for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+ for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+ for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) {
+ for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
- ctx->num_4x4_blk = num_4x4_blk;
- CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
- vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ alloc_mode_context(cm, num_4x4_blk, ctx);
}
}
}
} else if (i < BLOCK_32X32) {
- for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
- for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
- ++xd->mb_index) {
+ for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+ for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
ctx->num_4x4_blk = num_4x4_blk;
- CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
- vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ alloc_mode_context(cm, num_4x4_blk, ctx);
}
}
} else if (i < BLOCK_64X64) {
- for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+ for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
ctx->num_4x4_blk = num_4x4_blk;
- CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
- vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ alloc_mode_context(cm, num_4x4_blk, ctx);
}
} else {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
ctx->num_4x4_blk = num_4x4_blk;
- CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
- vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
+ alloc_mode_context(cm, num_4x4_blk, ctx);
}
}
}
static void free_pick_mode_context(MACROBLOCK *x) {
int i;
- MACROBLOCKD *xd = &x->e_mbd;
for (i = 0; i < BLOCK_SIZES; ++i) {
const int num_4x4_w = num_4x4_blocks_wide_lookup[i];
const int num_4x4_h = num_4x4_blocks_high_lookup[i];
const int num_4x4_blk = MAX(4, num_4x4_w * num_4x4_h);
if (i < BLOCK_16X16) {
- for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
- for (xd->mb_index = 0; xd->mb_index < 4; ++xd->mb_index) {
- for (xd->b_index = 0; xd->b_index < 16 / num_4x4_blk; ++xd->b_index) {
+ for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+ for (x->mb_index = 0; x->mb_index < 4; ++x->mb_index) {
+ for (x->b_index = 0; x->b_index < 16 / num_4x4_blk; ++x->b_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
- vpx_free(ctx->zcoeff_blk);
- ctx->zcoeff_blk = 0;
+ free_mode_context(ctx);
}
}
}
} else if (i < BLOCK_32X32) {
- for (xd->sb_index = 0; xd->sb_index < 4; ++xd->sb_index) {
- for (xd->mb_index = 0; xd->mb_index < 64 / num_4x4_blk;
- ++xd->mb_index) {
+ for (x->sb_index = 0; x->sb_index < 4; ++x->sb_index) {
+ for (x->mb_index = 0; x->mb_index < 64 / num_4x4_blk; ++x->mb_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
- vpx_free(ctx->zcoeff_blk);
- ctx->zcoeff_blk = 0;
+ free_mode_context(ctx);
}
}
} else if (i < BLOCK_64X64) {
- for (xd->sb_index = 0; xd->sb_index < 256 / num_4x4_blk; ++xd->sb_index) {
+ for (x->sb_index = 0; x->sb_index < 256 / num_4x4_blk; ++x->sb_index) {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
- vpx_free(ctx->zcoeff_blk);
- ctx->zcoeff_blk = 0;
+ free_mode_context(ctx);
}
} else {
PICK_MODE_CONTEXT *ctx = get_block_context(x, i);
- vpx_free(ctx->zcoeff_blk);
- ctx->zcoeff_blk = 0;
+ free_mode_context(ctx);
}
}
}
@@ -1569,16 +1516,12 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
init_pick_mode_context(cpi);
cm->current_video_frame = 0;
- cpi->kf_overspend_bits = 0;
- cpi->kf_bitrate_adjustment = 0;
- cpi->frames_till_gf_update_due = 0;
- cpi->gf_overspend_bits = 0;
- cpi->non_gf_bitrate_adjustment = 0;
+ cpi->rc.frames_till_gf_update_due = 0;
// Set reference frame sign bias for ALTREF frame to 1 (for now)
cm->ref_frame_sign_bias[ALTREF_FRAME] = 1;
- cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+ cpi->rc.baseline_gf_interval = DEFAULT_GF_INTERVAL;
cpi->gold_is_last = 0;
cpi->alt_is_last = 0;
@@ -1591,6 +1534,11 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
CHECK_MEM_ERROR(cm, cpi->segmentation_map,
vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+ // Create a complexity map used for rd adjustment
+ CHECK_MEM_ERROR(cm, cpi->complexity_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, 1));
+
+
// And a place holder structure is the coding context
// for use if we want to save and restore it
CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy,
@@ -1678,20 +1626,18 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->first_time_stamp_ever = INT64_MAX;
- cpi->frames_till_gf_update_due = 0;
- cpi->key_frame_count = 1;
+ cpi->rc.frames_till_gf_update_due = 0;
+ cpi->rc.key_frame_count = 1;
- cpi->ni_av_qi = cpi->oxcf.worst_allowed_q;
- cpi->ni_tot_qi = 0;
- cpi->ni_frames = 0;
- cpi->tot_q = 0.0;
- cpi->avg_q = vp9_convert_qindex_to_q(cpi->oxcf.worst_allowed_q);
- cpi->total_byte_count = 0;
+ cpi->rc.ni_av_qi = cpi->oxcf.worst_allowed_q;
+ cpi->rc.ni_tot_qi = 0;
+ cpi->rc.ni_frames = 0;
+ cpi->rc.tot_q = 0.0;
+ cpi->rc.avg_q = vp9_convert_qindex_to_q(cpi->oxcf.worst_allowed_q);
- cpi->rate_correction_factor = 1.0;
- cpi->key_frame_rate_correction_factor = 1.0;
- cpi->gf_rate_correction_factor = 1.0;
- cpi->twopass.est_max_qcorrection_factor = 1.0;
+ cpi->rc.rate_correction_factor = 1.0;
+ cpi->rc.key_frame_rate_correction_factor = 1.0;
+ cpi->rc.gf_rate_correction_factor = 1.0;
cal_nmvjointsadcost(cpi->mb.nmvjointsadcost);
cpi->mb.nmvcost[0] = &cpi->mb.nmvcosts[0][MV_MAX];
@@ -1707,7 +1653,7 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cal_nmvsadcosts_hp(cpi->mb.nmvsadcost_hp);
for (i = 0; i < KEY_FRAME_CONTEXT; i++)
- cpi->prior_key_frame_distance[i] = (int)cpi->output_framerate;
+ cpi->rc.prior_key_frame_distance[i] = (int)cpi->output_framerate;
#ifdef OUTPUT_YUV_SRC
yuv_file = fopen("bd.yuv", "ab");
@@ -1878,14 +1824,6 @@ void vp9_remove_compressor(VP9_PTR *ptr) {
vp9_end_second_pass(cpi);
}
-#ifdef ENTROPY_STATS
- if (cpi->pass != 1) {
- print_context_counters();
- print_tree_update_probs();
- print_mode_context(cpi);
- }
-#endif
-
#ifdef MODE_STATS
if (cpi->pass != 1) {
write_tx_count_stats();
@@ -2218,7 +2156,7 @@ int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb) {
VP9_COMP *cpi = (VP9_COMP *)(ptr);
VP9_COMMON *cm = &cpi->common;
- if (index < 0 || index >= NUM_REF_FRAMES)
+ if (index < 0 || index >= REF_FRAMES)
return -1;
*fb = &cm->yv12_fb[cm->ref_frame_map[index]];
@@ -2365,7 +2303,7 @@ static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,
static void update_alt_ref_frame_stats(VP9_COMP *cpi) {
// this frame refreshes means next frames don't unless specified by user
- cpi->frames_since_golden = 0;
+ cpi->rc.frames_since_golden = 0;
#if CONFIG_MULTIPLE_ARF
if (!cpi->multi_arf_enabled)
@@ -2381,7 +2319,7 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
if (cpi->refresh_golden_frame) {
// this frame refreshes means next frames don't unless specified by user
cpi->refresh_golden_frame = 0;
- cpi->frames_since_golden = 0;
+ cpi->rc.frames_since_golden = 0;
// ******** Fixed Q test code only ************
// If we are going to use the ALT reference for the next group of frames
@@ -2389,12 +2327,12 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
if (cpi->oxcf.fixed_q >= 0 &&
cpi->oxcf.play_alternate && !cpi->refresh_alt_ref_frame) {
cpi->source_alt_ref_pending = 1;
- cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval;
// TODO(ivan): For SVC encoder, GF automatic update is disabled by using
// a large GF_interval.
if (cpi->use_svc) {
- cpi->frames_till_gf_update_due = INT_MAX;
+ cpi->rc.frames_till_gf_update_due = INT_MAX;
}
}
@@ -2402,18 +2340,18 @@ static void update_golden_frame_stats(VP9_COMP *cpi) {
cpi->source_alt_ref_active = 0;
// Decrement count down till next gf
- if (cpi->frames_till_gf_update_due > 0)
- cpi->frames_till_gf_update_due--;
+ if (cpi->rc.frames_till_gf_update_due > 0)
+ cpi->rc.frames_till_gf_update_due--;
} else if (!cpi->refresh_alt_ref_frame) {
// Decrement count down till next gf
- if (cpi->frames_till_gf_update_due > 0)
- cpi->frames_till_gf_update_due--;
+ if (cpi->rc.frames_till_gf_update_due > 0)
+ cpi->rc.frames_till_gf_update_due--;
if (cpi->frames_till_alt_ref_frame)
cpi->frames_till_alt_ref_frame--;
- cpi->frames_since_golden++;
+ cpi->rc.frames_since_golden++;
}
}
@@ -2432,16 +2370,6 @@ static int find_fp_qindex() {
return i;
}
-static void Pass1Encode(VP9_COMP *cpi, unsigned long *size, unsigned char *dest,
- unsigned int *frame_flags) {
- (void) size;
- (void) dest;
- (void) frame_flags;
-
- vp9_set_quantizer(cpi, find_fp_qindex());
- vp9_first_pass(cpi);
-}
-
#define WRITE_RECON_BUFFER 0
#if WRITE_RECON_BUFFER
void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
@@ -2523,25 +2451,19 @@ static int recode_loop_test(VP9_COMP *cpi,
cpi->refresh_golden_frame ||
cpi->refresh_alt_ref_frame))) {
// General over and under shoot tests
- if (((cpi->projected_frame_size > high_limit) && (q < maxq)) ||
- ((cpi->projected_frame_size < low_limit) && (q > minq))) {
+ if (((cpi->rc.projected_frame_size > high_limit) && (q < maxq)) ||
+ ((cpi->rc.projected_frame_size < low_limit) && (q > minq))) {
force_recode = 1;
} else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
// Deal with frame undershoot and whether or not we are
// below the automatically set cq level.
if (q > cpi->cq_target_quality &&
- cpi->projected_frame_size < ((cpi->this_frame_target * 7) >> 3)) {
- force_recode = 1;
- } else if (q > cpi->oxcf.cq_level &&
- cpi->projected_frame_size < cpi->min_frame_bandwidth &&
- cpi->active_best_quality > cpi->oxcf.cq_level) {
- // Severe undershoot and between auto and user cq level
+ cpi->rc.projected_frame_size <
+ ((cpi->rc.this_frame_target * 7) >> 3)) {
force_recode = 1;
- cpi->active_best_quality = cpi->oxcf.cq_level;
}
}
}
-
return force_recode;
}
@@ -2634,8 +2556,8 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
static void scale_references(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
int i;
- int refs[ALLOWED_REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
- cpi->alt_fb_idx};
+ int refs[REFS_PER_FRAME] = {cpi->lst_fb_idx, cpi->gld_fb_idx,
+ cpi->alt_fb_idx};
for (i = 0; i < 3; i++) {
YV12_BUFFER_CONFIG *ref = &cm->yv12_fb[cm->ref_frame_map[refs[i]]];
@@ -2647,7 +2569,7 @@ static void scale_references(VP9_COMP *cpi) {
vp9_realloc_frame_buffer(&cm->yv12_fb[new_fb],
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
- VP9BORDERINPIXELS);
+ VP9BORDERINPIXELS, NULL, NULL, NULL);
scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]);
cpi->scaled_ref_idx[i] = new_fb;
} else {
@@ -2671,22 +2593,20 @@ static void full_to_model_count(unsigned int *model_count,
model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN];
model_count[ONE_TOKEN] = full_count[ONE_TOKEN];
model_count[TWO_TOKEN] = full_count[TWO_TOKEN];
- for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n)
+ for (n = THREE_TOKEN; n < EOB_TOKEN; ++n)
model_count[TWO_TOKEN] += full_count[n];
- model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN];
+ model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
}
-static void full_to_model_counts(
- vp9_coeff_count_model *model_count, vp9_coeff_count *full_count) {
+static void full_to_model_counts(vp9_coeff_count_model *model_count,
+ vp9_coeff_count *full_count) {
int i, j, k, l;
- for (i = 0; i < BLOCK_TYPES; ++i)
+
+ for (i = 0; i < PLANE_TYPES; ++i)
for (j = 0; j < REF_TYPES; ++j)
for (k = 0; k < COEF_BANDS; ++k)
- for (l = 0; l < PREV_COEF_CONTEXTS; ++l) {
- if (l >= 3 && k == 0)
- continue;
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
full_to_model_count(model_count[i][j][k][l], full_count[i][j][k][l]);
- }
}
#if 0 && CONFIG_INTERNAL_STATS
@@ -2701,28 +2621,28 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
if (cpi->twopass.total_left_stats.coded_error != 0.0)
fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d"
- "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
- "%6d %6d %5d %5d %5d %8.2f %10d %10.3f"
+ "%7.2f %7.2f %7.2f %7.2f %7.2f %7.2f"
+ "%6d %6d %5d %5d %5d %10d %10.3f"
"%10.3f %8d %10d %10d %10d\n",
- cpi->common.current_video_frame, cpi->this_frame_target,
- cpi->projected_frame_size, 0,
- (cpi->projected_frame_size - cpi->this_frame_target),
- (int)cpi->total_target_vs_actual,
- (int)(cpi->oxcf.starting_buffer_level - cpi->bits_off_target),
- (int)cpi->total_actual_bits, cm->base_qindex,
+ cpi->common.current_video_frame, cpi->rc.this_frame_target,
+ cpi->rc.projected_frame_size, 0,
+ (cpi->rc.projected_frame_size - cpi->rc.this_frame_target),
+ (int)cpi->rc.total_target_vs_actual,
+ (int)(cpi->oxcf.starting_buffer_level - cpi->rc.bits_off_target),
+ (int)cpi->rc.total_actual_bits, cm->base_qindex,
vp9_convert_qindex_to_q(cm->base_qindex),
(double)vp9_dc_quant(cm->base_qindex, 0) / 4.0,
- vp9_convert_qindex_to_q(cpi->active_best_quality),
- vp9_convert_qindex_to_q(cpi->active_worst_quality), cpi->avg_q,
- vp9_convert_qindex_to_q(cpi->ni_av_qi),
+ vp9_convert_qindex_to_q(cpi->rc.active_worst_quality), cpi->rc.avg_q,
+ vp9_convert_qindex_to_q(cpi->rc.ni_av_qi),
vp9_convert_qindex_to_q(cpi->cq_target_quality),
cpi->refresh_last_frame, cpi->refresh_golden_frame,
- cpi->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost,
- cpi->twopass.est_max_qcorrection_factor, (int)cpi->twopass.bits_left,
+ cpi->refresh_alt_ref_frame, cm->frame_type, cpi->rc.gfu_boost,
+ (int)cpi->twopass.bits_left,
cpi->twopass.total_left_stats.coded_error,
(double)cpi->twopass.bits_left /
(1 + cpi->twopass.total_left_stats.coded_error),
- cpi->tot_recode_hits, recon_err, cpi->kf_boost, cpi->kf_zeromotion_pct);
+ cpi->tot_recode_hits, recon_err, cpi->rc.kf_boost,
+ cpi->kf_zeromotion_pct);
fclose(f);
@@ -2746,221 +2666,215 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
}
#endif
-static int pick_q_and_adjust_q_bounds(VP9_COMP *cpi,
- int * bottom_index, int * top_index) {
- // Set an active best quality and if necessary active worst quality
- int q = cpi->active_worst_quality;
+static void encode_with_recode_loop(VP9_COMP *cpi,
+ size_t *size,
+ uint8_t *dest,
+ int *q,
+ int bottom_index,
+ int top_index,
+ int frame_over_shoot_limit,
+ int frame_under_shoot_limit) {
VP9_COMMON *const cm = &cpi->common;
+ int loop_count = 0;
+ int loop = 0;
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+ int q_low = bottom_index, q_high = top_index;
- if (frame_is_intra_only(cm)) {
-#if !CONFIG_MULTIPLE_ARF
- // Handle the special case for key frames forced when we have reached
- // the maximum key frame interval. Here force the Q to a range
- // based on the ambient Q to reduce the risk of popping.
- if (cpi->this_key_frame_forced) {
- int delta_qindex;
- int qindex = cpi->last_boosted_qindex;
- double last_boosted_q = vp9_convert_qindex_to_q(qindex);
-
- delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
- (last_boosted_q * 0.75));
-
- cpi->active_best_quality = MAX(qindex + delta_qindex,
- cpi->best_quality);
- } else {
- int high = 5000;
- int low = 400;
- double q_adj_factor = 1.0;
- double q_val;
-
- // Baseline value derived from cpi->active_worst_quality and kf boost
- cpi->active_best_quality = get_active_quality(q, cpi->kf_boost,
- low, high,
- kf_low_motion_minq,
- kf_high_motion_minq);
-
- // Allow somewhat lower kf minq with small image formats.
- if ((cm->width * cm->height) <= (352 * 288)) {
- q_adj_factor -= 0.25;
- }
-
- // Make a further adjustment based on the kf zero motion measure.
- q_adj_factor += 0.05 - (0.001 * (double)cpi->kf_zeromotion_pct);
+ do {
+ vp9_clear_system_state(); // __asm emms;
- // Convert the adjustment factor to a qindex delta
- // on active_best_quality.
- q_val = vp9_convert_qindex_to_q(cpi->active_best_quality);
- cpi->active_best_quality +=
- vp9_compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
- }
-#else
- double current_q;
- // Force the KF quantizer to be 30% of the active_worst_quality.
- current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
- cpi->active_best_quality = cpi->active_worst_quality
- + vp9_compute_qdelta(cpi, current_q, current_q * 0.3);
-#endif
- } else if (!cpi->is_src_frame_alt_ref &&
- (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
- int high = 2000;
- int low = 400;
-
- // Use the lower of cpi->active_worst_quality and recent
- // average Q as basis for GF/ARF best Q limit unless last frame was
- // a key frame.
- if (cpi->frames_since_key > 1 &&
- cpi->avg_frame_qindex < cpi->active_worst_quality) {
- q = cpi->avg_frame_qindex;
- }
- // For constrained quality dont allow Q less than the cq level
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- if (q < cpi->cq_target_quality)
- q = cpi->cq_target_quality;
- if (cpi->frames_since_key > 1) {
- cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
- low, high,
- afq_low_motion_minq,
- afq_high_motion_minq);
- } else {
- cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
- low, high,
- gf_low_motion_minq,
- gf_high_motion_minq);
- }
- // Constrained quality use slightly lower active best.
- cpi->active_best_quality = cpi->active_best_quality * 15 / 16;
+ vp9_set_quantizer(cpi, *q);
- } else if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
- if (!cpi->refresh_alt_ref_frame) {
- cpi->active_best_quality = cpi->cq_target_quality;
+ if (loop_count == 0) {
+ // Set up entropy context depending on frame type. The decoder mandates
+ // the use of the default context, index 0, for keyframes and inter
+ // frames where the error_resilient_mode or intra_only flag is set. For
+ // other inter-frames the encoder currently uses only two contexts;
+ // context 1 for ALTREF frames and context 0 for the others.
+ if (cm->frame_type == KEY_FRAME) {
+ vp9_setup_key_frame(cpi);
} else {
- if (cpi->frames_since_key > 1) {
- cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
- low, high,
- afq_low_motion_minq,
- afq_high_motion_minq);
- } else {
- cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
- low, high,
- gf_low_motion_minq,
- gf_high_motion_minq);
+ if (!cm->intra_only && !cm->error_resilient_mode) {
+ cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
}
+ vp9_setup_inter_frame(cpi);
}
- } else {
- cpi->active_best_quality = get_active_quality(q, cpi->gfu_boost,
- low, high,
- gf_low_motion_minq,
- gf_high_motion_minq);
}
- } else {
- if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
- cpi->active_best_quality = cpi->cq_target_quality;
- } else {
-#ifdef ONE_SHOT_Q_ESTIMATE
-#ifdef STRICT_ONE_SHOT_Q
- cpi->active_best_quality = q;
-#else
- cpi->active_best_quality = inter_minq[q];
-#endif
-#else
- cpi->active_best_quality = inter_minq[q];
- // 1-pass: for now, use the average Q for the active_best, if its lower
- // than active_worst.
- if (cpi->pass == 0 && (cpi->avg_frame_qindex < q))
- cpi->active_best_quality = inter_minq[cpi->avg_frame_qindex];
-#endif
- // For the constrained quality mode we don't want
- // q to fall below the cq level.
- if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
- (cpi->active_best_quality < cpi->cq_target_quality)) {
- // If we are strongly undershooting the target rate in the last
- // frames then use the user passed in cq value not the auto
- // cq value.
- if (cpi->rolling_actual_bits < cpi->min_frame_bandwidth)
- cpi->active_best_quality = cpi->oxcf.cq_level;
- else
- cpi->active_best_quality = cpi->cq_target_quality;
- }
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ vp9_vaq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ setup_in_frame_q_adj(cpi);
}
- }
- // Clip the active best and worst quality values to limits
- if (cpi->active_worst_quality > cpi->worst_quality)
- cpi->active_worst_quality = cpi->worst_quality;
+ // transform / motion compensation build reconstruction frame
+
+ vp9_encode_frame(cpi);
- if (cpi->active_best_quality < cpi->best_quality)
- cpi->active_best_quality = cpi->best_quality;
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ // update_base_skip_probs(cpi);
- if (cpi->active_best_quality > cpi->worst_quality)
- cpi->active_best_quality = cpi->worst_quality;
+ vp9_clear_system_state(); // __asm emms;
- if (cpi->active_worst_quality < cpi->active_best_quality)
- cpi->active_worst_quality = cpi->active_best_quality;
+ // Dummy pack of the bitstream using up to date stats to get an
+ // accurate estimate of output frame size to determine if we need
+ // to recode.
+ vp9_save_coding_context(cpi);
+ cpi->dummy_packing = 1;
+ vp9_pack_bitstream(cpi, dest, size);
+ cpi->rc.projected_frame_size = (*size) << 3;
+ vp9_restore_coding_context(cpi);
- // Limit Q range for the adaptive loop.
- if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
- *top_index =
- (cpi->active_worst_quality + cpi->active_best_quality * 3) / 4;
- // If this is the first (key) frame in 1-pass, active best is the user
- // best-allowed, and leave the top_index to active_worst.
- if (cpi->pass == 0 && cpi->common.current_video_frame == 0) {
- cpi->active_best_quality = cpi->oxcf.best_allowed_q;
- *top_index = cpi->oxcf.worst_allowed_q;
- }
- } else if (!cpi->is_src_frame_alt_ref &&
- (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
- (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
- *top_index =
- (cpi->active_worst_quality + cpi->active_best_quality) / 2;
- } else {
- *top_index = cpi->active_worst_quality;
- }
- *bottom_index = cpi->active_best_quality;
+ if (frame_over_shoot_limit == 0)
+ frame_over_shoot_limit = 1;
- if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
- q = cpi->active_best_quality;
- // Special case code to try and match quality with forced key frames
- } else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
- q = cpi->last_boosted_qindex;
- } else {
- // Determine initial Q to try.
- if (cpi->pass == 0) {
- // 1-pass: for now, use per-frame-bw for target size of frame, scaled
- // by |x| for key frame.
- int scale = (cm->frame_type == KEY_FRAME) ? 5 : 1;
- q = vp9_regulate_q(cpi, scale * cpi->av_per_frame_bandwidth);
+ if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ loop = 0;
} else {
- q = vp9_regulate_q(cpi, cpi->this_frame_target);
+ // Special case handling for forced key frames
+ if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+ int last_q = *q;
+ int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
+
+ int high_err_target = cpi->ambient_err;
+ int low_err_target = cpi->ambient_err >> 1;
+
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += !kf_err;
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if ((kf_err > high_err_target &&
+ cpi->rc.projected_frame_size <= frame_over_shoot_limit) ||
+ (kf_err > low_err_target &&
+ cpi->rc.projected_frame_size <= frame_under_shoot_limit)) {
+ // Lower q_high
+ q_high = *q > q_low ? *q - 1 : q_low;
+
+ // Adjust Q
+ *q = ((*q) * high_err_target) / kf_err;
+ *q = MIN((*q), (q_high + q_low) >> 1);
+ } else if (kf_err < low_err_target &&
+ cpi->rc.projected_frame_size >= frame_under_shoot_limit) {
+ // The key frame is much better than the previous frame
+ // Raise q_low
+ q_low = *q < q_high ? *q + 1 : q_high;
+
+ // Adjust Q
+ *q = ((*q) * low_err_target) / kf_err;
+ *q = MIN((*q), (q_high + q_low + 1) >> 1);
+ }
+
+ // Clamp Q to upper and lower limits:
+ *q = clamp(*q, q_low, q_high);
+
+ loop = *q != last_q;
+ } else if (recode_loop_test(
+ cpi, frame_over_shoot_limit, frame_under_shoot_limit,
+ *q, top_index, bottom_index)) {
+ // Is the projected frame size out of range and are we allowed
+ // to attempt to recode.
+ int last_q = *q;
+ int retries = 0;
+
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+
+ // Frame is too large
+ if (cpi->rc.projected_frame_size > cpi->rc.this_frame_target) {
+          // Raise q_low to at least the current value
+ q_low = *q < q_high ? *q + 1 : q_high;
+
+ if (undershoot_seen || loop_count > 1) {
+            // Update rate_correction_factor.
+ vp9_rc_update_rate_correction_factors(cpi, 1);
+
+ *q = (q_high + q_low + 1) / 2;
+ } else {
+            // Update rate_correction_factor.
+ vp9_rc_update_rate_correction_factors(cpi, 0);
+
+ *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+ bottom_index, top_index);
+
+ while (*q < q_low && retries < 10) {
+ vp9_rc_update_rate_correction_factors(cpi, 0);
+ *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+ bottom_index, top_index);
+ retries++;
+ }
+ }
+
+ overshoot_seen = 1;
+ } else {
+ // Frame is too small
+ q_high = *q > q_low ? *q - 1 : q_low;
+
+ if (overshoot_seen || loop_count > 1) {
+ vp9_rc_update_rate_correction_factors(cpi, 1);
+ *q = (q_high + q_low) / 2;
+ } else {
+ vp9_rc_update_rate_correction_factors(cpi, 0);
+ *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+ bottom_index, top_index);
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+            // the user passed in value.
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY &&
+ *q < q_low) {
+ q_low = *q;
+ }
+
+ while (*q > q_high && retries < 10) {
+ vp9_rc_update_rate_correction_factors(cpi, 0);
+ *q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+ bottom_index, top_index);
+ retries++;
+ }
+ }
+
+ undershoot_seen = 1;
+ }
+
+ // Clamp Q to upper and lower limits:
+ *q = clamp(*q, q_low, q_high);
+
+ loop = *q != last_q;
+ } else {
+ loop = 0;
+ }
}
- if (q > *top_index)
- q = *top_index;
- }
- return q;
+ if (cpi->is_src_frame_alt_ref)
+ loop = 0;
+
+ if (loop) {
+ loop_count++;
+
+#if CONFIG_INTERNAL_STATS
+ cpi->tot_recode_hits++;
+#endif
+ }
+ } while (loop);
}
+
static void encode_frame_to_data_rate(VP9_COMP *cpi,
- unsigned long *size,
- unsigned char *dest,
+ size_t *size,
+ uint8_t *dest,
unsigned int *frame_flags) {
VP9_COMMON *const cm = &cpi->common;
TX_SIZE t;
int q;
int frame_over_shoot_limit;
int frame_under_shoot_limit;
-
- int loop = 0;
- int loop_count;
-
- int q_low;
- int q_high;
-
int top_index;
+ int top_index_prop;
int bottom_index;
- int active_worst_qchanged = 0;
-
- int overshoot_seen = 0;
- int undershoot_seen = 0;
SPEED_FEATURES *const sf = &cpi->sf;
unsigned int max_mv_def = MIN(cpi->common.width, cpi->common.height);
@@ -2983,7 +2897,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
// pass function that sets the target bandwidth so we must set it here.
if (cpi->refresh_alt_ref_frame) {
// Set a per frame bit target for the alt ref frame.
- cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
+ cpi->rc.per_frame_bandwidth = cpi->twopass.gf_bits;
// Set a per second target bitrate.
cpi->target_bandwidth = (int)(cpi->twopass.gf_bits * cpi->output_framerate);
}
@@ -3071,56 +2985,13 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
configure_static_seg_features(cpi);
}
- // Decide how big to make the frame.
- vp9_pick_frame_size(cpi);
-
vp9_clear_system_state();
- q = pick_q_and_adjust_q_bounds(cpi, &bottom_index, &top_index);
-
- q_high = top_index;
- q_low = bottom_index;
-
- vp9_compute_frame_size_bounds(cpi, &frame_under_shoot_limit,
- &frame_over_shoot_limit);
-
-#if CONFIG_MULTIPLE_ARF
- // Force the quantizer determined by the coding order pattern.
- if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
- cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) {
- double new_q;
- double current_q = vp9_convert_qindex_to_q(cpi->active_worst_quality);
- int level = cpi->this_frame_weight;
- assert(level >= 0);
-
- // Set quantizer steps at 10% increments.
- new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
- q = cpi->active_worst_quality + vp9_compute_qdelta(cpi, current_q, new_q);
-
- bottom_index = q;
- top_index = q;
- q_low = q;
- q_high = q;
-
- printf("frame:%d q:%d\n", cm->current_video_frame, q);
- }
-#endif
-
- loop_count = 0;
vp9_zero(cpi->rd_tx_select_threshes);
- if (!frame_is_intra_only(cm)) {
- cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
- /* TODO: Decide this more intelligently */
- cm->allow_high_precision_mv = q < HIGH_PRECISION_MV_QTHRESH;
- set_mvcost(cpi);
- }
-
#if CONFIG_VP9_POSTPROC
-
if (cpi->oxcf.noise_sensitivity > 0) {
int l = 0;
-
switch (cpi->oxcf.noise_sensitivity) {
case 1:
l = 20;
@@ -3139,201 +3010,42 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
l = 150;
break;
}
-
vp9_denoise(cpi->Source, cpi->Source, l);
}
-
#endif
#ifdef OUTPUT_YUV_SRC
vp9_write_yuv_frame(cpi->Source);
#endif
- do {
- vp9_clear_system_state(); // __asm emms;
-
- vp9_set_quantizer(cpi, q);
-
- if (loop_count == 0) {
- // Set up entropy context depending on frame type. The decoder mandates
- // the use of the default context, index 0, for keyframes and inter
- // frames where the error_resilient_mode or intra_only flag is set. For
- // other inter-frames the encoder currently uses only two contexts;
- // context 1 for ALTREF frames and context 0 for the others.
- if (cm->frame_type == KEY_FRAME) {
- vp9_setup_key_frame(cpi);
- } else {
- if (!cm->intra_only && !cm->error_resilient_mode) {
- cpi->common.frame_context_idx = cpi->refresh_alt_ref_frame;
- }
- vp9_setup_inter_frame(cpi);
- }
- }
-
- if (cpi->sf.variance_adaptive_quantization) {
- vp9_vaq_frame_setup(cpi);
- }
-
- // transform / motion compensation build reconstruction frame
-
- vp9_encode_frame(cpi);
-
- // Update the skip mb flag probabilities based on the distribution
- // seen in the last encoder iteration.
- // update_base_skip_probs(cpi);
-
- vp9_clear_system_state(); // __asm emms;
-
- // Dummy pack of the bitstream using up to date stats to get an
- // accurate estimate of output frame size to determine if we need
- // to recode.
- vp9_save_coding_context(cpi);
- cpi->dummy_packing = 1;
- vp9_pack_bitstream(cpi, dest, size);
- cpi->projected_frame_size = (*size) << 3;
- vp9_restore_coding_context(cpi);
-
- if (frame_over_shoot_limit == 0)
- frame_over_shoot_limit = 1;
- active_worst_qchanged = 0;
-
- if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
- loop = 0;
- } else {
- // Special case handling for forced key frames
- if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
- int last_q = q;
- int kf_err = vp9_calc_ss_err(cpi->Source, get_frame_new_buffer(cm));
-
- int high_err_target = cpi->ambient_err;
- int low_err_target = cpi->ambient_err >> 1;
-
- // Prevent possible divide by zero error below for perfect KF
- kf_err += !kf_err;
-
- // The key frame is not good enough or we can afford
- // to make it better without undue risk of popping.
- if ((kf_err > high_err_target &&
- cpi->projected_frame_size <= frame_over_shoot_limit) ||
- (kf_err > low_err_target &&
- cpi->projected_frame_size <= frame_under_shoot_limit)) {
- // Lower q_high
- q_high = q > q_low ? q - 1 : q_low;
-
- // Adjust Q
- q = (q * high_err_target) / kf_err;
- q = MIN(q, (q_high + q_low) >> 1);
- } else if (kf_err < low_err_target &&
- cpi->projected_frame_size >= frame_under_shoot_limit) {
- // The key frame is much better than the previous frame
- // Raise q_low
- q_low = q < q_high ? q + 1 : q_high;
-
- // Adjust Q
- q = (q * low_err_target) / kf_err;
- q = MIN(q, (q_high + q_low + 1) >> 1);
- }
-
- // Clamp Q to upper and lower limits:
- q = clamp(q, q_low, q_high);
-
- loop = q != last_q;
- } else if (recode_loop_test(
- cpi, frame_over_shoot_limit, frame_under_shoot_limit,
- q, top_index, bottom_index)) {
- // Is the projected frame size out of range and are we allowed
- // to attempt to recode.
- int last_q = q;
- int retries = 0;
-
- // Frame size out of permitted range:
- // Update correction factor & compute new Q to try...
-
- // Frame is too large
- if (cpi->projected_frame_size > cpi->this_frame_target) {
- // Raise Qlow as to at least the current value
- q_low = q < q_high ? q + 1 : q_high;
-
- if (undershoot_seen || loop_count > 1) {
- // Update rate_correction_factor unless
- // cpi->active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 1);
-
- q = (q_high + q_low + 1) / 2;
- } else {
- // Update rate_correction_factor unless
- // cpi->active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 0);
-
- q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
- while (q < q_low && retries < 10) {
- vp9_update_rate_correction_factors(cpi, 0);
- q = vp9_regulate_q(cpi, cpi->this_frame_target);
- retries++;
- }
- }
-
- overshoot_seen = 1;
- } else {
- // Frame is too small
- q_high = q > q_low ? q - 1 : q_low;
-
- if (overshoot_seen || loop_count > 1) {
- // Update rate_correction_factor unless
- // cpi->active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 1);
-
- q = (q_high + q_low) / 2;
- } else {
- // Update rate_correction_factor unless
- // cpi->active_worst_quality has changed.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, 0);
-
- q = vp9_regulate_q(cpi, cpi->this_frame_target);
-
- // Special case reset for qlow for constrained quality.
- // This should only trigger where there is very substantial
- // undershoot on a frame and the auto cq level is above
- // the user passsed in value.
- if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY && q < q_low) {
- q_low = q;
- }
-
- while (q > q_high && retries < 10) {
- vp9_update_rate_correction_factors(cpi, 0);
- q = vp9_regulate_q(cpi, cpi->this_frame_target);
- retries++;
- }
- }
-
- undershoot_seen = 1;
- }
-
- // Clamp Q to upper and lower limits:
- q = clamp(q, q_low, q_high);
+ // Decide how big to make the frame.
+ vp9_rc_pick_frame_size_target(cpi);
- loop = q != last_q;
- } else {
- loop = 0;
- }
- }
+ // Decide frame size bounds
+ vp9_rc_compute_frame_size_bounds(cpi, cpi->rc.this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
- if (cpi->is_src_frame_alt_ref)
- loop = 0;
+ // Decide q and q bounds
+ q = vp9_rc_pick_q_and_adjust_q_bounds(cpi,
+ &bottom_index,
+ &top_index,
+ &top_index_prop);
- if (loop) {
- loop_count++;
+ if (!frame_is_intra_only(cm)) {
+ cm->mcomp_filter_type = DEFAULT_INTERP_FILTER;
+ /* TODO: Decide this more intelligently */
+ set_high_precision_mv(cpi, (q < HIGH_PRECISION_MV_QTHRESH));
+ }
-#if CONFIG_INTERNAL_STATS
- cpi->tot_recode_hits++;
-#endif
- }
- } while (loop);
+ encode_with_recode_loop(cpi,
+ size,
+ dest,
+ &q,
+ bottom_index,
+ top_index,
+ frame_over_shoot_limit,
+ frame_under_shoot_limit);
// Special case code to reduce pulsing when key frames are forced at a
// fixed interval. Note the reconstruction error if it is the frame before
@@ -3391,7 +3103,6 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
vp9_copy(counts->y_mode, cpi->y_mode_count);
vp9_copy(counts->uv_mode, cpi->y_uv_mode_count);
- vp9_copy(counts->partition, cpi->partition_count);
vp9_copy(counts->intra_inter, cpi->intra_inter_count);
vp9_copy(counts->comp_inter, cpi->comp_inter_count);
vp9_copy(counts->single_ref, cpi->single_ref_count);
@@ -3412,102 +3123,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
* needed in motion search besides loopfilter */
cm->last_frame_type = cm->frame_type;
- // Update rate control heuristics
- cpi->total_byte_count += (*size);
- cpi->projected_frame_size = (*size) << 3;
-
- // Post encode loop adjustment of Q prediction.
- if (!active_worst_qchanged)
- vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop) ? 2 : 0);
-
- cpi->last_q[cm->frame_type] = cm->base_qindex;
-
- // Keep record of last boosted (KF/KF/ARF) Q value.
- // If the current frame is coded at a lower Q then we also update it.
- // If all mbs in this group are skipped only update if the Q value is
- // better than that already stored.
- // This is used to help set quality in forced key frames to reduce popping
- if ((cm->base_qindex < cpi->last_boosted_qindex) ||
- ((cpi->static_mb_pct < 100) &&
- ((cm->frame_type == KEY_FRAME) ||
- cpi->refresh_alt_ref_frame ||
- (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
- cpi->last_boosted_qindex = cm->base_qindex;
- }
-
- if (cm->frame_type == KEY_FRAME) {
- vp9_adjust_key_frame_context(cpi);
- }
-
- // Keep a record of ambient average Q.
- if (cm->frame_type != KEY_FRAME)
- cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex +
- cm->base_qindex) >> 2;
-
- // Keep a record from which we can calculate the average Q excluding GF
- // updates and key frames.
- if (cm->frame_type != KEY_FRAME &&
- !cpi->refresh_golden_frame &&
- !cpi->refresh_alt_ref_frame) {
- cpi->ni_frames++;
- cpi->tot_q += vp9_convert_qindex_to_q(q);
- cpi->avg_q = cpi->tot_q / (double)cpi->ni_frames;
-
- // Calculate the average Q for normal inter frames (not key or GFU frames).
- cpi->ni_tot_qi += q;
- cpi->ni_av_qi = cpi->ni_tot_qi / cpi->ni_frames;
- }
-
- // Update the buffer level variable.
- // Non-viewable frames are a special case and are treated as pure overhead.
- if (!cm->show_frame)
- cpi->bits_off_target -= cpi->projected_frame_size;
- else
- cpi->bits_off_target += cpi->av_per_frame_bandwidth -
- cpi->projected_frame_size;
-
- // Clip the buffer level at the maximum buffer size
- if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size)
- cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
-
- // Rolling monitors of whether we are over or underspending used to help
- // regulate min and Max Q in two pass.
- if (cm->frame_type != KEY_FRAME) {
- cpi->rolling_target_bits =
- ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
- cpi->rolling_actual_bits =
- ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
- cpi->long_rolling_target_bits =
- ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
- cpi->long_rolling_actual_bits =
- ((cpi->long_rolling_actual_bits * 31) +
- cpi->projected_frame_size + 16) / 32;
- }
-
- // Actual bits spent
- cpi->total_actual_bits += cpi->projected_frame_size;
-
- // Debug stats
- cpi->total_target_vs_actual += (cpi->this_frame_target -
- cpi->projected_frame_size);
-
- cpi->buffer_level = cpi->bits_off_target;
-
-#ifndef DISABLE_RC_LONG_TERM_MEM
- // Update bits left to the kf and gf groups to account for overshoot or
- // undershoot on these frames
- if (cm->frame_type == KEY_FRAME) {
- cpi->twopass.kf_group_bits += cpi->this_frame_target -
- cpi->projected_frame_size;
-
- cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
- } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
- cpi->twopass.gf_group_bits += cpi->this_frame_target -
- cpi->projected_frame_size;
-
- cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
- }
-#endif
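+  // The inline rate control bookkeeping removed above (buffer level, rolling
+  // bit counters, last/boosted Q tracking) is replaced by this single call.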
+ vp9_rc_postencode_update(cpi, *size, top_index_prop);
#if 0
output_frame_level_debug_stats(cpi);
@@ -3628,8 +3244,23 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mode_info_stride + 1;
}
-static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
- unsigned char *dest, unsigned int *frame_flags) {
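+// Per-pass entry points: Pass0 is the one-pass encode, Pass1 gathers first
+// pass stats at a fixed first-pass quantizer, Pass2 runs the final two-pass
+// encode.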
+static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
+ unsigned int *frame_flags) {
+ encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+}
+
+static void Pass1Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
+ unsigned int *frame_flags) {
+ (void) size;
+ (void) dest;
+ (void) frame_flags;
+
+ vp9_set_quantizer(cpi, find_fp_qindex());
+ vp9_first_pass(cpi);
+}
+
+static void Pass2Encode(VP9_COMP *cpi, size_t *size,
+ uint8_t *dest, unsigned int *frame_flags) {
cpi->enable_encode_breakout = 1;
if (!cpi->refresh_alt_ref_frame)
@@ -3637,34 +3268,17 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
encode_frame_to_data_rate(cpi, size, dest, frame_flags);
// vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
-#ifdef DISABLE_RC_LONG_TERM_MEM
- cpi->twopass.bits_left -= cpi->this_frame_target;
-#else
- cpi->twopass.bits_left -= 8 * *size;
-#endif
-
- if (!cpi->refresh_alt_ref_frame) {
- double lower_bounds_min_rate = FRAME_OVERHEAD_BITS * cpi->oxcf.framerate;
- double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
- * cpi->oxcf.two_pass_vbrmin_section
- / 100);
- if (two_pass_min_rate < lower_bounds_min_rate)
- two_pass_min_rate = lower_bounds_min_rate;
-
- cpi->twopass.bits_left += (int64_t)(two_pass_min_rate
- / cpi->oxcf.framerate);
- }
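+  // The two-pass bits_left accounting removed above is replaced by this call.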
+ vp9_twopass_postencode_update(cpi, *size);
}
static void check_initial_width(VP9_COMP *cpi, YV12_BUFFER_CONFIG *sd) {
- VP9_COMMON *cm = &cpi->common;
+ VP9_COMMON *const cm = &cpi->common;
if (!cpi->initial_width) {
- // TODO(jkoleszar): Support 1/4 subsampling?
- cm->subsampling_x = (sd != NULL) && sd->uv_width < sd->y_width;
- cm->subsampling_y = (sd != NULL) && sd->uv_height < sd->y_height;
+ // TODO(agrange) Subsampling defaults to assuming sampled chroma.
+ cm->subsampling_x = sd != NULL ? (sd->uv_width < sd->y_width) : 1;
+ cm->subsampling_y = sd != NULL ? (sd->uv_height < sd->y_height) : 1;
alloc_raw_frame_buffers(cpi);
-
cpi->initial_width = cm->width;
cpi->initial_height = cm->height;
}
@@ -3711,7 +3325,7 @@ int is_next_frame_arf(VP9_COMP *cpi) {
#endif
int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
- unsigned long *size, unsigned char *dest,
+ size_t *size, uint8_t *dest,
int64_t *time_stamp, int64_t *time_end, int flush) {
VP9_COMP *cpi = (VP9_COMP *) ptr;
VP9_COMMON *cm = &cpi->common;
@@ -3727,8 +3341,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
cpi->source = NULL;
- cpi->common.allow_high_precision_mv = ALTREF_HIGH_PRECISION_MV;
- set_mvcost(cpi);
+ set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV);
// Should we code an alternate reference frame.
if (cpi->oxcf.play_alternate && cpi->source_alt_ref_pending) {
@@ -3743,7 +3356,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
- cpi->next_frame_in_order;
else
#endif
- frames_to_arf = cpi->frames_till_gf_update_due;
+ frames_to_arf = cpi->rc.frames_till_gf_update_due;
assert(frames_to_arf < cpi->twopass.frames_to_key);
@@ -3758,7 +3371,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
// Produce the filtered ARF frame.
// TODO(agrange) merge these two functions.
configure_arnr_filter(cpi, cm->current_video_frame + frames_to_arf,
- cpi->gfu_boost);
+ cpi->rc.gfu_boost);
vp9_temporal_filter_prepare(cpi, frames_to_arf);
vp9_extend_frame_borders(&cpi->alt_ref_buffer,
cm->subsampling_x, cm->subsampling_y);
@@ -3947,15 +3560,19 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
vp9_realloc_frame_buffer(get_frame_new_buffer(cm),
cm->width, cm->height,
cm->subsampling_x, cm->subsampling_y,
- VP9BORDERINPIXELS);
+ VP9BORDERINPIXELS, NULL, NULL, NULL);
// Calculate scaling factors for each of the 3 available references
- for (i = 0; i < ALLOWED_REFS_PER_FRAME; ++i)
+ for (i = 0; i < REFS_PER_FRAME; ++i) {
vp9_setup_scale_factors(cm, i);
+ if (vp9_is_scaled(&cm->active_ref_scale_comm[i]))
+ vp9_extend_frame_borders(&cm->yv12_fb[cm->active_ref_idx[i]],
+ cm->subsampling_x, cm->subsampling_y);
+ }
vp9_setup_interp_filters(&cpi->mb.e_mbd, DEFAULT_INTERP_FILTER, cm);
- if (cpi->sf.variance_adaptive_quantization) {
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
vp9_vaq_init();
}
@@ -3964,7 +3581,8 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags,
} else if (cpi->pass == 2) {
Pass2Encode(cpi, size, dest, frame_flags);
} else {
- encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+ // One pass encode
+ Pass0Encode(cpi, size, dest, frame_flags);
}
if (cm->refresh_frame_context)
@@ -4256,37 +3874,9 @@ int vp9_set_size_literal(VP9_PTR comp, unsigned int width,
return 0;
}
-int vp9_switch_layer(VP9_PTR comp, int layer) {
- VP9_COMP *cpi = (VP9_COMP *)comp;
-
- if (cpi->use_svc) {
- cpi->current_layer = layer;
-
- // Use buffer i for layer i LST
- cpi->lst_fb_idx = layer;
-
- // Use buffer i-1 for layer i Alt (Inter-layer prediction)
- if (layer != 0) cpi->alt_fb_idx = layer - 1;
-
- // Use the rest for Golden
- if (layer < 2 * cpi->number_spatial_layers - NUM_REF_FRAMES)
- cpi->gld_fb_idx = cpi->lst_fb_idx;
- else
- cpi->gld_fb_idx = 2 * cpi->number_spatial_layers - 1 - layer;
-
- printf("Switching to layer %d:\n", layer);
- printf("Using references: LST/GLD/ALT [%d|%d|%d]\n", cpi->lst_fb_idx,
- cpi->gld_fb_idx, cpi->alt_fb_idx);
- } else {
- printf("Switching layer not supported. Enable SVC first \n");
- }
- return 0;
-}
-
void vp9_set_svc(VP9_PTR comp, int use_svc) {
VP9_COMP *cpi = (VP9_COMP *)comp;
cpi->use_svc = use_svc;
- if (cpi->use_svc) printf("Enabled SVC encoder \n");
return;
}
diff --git a/source/libvpx/vp9/encoder/vp9_onyx_int.h b/source/libvpx/vp9/encoder/vp9_onyx_int.h
index 0498043..8f2ffc9 100644
--- a/source/libvpx/vp9/encoder/vp9_onyx_int.h
+++ b/source/libvpx/vp9/encoder/vp9_onyx_int.h
@@ -29,13 +29,7 @@
#include "vp9/common/vp9_findnearmv.h"
#include "vp9/encoder/vp9_lookahead.h"
-// Experimental rate control switches
-#if CONFIG_ONESHOTQ
-#define ONE_SHOT_Q_ESTIMATE 0
-#define STRICT_ONE_SHOT_Q 0
-#endif
#define DISABLE_RC_LONG_TERM_MEM 0
-
// #define MODE_TEST_HIT_STATS
// #define SPEEDSTATS 1
@@ -253,7 +247,6 @@ typedef struct {
int auto_mv_step_size;
int optimize_coefficients;
int static_segmentation;
- int variance_adaptive_quantization;
int comp_inter_joint_search_thresh;
int adaptive_rd_thresh;
int skip_encode_sb;
@@ -277,6 +270,7 @@ typedef struct {
int using_small_partition_info;
// TODO(jingning): combine the related motion search speed features
int adaptive_motion_search;
+ int adaptive_pred_filter_type;
// Implements various heuristics to skip searching modes
// The heuristics selected are based on flags
@@ -295,6 +289,59 @@ typedef struct {
int use_fast_coef_updates; // 0: 2-loop, 1: 1-loop, 2: 1-loop reduced
} SPEED_FEATURES;
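+// Rate control state pulled out of VP9_COMP: per-frame targets, rate
+// correction factors, Q history and buffer/bit-count tracking now live in
+// cpi->rc.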
+typedef struct {
+ // Rate targetting variables
+ int this_frame_target;
+ int projected_frame_size;
+ int sb64_target_rate;
+ int last_q[2]; // Separate values for Intra/Inter
+ int last_boosted_qindex; // Last boosted GF/KF/ARF q
+
+ int gfu_boost;
+ int last_boost;
+ int kf_boost;
+
+ double rate_correction_factor;
+ double key_frame_rate_correction_factor;
+ double gf_rate_correction_factor;
+
+ unsigned int frames_since_golden;
+ int frames_till_gf_update_due; // Count down till next GF
+
+ int max_gf_interval;
+ int baseline_gf_interval;
+
+ int64_t key_frame_count;
+ int prior_key_frame_distance[KEY_FRAME_CONTEXT];
+ int per_frame_bandwidth; // Current section per frame bandwidth target
+ int av_per_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex;
+ double tot_q;
+ double avg_q;
+
+ int buffer_level;
+ int bits_off_target;
+
+ int rolling_target_bits;
+ int rolling_actual_bits;
+
+ int long_rolling_target_bits;
+ int long_rolling_actual_bits;
+
+ int64_t total_actual_bits;
+ int total_target_vs_actual; // debug stats
+
+ int worst_quality;
+ int active_worst_quality;
+ int best_quality;
+ // int active_best_quality;
+} RATE_CONTROL;
+
typedef struct VP9_COMP {
DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
@@ -317,11 +364,10 @@ typedef struct VP9_COMP {
VP9_COMMON common;
VP9_CONFIG oxcf;
struct rdcost_block_args rdcost_stack;
-
struct lookahead_ctx *lookahead;
struct lookahead_entry *source;
#if CONFIG_MULTIPLE_ARF
- struct lookahead_entry *alt_ref_source[NUM_REF_FRAMES];
+ struct lookahead_entry *alt_ref_source[REF_FRAMES];
#else
struct lookahead_entry *alt_ref_source;
#endif
@@ -349,7 +395,7 @@ typedef struct VP9_COMP {
int use_svc;
#if CONFIG_MULTIPLE_ARF
- int alt_ref_fb_idx[NUM_REF_FRAMES - 3];
+ int alt_ref_fb_idx[REF_FRAMES - 3];
#endif
int refresh_last_frame;
int refresh_golden_frame;
@@ -385,8 +431,8 @@ typedef struct VP9_COMP {
int rd_thresh_sub8x8[MAX_SEGMENTS][BLOCK_SIZES][MAX_REFS];
int rd_thresh_freq_sub8x8[BLOCK_SIZES][MAX_REFS];
- int64_t rd_comp_pred_diff[NB_PREDICTION_TYPES];
- int64_t rd_prediction_type_threshes[4][NB_PREDICTION_TYPES];
+ int64_t rd_comp_pred_diff[REFERENCE_MODES];
+ int64_t rd_prediction_type_threshes[4][REFERENCE_MODES];
unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2];
unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2];
unsigned int single_ref_count[REF_CONTEXTS][2][2];
@@ -405,87 +451,29 @@ typedef struct VP9_COMP {
CODING_CONTEXT coding_context;
- // Rate targetting variables
- int this_frame_target;
- int projected_frame_size;
- int last_q[2]; // Separate values for Intra/Inter
- int last_boosted_qindex; // Last boosted GF/KF/ARF q
-
- double rate_correction_factor;
- double key_frame_rate_correction_factor;
- double gf_rate_correction_factor;
-
- unsigned int frames_since_golden;
- int frames_till_gf_update_due; // Count down till next GF
-
- int gf_overspend_bits; // cumulative bits overspent because of GF boost
-
- int non_gf_bitrate_adjustment; // Following GF to recover extra bits spent
-
- int kf_overspend_bits; // Bits spent on key frames to be recovered on inters
- int kf_bitrate_adjustment; // number of bits to recover on each inter frame.
- int max_gf_interval;
- int baseline_gf_interval;
+ int zbin_mode_boost;
+ int zbin_mode_boost_enabled;
int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames
int active_arnr_strength; // <= cpi->oxcf.arnr_max_strength
- int64_t key_frame_count;
- int prior_key_frame_distance[KEY_FRAME_CONTEXT];
- int per_frame_bandwidth; // Current section per frame bandwidth target
- int av_per_frame_bandwidth; // Average frame size target for clip
- int min_frame_bandwidth; // Minimum allocation used for any frame
- int inter_frame_target;
double output_framerate;
int64_t last_time_stamp_seen;
int64_t last_end_time_stamp_seen;
int64_t first_time_stamp_ever;
- int ni_av_qi;
- int ni_tot_qi;
- int ni_frames;
- int avg_frame_qindex;
- double tot_q;
- double avg_q;
-
- int zbin_mode_boost;
- int zbin_mode_boost_enabled;
-
- int64_t total_byte_count;
-
- int buffered_mode;
-
- int buffer_level;
- int bits_off_target;
-
- int rolling_target_bits;
- int rolling_actual_bits;
-
- int long_rolling_target_bits;
- int long_rolling_actual_bits;
-
- int64_t total_actual_bits;
- int total_target_vs_actual; // debug stats
-
- int worst_quality;
- int active_worst_quality;
- int best_quality;
- int active_best_quality;
+ RATE_CONTROL rc;
int cq_target_quality;
int y_mode_count[4][INTRA_MODES];
int y_uv_mode_count[INTRA_MODES][INTRA_MODES];
- unsigned int partition_count[PARTITION_CONTEXTS][PARTITION_TYPES];
nmv_context_counts NMVcount;
- vp9_coeff_count coef_counts[TX_SIZES][BLOCK_TYPES];
- vp9_coeff_probs_model frame_coef_probs[TX_SIZES][BLOCK_TYPES];
- vp9_coeff_stats frame_branch_ct[TX_SIZES][BLOCK_TYPES];
+ vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
+ vp9_coeff_probs_model frame_coef_probs[TX_SIZES][PLANE_TYPES];
+ vp9_coeff_stats frame_branch_ct[TX_SIZES][PLANE_TYPES];
- int gfu_boost;
- int last_boost;
- int kf_boost;
int kf_zeromotion_pct;
int gf_zeromotion_pct;
@@ -506,15 +494,9 @@ typedef struct VP9_COMP {
int decimation_count;
// for real time encoding
- int avg_encode_time; // microsecond
- int avg_pick_mode_time; // microsecond
int speed;
- unsigned int cpu_freq; // Mhz
int compressor_speed;
- int interquantizer;
- int goldfreq;
- int auto_worst_q;
int cpu_used;
int pass;
@@ -529,17 +511,13 @@ typedef struct VP9_COMP {
unsigned int max_mv_magnitude;
int mv_step_param;
- // Data used for real time conferencing mode to help determine if it
- // would be good to update the gf
- int inter_zz_count;
- int gf_bad_count;
- int gf_update_recommended;
-
unsigned char *segmentation_map;
  // segment threshold for encode breakout
int segment_encode_breakout[MAX_SEGMENTS];
+ unsigned char *complexity_map;
+
unsigned char *active_map;
unsigned int active_map_enabled;
@@ -592,7 +570,6 @@ typedef struct VP9_COMP {
int alt_extra_bits;
int sr_update_lag;
- double est_max_qcorrection_factor;
} twopass;
YV12_BUFFER_CONFIG alt_ref_buffer;
@@ -707,8 +684,7 @@ static int get_scale_ref_frame_idx(VP9_COMP *cpi,
void vp9_encode_frame(VP9_COMP *cpi);
-void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
- unsigned long *size);
+void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size);
void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x);
@@ -718,7 +694,7 @@ int vp9_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
void vp9_alloc_compressor_data(VP9_COMP *cpi);
-int vp9_compute_qdelta(VP9_COMP *cpi, double qstart, double qtarget);
+int vp9_compute_qdelta(const VP9_COMP *cpi, double qstart, double qtarget);
static int get_token_alloc(int mb_rows, int mb_cols) {
return mb_rows * mb_cols * (48 * 16 + 4);
diff --git a/source/libvpx/vp9/encoder/vp9_quantize.c b/source/libvpx/vp9/encoder/vp9_quantize.c
index fca7525..8c41724 100644
--- a/source/libvpx/vp9/encoder/vp9_quantize.c
+++ b/source/libvpx/vp9/encoder/vp9_quantize.c
@@ -22,7 +22,7 @@
extern int enc_debug;
#endif
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
int skip_block,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
@@ -30,58 +30,44 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *dequant_ptr,
int zbin_oq_value, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- int i, rc, eob;
- int zbins[2], nzbins[2], zbin;
- int x, y, z, sz;
- int zero_flag = n_coeffs;
-
- vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
- vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+ int i, non_zero_count = count, eob = -1;
+ const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
+ zbin_ptr[1] + zbin_oq_value };
+ const int nzbins[2] = { zbins[0] * -1,
+ zbins[1] * -1 };
- eob = -1;
-
- // Base ZBIN
- zbins[0] = zbin_ptr[0] + zbin_oq_value;
- zbins[1] = zbin_ptr[1] + zbin_oq_value;
- nzbins[0] = zbins[0] * -1;
- nzbins[1] = zbins[1] * -1;
+ vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+ vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
if (!skip_block) {
// Pre-scan pass
- for (i = n_coeffs - 1; i >= 0; i--) {
- rc = scan[i];
- z = coeff_ptr[rc];
+ for (i = count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
- if (z < zbins[rc != 0] && z > nzbins[rc != 0]) {
- zero_flag--;
- } else {
+ if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+ non_zero_count--;
+ else
break;
- }
}
    // Quantization pass: All coefficients with index >= non_zero_count are
    // skippable. Note: non_zero_count can be zero.
- for (i = 0; i < zero_flag; i++) {
- rc = scan[i];
- z = coeff_ptr[rc];
-
- zbin = (zbins[rc != 0]);
-
- sz = (z >> 31); // sign of z
- x = (z ^ sz) - sz;
-
- if (x >= zbin) {
- x += (round_ptr[rc != 0]);
- x = clamp(x, INT16_MIN, INT16_MAX);
- y = (((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) *
- quant_shift_ptr[rc != 0]) >> 16; // quantize (x)
- x = (y ^ sz) - sz; // get the sign back
- qcoeff_ptr[rc] = x; // write to destination
- dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0]; // dequantized value
-
- if (y) {
- eob = i; // last nonzero coeffs
- }
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+
+ if (abs_coeff >= zbins[rc != 0]) {
+ int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+ tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >> 16; // quantization
+ qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+
+ if (tmp)
+ eob = i;
}
}
}
@@ -151,45 +137,18 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
*eob_ptr = eob + 1;
}
-struct plane_block_idx {
- int plane;
- int block;
-};
-
-// TODO(jkoleszar): returning a struct so it can be used in a const context,
-// expect to refactor this further later.
-static INLINE struct plane_block_idx plane_block_idx(int y_blocks,
- int b_idx) {
- const int v_offset = y_blocks * 5 / 4;
- struct plane_block_idx res;
-
- if (b_idx < y_blocks) {
- res.plane = 0;
- res.block = b_idx;
- } else if (b_idx < v_offset) {
- res.plane = 1;
- res.block = b_idx - y_blocks;
- } else {
- assert(b_idx < y_blocks * 3 / 2);
- res.plane = 2;
- res.block = b_idx - v_offset;
- }
- return res;
-}
-
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan) {
MACROBLOCKD *const xd = &x->e_mbd;
- const struct plane_block_idx pb_idx = plane_block_idx(y_blocks, b_idx);
- struct macroblock_plane* p = &x->plane[pb_idx.plane];
- struct macroblockd_plane* pd = &xd->plane[pb_idx.plane];
+ struct macroblock_plane* p = &x->plane[plane];
+ struct macroblockd_plane* pd = &xd->plane[plane];
- vp9_quantize_b(BLOCK_OFFSET(p->coeff, pb_idx.block),
+ vp9_quantize_b(BLOCK_OFFSET(p->coeff, block),
16, x->skip_block,
p->zbin, p->round, p->quant, p->quant_shift,
- BLOCK_OFFSET(pd->qcoeff, pb_idx.block),
- BLOCK_OFFSET(pd->dqcoeff, pb_idx.block),
- pd->dequant, p->zbin_extra, &pd->eobs[pb_idx.block], scan, iscan);
+ BLOCK_OFFSET(p->qcoeff, block),
+ BLOCK_OFFSET(pd->dqcoeff, block),
+ pd->dequant, p->zbin_extra, &p->eobs[block], scan, iscan);
}
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
@@ -315,17 +274,17 @@ void vp9_mb_init_quantizer(VP9_COMP *cpi, MACROBLOCK *x) {
SEG_LVL_SKIP);
/* save this macroblock QIndex for vp9_update_zbin_extra() */
- x->e_mbd.q_index = qindex;
+ x->q_index = qindex;
/* R/D setup */
cpi->mb.errorperbit = rdmult >> 6;
cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
- vp9_initialize_me_consts(cpi, xd->q_index);
+ vp9_initialize_me_consts(cpi, x->q_index);
}
void vp9_update_zbin_extra(VP9_COMP *cpi, MACROBLOCK *x) {
- const int qindex = x->e_mbd.q_index;
+ const int qindex = x->q_index;
const int y_zbin_extra = (cpi->common.y_dequant[qindex][1] *
(cpi->zbin_mode_boost + x->act_zbin_adj)) >> 7;
const int uv_zbin_extra = (cpi->common.uv_dequant[qindex][1] *
diff --git a/source/libvpx/vp9/encoder/vp9_quantize.h b/source/libvpx/vp9/encoder/vp9_quantize.h
index c078e1d..41cfa52 100644
--- a/source/libvpx/vp9/encoder/vp9_quantize.h
+++ b/source/libvpx/vp9/encoder/vp9_quantize.h
@@ -13,7 +13,7 @@
#include "vp9/encoder/vp9_block.h"
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int y_blocks, int b_idx,
+void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
const int16_t *scan, const int16_t *iscan);
struct VP9_COMP;
diff --git a/source/libvpx/vp9/encoder/vp9_ratectrl.c b/source/libvpx/vp9/encoder/vp9_ratectrl.c
index 0aa3a68..3fa8cea 100644
--- a/source/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/source/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -26,6 +26,8 @@
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_seg_common.h"
+#define LIMIT_QRANGE_FOR_ALTREF_AND_KEY 1
+
#define MIN_BPB_FACTOR 0.005
#define MAX_BPB_FACTOR 50
@@ -35,6 +37,88 @@
static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] =
{ 1, 2, 3, 4, 5 };
+// Tables relating active max Q to active min Q
+static int kf_low_motion_minq[QINDEX_RANGE];
+static int kf_high_motion_minq[QINDEX_RANGE];
+static int gf_low_motion_minq[QINDEX_RANGE];
+static int gf_high_motion_minq[QINDEX_RANGE];
+static int inter_minq[QINDEX_RANGE];
+static int afq_low_motion_minq[QINDEX_RANGE];
+static int afq_high_motion_minq[QINDEX_RANGE];
+static int gf_high = 2000;
+static int gf_low = 400;
+static int kf_high = 5000;
+static int kf_low = 400;
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The formulae were derived from computing a 3rd order polynomial best
+// fit to the original data (after plotting real maxq vs minq (not q index))
+static int calculate_minq_index(double maxq,
+ double x3, double x2, double x1, double c) {
+ int i;
+ const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq + c,
+ maxq);
+
+ // Special case handling to deal with the step from q2.0
+ // down to lossless mode represented by q 1.0.
+ if (minqtarget <= 2.0)
+ return 0;
+
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ if (minqtarget <= vp9_convert_qindex_to_q(i))
+ return i;
+ }
+
+ return QINDEX_RANGE - 1;
+}
+
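+// Fill the minq lookup tables: each entry maps the real Q for a given qindex
+// through the fitted polynomials above.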
+void vp9_rc_init_minq_luts(void) {
+ int i;
+
+ for (i = 0; i < QINDEX_RANGE; i++) {
+ const double maxq = vp9_convert_qindex_to_q(i);
+
+
+ kf_low_motion_minq[i] = calculate_minq_index(maxq,
+ 0.000001,
+ -0.0004,
+ 0.15,
+ 0.0);
+ kf_high_motion_minq[i] = calculate_minq_index(maxq,
+ 0.000002,
+ -0.0012,
+ 0.50,
+ 0.0);
+
+ gf_low_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000015,
+ -0.0009,
+ 0.32,
+ 0.0);
+ gf_high_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000021,
+ -0.00125,
+ 0.50,
+ 0.0);
+ afq_low_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000015,
+ -0.0009,
+ 0.33,
+ 0.0);
+ afq_high_motion_minq[i] = calculate_minq_index(maxq,
+ 0.0000021,
+ -0.00125,
+ 0.55,
+ 0.0);
+ inter_minq[i] = calculate_minq_index(maxq,
+ 0.00000271,
+ -0.00113,
+ 0.75,
+ 0.0);
+ }
+}
+
// These functions use formulaic calculations to make playing with the
// quantizer tables easier. If necessary they can be replaced by lookup
// tables if and when things settle down in the experimental bitstream
@@ -43,22 +127,8 @@ double vp9_convert_qindex_to_q(int qindex) {
return vp9_ac_quant(qindex, 0) / 4.0;
}
-int vp9_gfboost_qadjust(int qindex) {
- const double q = vp9_convert_qindex_to_q(qindex);
- return (int)((0.00000828 * q * q * q) +
- (-0.0055 * q * q) +
- (1.32 * q) + 79.3);
-}
-
-static int kfboost_qadjust(int qindex) {
- const double q = vp9_convert_qindex_to_q(qindex);
- return (int)((0.00000973 * q * q * q) +
- (-0.00613 * q * q) +
- (1.316 * q) + 121.2);
-}
-
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
- double correction_factor) {
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor) {
const double q = vp9_convert_qindex_to_q(qindex);
int enumerator = frame_type == KEY_FRAME ? 3300000 : 2250000;
@@ -118,7 +188,7 @@ void vp9_setup_key_frame(VP9_COMP *cpi) {
vp9_setup_past_independence(cm);
// interval before next GF
- cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval;
/* All buffers are implicitly updated on key frames. */
cpi->refresh_golden_frame = 1;
cpi->refresh_alt_ref_frame = 1;
@@ -129,13 +199,13 @@ void vp9_setup_inter_frame(VP9_COMP *cpi) {
if (cm->error_resilient_mode || cm->intra_only)
vp9_setup_past_independence(cm);
- assert(cm->frame_context_idx < NUM_FRAME_CONTEXTS);
+ assert(cm->frame_context_idx < FRAME_CONTEXTS);
cm->fc = cm->frame_contexts[cm->frame_context_idx];
}
static int estimate_bits_at_q(int frame_kind, int q, int mbs,
double correction_factor) {
- const int bpm = (int)(vp9_bits_per_mb(frame_kind, q, correction_factor));
+ const int bpm = (int)(vp9_rc_bits_per_mb(frame_kind, q, correction_factor));
// Attempt to retain reasonable accuracy without overflow. The cutoff is
// chosen such that the maximum product of Bpm and MBs fits 31 bits. The
@@ -153,20 +223,18 @@ static void calc_iframe_target_size(VP9_COMP *cpi) {
vp9_clear_system_state(); // __asm emms;
// New Two pass RC
- target = cpi->per_frame_bandwidth;
+ target = cpi->rc.per_frame_bandwidth;
if (cpi->oxcf.rc_max_intra_bitrate_pct) {
- int max_rate = cpi->per_frame_bandwidth
+ int max_rate = cpi->rc.per_frame_bandwidth
* cpi->oxcf.rc_max_intra_bitrate_pct / 100;
if (target > max_rate)
target = max_rate;
}
-
- cpi->this_frame_target = target;
+ cpi->rc.this_frame_target = target;
}
-
// Do the best we can to define the parameters for the next GF based
// on what information we have available.
//
@@ -174,21 +242,21 @@ static void calc_iframe_target_size(VP9_COMP *cpi) {
// so we just use the interval determined in the two pass code.
static void calc_gf_params(VP9_COMP *cpi) {
// Set the gf interval
- cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+ cpi->rc.frames_till_gf_update_due = cpi->rc.baseline_gf_interval;
}
static void calc_pframe_target_size(VP9_COMP *cpi) {
- const int min_frame_target = MAX(cpi->min_frame_bandwidth,
- cpi->av_per_frame_bandwidth >> 5);
+ const int min_frame_target = MAX(cpi->rc.min_frame_bandwidth,
+ cpi->rc.av_per_frame_bandwidth >> 5);
if (cpi->refresh_alt_ref_frame) {
// Special alt reference frame case
// Per frame bit target for the alt ref frame
- cpi->per_frame_bandwidth = cpi->twopass.gf_bits;
- cpi->this_frame_target = cpi->per_frame_bandwidth;
+ cpi->rc.per_frame_bandwidth = cpi->twopass.gf_bits;
+ cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
} else {
    // Normal frames (gf and inter)
- cpi->this_frame_target = cpi->per_frame_bandwidth;
+ cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
}
// Check that the total sum of adjustments is not above the maximum allowed.
@@ -197,47 +265,31 @@ static void calc_pframe_target_size(VP9_COMP *cpi) {
// not capable of recovering all the extra bits we have spent in the KF or GF,
// then the remainder will have to be recovered over a longer time span via
// other buffer / rate control mechanisms.
- if (cpi->this_frame_target < min_frame_target)
- cpi->this_frame_target = min_frame_target;
-
- if (!cpi->refresh_alt_ref_frame)
- // Note the baseline target data rate for this inter frame.
- cpi->inter_frame_target = cpi->this_frame_target;
+ if (cpi->rc.this_frame_target < min_frame_target)
+ cpi->rc.this_frame_target = min_frame_target;
// Adjust target frame size for Golden Frames:
- if (cpi->frames_till_gf_update_due == 0) {
- const int q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME]
- : cpi->oxcf.fixed_q;
-
+ if (cpi->rc.frames_till_gf_update_due == 0) {
cpi->refresh_golden_frame = 1;
-
calc_gf_params(cpi);
-
// If we are using alternate ref instead of gf then do not apply the boost
// It will instead be applied to the altref update
// Jims modified boost
if (!cpi->source_alt_ref_active) {
- if (cpi->oxcf.fixed_q < 0) {
- // The spend on the GF is defined in the two pass code
- // for two pass encodes
- cpi->this_frame_target = cpi->per_frame_bandwidth;
- } else {
- cpi->this_frame_target =
- (estimate_bits_at_q(1, q, cpi->common.MBs, 1.0)
- * cpi->last_boost) / 100;
- }
+ // The spend on the GF is defined in the two pass code
+ // for two pass encodes
+ cpi->rc.this_frame_target = cpi->rc.per_frame_bandwidth;
} else {
// If there is an active ARF at this location use the minimum
// bits on this frame even if it is a constructed arf.
// The active maximum quantizer insures that an appropriate
// number of bits will be spent if needed for constructed ARFs.
- cpi->this_frame_target = 0;
+ cpi->rc.this_frame_target = 0;
}
}
}
-
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
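+// Compare the projected frame size against the size predicted for the current
+// Q and scale the active (key frame, GF/ARF or inter) rate correction factor,
+// damping the adjustment according to damp_var.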
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
const int q = cpi->common.base_qindex;
int correction_factor = 100;
double rate_correction_factor;
@@ -249,12 +301,12 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
vp9_clear_system_state(); // __asm emms;
if (cpi->common.frame_type == KEY_FRAME) {
- rate_correction_factor = cpi->key_frame_rate_correction_factor;
+ rate_correction_factor = cpi->rc.key_frame_rate_correction_factor;
} else {
if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
- rate_correction_factor = cpi->gf_rate_correction_factor;
+ rate_correction_factor = cpi->rc.gf_rate_correction_factor;
else
- rate_correction_factor = cpi->rate_correction_factor;
+ rate_correction_factor = cpi->rc.rate_correction_factor;
}
// Work out how big we would have expected the frame to be at this Q given
@@ -267,7 +319,7 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
// Work out a size correction factor.
if (projected_size_based_on_q > 0)
correction_factor =
- (100 * cpi->projected_frame_size) / projected_size_based_on_q;
+ (100 * cpi->rc.projected_frame_size) / projected_size_based_on_q;
// More heavily damped adjustment used if we have been oscillating either side
// of target.
@@ -284,7 +336,6 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
break;
}
- // if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
if (correction_factor > 102) {
// We are not already at the worst allowable quality
correction_factor =
@@ -308,18 +359,19 @@ void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) {
}
if (cpi->common.frame_type == KEY_FRAME) {
- cpi->key_frame_rate_correction_factor = rate_correction_factor;
+ cpi->rc.key_frame_rate_correction_factor = rate_correction_factor;
} else {
if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
- cpi->gf_rate_correction_factor = rate_correction_factor;
+ cpi->rc.gf_rate_correction_factor = rate_correction_factor;
else
- cpi->rate_correction_factor = rate_correction_factor;
+ cpi->rc.rate_correction_factor = rate_correction_factor;
}
}
-int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
- int q = cpi->active_worst_quality;
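+// Walk Q from active_best_quality towards active_worst_quality and return the
+// value whose estimated bits per MB best matches the per-frame target.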
+int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality) {
+ int q = active_worst_quality;
int i;
int last_error = INT_MAX;
@@ -329,12 +381,12 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
// Select the appropriate correction factor based upon type of frame.
if (cpi->common.frame_type == KEY_FRAME) {
- correction_factor = cpi->key_frame_rate_correction_factor;
+ correction_factor = cpi->rc.key_frame_rate_correction_factor;
} else {
if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)
- correction_factor = cpi->gf_rate_correction_factor;
+ correction_factor = cpi->rc.gf_rate_correction_factor;
else
- correction_factor = cpi->rate_correction_factor;
+ correction_factor = cpi->rc.rate_correction_factor;
}
// Calculate required scaling factor based on target frame size and size of
@@ -347,11 +399,11 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
target_bits_per_mb =
(target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
- i = cpi->active_best_quality;
+ i = active_best_quality;
do {
- bits_per_mb_at_this_q = (int)vp9_bits_per_mb(cpi->common.frame_type, i,
- correction_factor);
+ bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cpi->common.frame_type, i,
+ correction_factor);
if (bits_per_mb_at_this_q <= target_bits_per_mb) {
if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
@@ -363,11 +415,237 @@ int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame) {
} else {
last_error = bits_per_mb_at_this_q - target_bits_per_mb;
}
- } while (++i <= cpi->active_worst_quality);
+ } while (++i <= active_worst_quality);
return q;
}
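+// Pick an active best quality from the motion-dependent minq tables,
+// interpolating between the low- and high-motion entries by where gfu_boost
+// falls in the [low, high] range.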
+static int get_active_quality(int q,
+ int gfu_boost,
+ int low,
+ int high,
+ int *low_motion_minq,
+ int *high_motion_minq) {
+ int active_best_quality;
+ if (gfu_boost > high) {
+ active_best_quality = low_motion_minq[q];
+ } else if (gfu_boost < low) {
+ active_best_quality = high_motion_minq[q];
+ } else {
+ const int gap = high - low;
+ const int offset = high - gfu_boost;
+ const int qdiff = high_motion_minq[q] - low_motion_minq[q];
+ const int adjustment = ((offset * qdiff) + (gap >> 1)) / gap;
+ active_best_quality = low_motion_minq[q] + adjustment;
+ }
+ return active_best_quality;
+}
+
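+// Select the active best/worst quality range for the current frame type and
+// return the Q to try first; *bottom_index and *top_index bound the recode
+// loop.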
+int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi,
+ int *bottom_index,
+ int *top_index,
+ int *top_index_prop) {
+ const VP9_COMMON *const cm = &cpi->common;
+ int active_best_quality;
+ int active_worst_quality = cpi->rc.active_worst_quality;
+ int q;
+
+ if (frame_is_intra_only(cm)) {
+ active_best_quality = cpi->rc.best_quality;
+#if !CONFIG_MULTIPLE_ARF
+    // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ if (cpi->this_key_frame_forced) {
+ int delta_qindex;
+ int qindex = cpi->rc.last_boosted_qindex;
+ double last_boosted_q = vp9_convert_qindex_to_q(qindex);
+
+ delta_qindex = vp9_compute_qdelta(cpi, last_boosted_q,
+ (last_boosted_q * 0.75));
+ active_best_quality = MAX(qindex + delta_qindex,
+ cpi->rc.best_quality);
+ } else if (!(cpi->pass == 0 && cpi->common.current_video_frame == 0)) {
+ // not first frame of one pass
+ double q_adj_factor = 1.0;
+ double q_val;
+
+      // Baseline value derived from active_worst_quality and kf boost.
+ active_best_quality = get_active_quality(active_worst_quality,
+ cpi->rc.kf_boost,
+ kf_low, kf_high,
+ kf_low_motion_minq,
+ kf_high_motion_minq);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((cm->width * cm->height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Make a further adjustment based on the kf zero motion measure.
+ q_adj_factor += 0.05 - (0.001 * (double)cpi->kf_zeromotion_pct);
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = vp9_convert_qindex_to_q(active_best_quality);
+ active_best_quality +=
+ vp9_compute_qdelta(cpi, q_val, (q_val * q_adj_factor));
+ }
+#else
+ double current_q;
+ // Force the KF quantizer to be 30% of the active_worst_quality.
+ current_q = vp9_convert_qindex_to_q(active_worst_quality);
+ active_best_quality = active_worst_quality
+ + vp9_compute_qdelta(cpi, current_q, current_q * 0.3);
+#endif
+ } else if (!cpi->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (cpi->frames_since_key > 1 &&
+ cpi->rc.avg_frame_qindex < active_worst_quality) {
+ q = cpi->rc.avg_frame_qindex;
+ } else {
+ q = active_worst_quality;
+ }
+    // For constrained quality don't allow Q less than the cq level
+ if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
+ if (q < cpi->cq_target_quality)
+ q = cpi->cq_target_quality;
+ if (cpi->frames_since_key > 1) {
+ active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
+ gf_low, gf_high,
+ afq_low_motion_minq,
+ afq_high_motion_minq);
+ } else {
+ active_best_quality = get_active_quality(q, cpi->rc.gfu_boost,
+ gf_low, gf_high,
+ gf_low_motion_minq,
+ gf_high_motion_minq);
+ }
+      // Constrained quality uses a slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+
+ } else if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ if (!cpi->refresh_alt_ref_frame) {
+ active_best_quality = cpi->cq_target_quality;
+ } else {
+ if (cpi->frames_since_key > 1) {
+ active_best_quality = get_active_quality(
+ q, cpi->rc.gfu_boost, gf_low, gf_high,
+ afq_low_motion_minq, afq_high_motion_minq);
+ } else {
+ active_best_quality = get_active_quality(
+ q, cpi->rc.gfu_boost, gf_low, gf_high,
+ gf_low_motion_minq, gf_high_motion_minq);
+ }
+ }
+ } else {
+ active_best_quality = get_active_quality(
+ q, cpi->rc.gfu_boost, gf_low, gf_high,
+ gf_low_motion_minq, gf_high_motion_minq);
+ }
+ } else {
+ if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ active_best_quality = cpi->cq_target_quality;
+ } else {
+ if (cpi->pass == 0 &&
+ cpi->rc.avg_frame_qindex < active_worst_quality)
+        // 1-pass: for now, use the average Q for the active_best, if it's lower
+ // than active_worst.
+ active_best_quality = inter_minq[cpi->rc.avg_frame_qindex];
+ else
+ active_best_quality = inter_minq[active_worst_quality];
+
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+ (active_best_quality < cpi->cq_target_quality)) {
+ // If we are strongly undershooting the target rate in the last
+ // frames then use the user passed in cq value not the auto
+ // cq value.
+ if (cpi->rc.rolling_actual_bits < cpi->rc.min_frame_bandwidth)
+ active_best_quality = cpi->oxcf.cq_level;
+ else
+ active_best_quality = cpi->cq_target_quality;
+ }
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ if (active_worst_quality > cpi->rc.worst_quality)
+ active_worst_quality = cpi->rc.worst_quality;
+
+ if (active_best_quality < cpi->rc.best_quality)
+ active_best_quality = cpi->rc.best_quality;
+
+ if (active_best_quality > cpi->rc.worst_quality)
+ active_best_quality = cpi->rc.worst_quality;
+
+ if (active_worst_quality < active_best_quality)
+ active_worst_quality = active_best_quality;
+
+ *top_index_prop = active_worst_quality;
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+#if LIMIT_QRANGE_FOR_ALTREF_AND_KEY
+ // Limit Q range for the adaptive loop.
+ if (cm->frame_type == KEY_FRAME && !cpi->this_key_frame_forced) {
+ if (!(cpi->pass == 0 && cpi->common.current_video_frame == 0)) {
+ *top_index = active_worst_quality;
+ *top_index =
+ (active_worst_quality + active_best_quality * 3) / 4;
+ }
+ } else if (!cpi->is_src_frame_alt_ref &&
+ (cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER) &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ *top_index =
+ (active_worst_quality + active_best_quality) / 2;
+ }
+#endif
+
+ if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames
+ } else if ((cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced) {
+ q = cpi->rc.last_boosted_qindex;
+ } else {
+ // Determine initial Q to try.
+ if (cpi->pass == 0) {
+ // 1-pass: for now, use per-frame-bw for target size of frame, scaled
+ // by |x| for key frame.
+ int scale = (cm->frame_type == KEY_FRAME) ? 5 : 1;
+ q = vp9_rc_regulate_q(cpi, scale * cpi->rc.av_per_frame_bandwidth,
+ active_best_quality, active_worst_quality);
+ } else {
+ q = vp9_rc_regulate_q(cpi, cpi->rc.this_frame_target,
+ active_best_quality, active_worst_quality);
+ }
+ if (q > *top_index)
+ q = *top_index;
+ }
+#if CONFIG_MULTIPLE_ARF
+ // Force the quantizer determined by the coding order pattern.
+ if (cpi->multi_arf_enabled && (cm->frame_type != KEY_FRAME) &&
+ cpi->oxcf.end_usage != USAGE_CONSTANT_QUALITY) {
+ double new_q;
+ double current_q = vp9_convert_qindex_to_q(active_worst_quality);
+ int level = cpi->this_frame_weight;
+ assert(level >= 0);
+ new_q = current_q * (1.0 - (0.2 * (cpi->max_arf_level - level)));
+ q = active_worst_quality +
+ vp9_compute_qdelta(cpi, current_q, new_q);
+
+ *bottom_index = q;
+ *top_index = q;
+ printf("frame:%d q:%d\n", cm->current_video_frame, q);
+ }
+#endif
+ return q;
+}
static int estimate_keyframe_frequency(VP9_COMP *cpi) {
int i;
@@ -378,7 +656,7 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) {
/* First key frame at start of sequence is a special case. We have no
* frequency data.
*/
- if (cpi->key_frame_count == 1) {
+ if (cpi->rc.key_frame_count == 1) {
/* Assume a default of 1 kf every 2 seconds, or the max kf interval,
* whichever is smaller.
*/
@@ -388,7 +666,7 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) {
if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq)
av_key_frame_frequency = cpi->oxcf.key_freq;
- cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
+ cpi->rc.prior_key_frame_distance[KEY_FRAME_CONTEXT - 1]
= av_key_frame_frequency;
} else {
unsigned int total_weight = 0;
@@ -400,13 +678,13 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) {
*/
for (i = 0; i < KEY_FRAME_CONTEXT; i++) {
if (i < KEY_FRAME_CONTEXT - 1)
- cpi->prior_key_frame_distance[i]
- = cpi->prior_key_frame_distance[i + 1];
+ cpi->rc.prior_key_frame_distance[i]
+ = cpi->rc.prior_key_frame_distance[i + 1];
else
- cpi->prior_key_frame_distance[i] = last_kf_interval;
+ cpi->rc.prior_key_frame_distance[i] = last_kf_interval;
av_key_frame_frequency += prior_key_frame_weight[i]
- * cpi->prior_key_frame_distance[i];
+ * cpi->rc.prior_key_frame_distance[i];
total_weight += prior_key_frame_weight[i];
}
@@ -416,38 +694,38 @@ static int estimate_keyframe_frequency(VP9_COMP *cpi) {
}
-void vp9_adjust_key_frame_context(VP9_COMP *cpi) {
+static void adjust_key_frame_context(VP9_COMP *cpi) {
// Clear down mmx registers to allow floating point in what follows
vp9_clear_system_state();
cpi->frames_since_key = 0;
- cpi->key_frame_count++;
+ cpi->rc.key_frame_count++;
}
-
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
- int *frame_over_shoot_limit) {
+void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi,
+ int this_frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit) {
// Set-up bounds on acceptable frame size:
- if (cpi->oxcf.fixed_q >= 0) {
- // Fixed Q scenario: frame size never outranges target (there is no target!)
+ if (cpi->oxcf.end_usage == USAGE_CONSTANT_QUALITY) {
*frame_under_shoot_limit = 0;
*frame_over_shoot_limit = INT_MAX;
} else {
if (cpi->common.frame_type == KEY_FRAME) {
- *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
- *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+ *frame_over_shoot_limit = this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = this_frame_target * 7 / 8;
} else {
if (cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) {
- *frame_over_shoot_limit = cpi->this_frame_target * 9 / 8;
- *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+ *frame_over_shoot_limit = this_frame_target * 9 / 8;
+ *frame_under_shoot_limit = this_frame_target * 7 / 8;
} else {
        // Strong overshoot limit for constrained quality
if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) {
- *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
- *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
+ *frame_over_shoot_limit = this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = this_frame_target * 2 / 8;
} else {
- *frame_over_shoot_limit = cpi->this_frame_target * 11 / 8;
- *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+ *frame_over_shoot_limit = this_frame_target * 11 / 8;
+ *frame_under_shoot_limit = this_frame_target * 5 / 8;
}
}
}
@@ -462,9 +740,8 @@ void vp9_compute_frame_size_bounds(VP9_COMP *cpi, int *frame_under_shoot_limit,
}
}
-
// return of 0 means drop frame
-int vp9_pick_frame_size(VP9_COMP *cpi) {
+int vp9_rc_pick_frame_size_target(VP9_COMP *cpi) {
VP9_COMMON *cm = &cpi->common;
if (cm->frame_type == KEY_FRAME)
@@ -472,5 +749,111 @@ int vp9_pick_frame_size(VP9_COMP *cpi) {
else
calc_pframe_target_size(cpi);
+  // Target rate per SB64 (including partial SB64s).
+ cpi->rc.sb64_target_rate = ((int64_t)cpi->rc.this_frame_target * 64 * 64) /
+ (cpi->common.width * cpi->common.height);
return 1;
}
+
+void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used,
+ int worst_q) {
+ VP9_COMMON *const cm = &cpi->common;
+ // Update rate control heuristics
+ cpi->rc.projected_frame_size = (bytes_used << 3);
+
+ // Post encode loop adjustment of Q prediction.
+ vp9_rc_update_rate_correction_factors(
+ cpi, (cpi->sf.recode_loop ||
+ cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
+
+ cpi->rc.last_q[cm->frame_type] = cm->base_qindex;
+ cpi->rc.active_worst_quality = worst_q;
+
+  // Keep record of last boosted (KF/GF/ARF) Q value.
+ // If the current frame is coded at a lower Q then we also update it.
+ // If all mbs in this group are skipped only update if the Q value is
+ // better than that already stored.
+ // This is used to help set quality in forced key frames to reduce popping
+ if ((cm->base_qindex < cpi->rc.last_boosted_qindex) ||
+ ((cpi->static_mb_pct < 100) &&
+ ((cm->frame_type == KEY_FRAME) || cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->is_src_frame_alt_ref)))) {
+ cpi->rc.last_boosted_qindex = cm->base_qindex;
+ }
+
+ if (cm->frame_type == KEY_FRAME) {
+ adjust_key_frame_context(cpi);
+ }
+
+ // Keep a record of ambient average Q.
+ if (cm->frame_type != KEY_FRAME)
+ cpi->rc.avg_frame_qindex = (2 + 3 * cpi->rc.avg_frame_qindex +
+ cm->base_qindex) >> 2;
+
+ // Keep a record from which we can calculate the average Q excluding GF
+ // updates and key frames.
+ if (cm->frame_type != KEY_FRAME &&
+ !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) {
+ cpi->rc.ni_frames++;
+ cpi->rc.tot_q += vp9_convert_qindex_to_q(cm->base_qindex);
+ cpi->rc.avg_q = cpi->rc.tot_q / (double)cpi->rc.ni_frames;
+
+ // Calculate the average Q for normal inter frames (not key or GFU frames).
+ cpi->rc.ni_tot_qi += cm->base_qindex;
+ cpi->rc.ni_av_qi = cpi->rc.ni_tot_qi / cpi->rc.ni_frames;
+ }
+
+ // Update the buffer level variable.
+ // Non-viewable frames are a special case and are treated as pure overhead.
+ if (!cm->show_frame)
+ cpi->rc.bits_off_target -= cpi->rc.projected_frame_size;
+ else
+ cpi->rc.bits_off_target += cpi->rc.av_per_frame_bandwidth -
+ cpi->rc.projected_frame_size;
+
+ // Clip the buffer level at the maximum buffer size
+ if (cpi->rc.bits_off_target > cpi->oxcf.maximum_buffer_size)
+ cpi->rc.bits_off_target = cpi->oxcf.maximum_buffer_size;
+
+  // Rolling monitors of whether we are over- or under-spending, used to help
+  // regulate min and max Q in two pass.
+ if (cm->frame_type != KEY_FRAME) {
+ cpi->rc.rolling_target_bits =
+ ((cpi->rc.rolling_target_bits * 3) +
+ cpi->rc.this_frame_target + 2) / 4;
+ cpi->rc.rolling_actual_bits =
+ ((cpi->rc.rolling_actual_bits * 3) +
+ cpi->rc.projected_frame_size + 2) / 4;
+ cpi->rc.long_rolling_target_bits =
+ ((cpi->rc.long_rolling_target_bits * 31) +
+ cpi->rc.this_frame_target + 16) / 32;
+ cpi->rc.long_rolling_actual_bits =
+ ((cpi->rc.long_rolling_actual_bits * 31) +
+ cpi->rc.projected_frame_size + 16) / 32;
+ }
+
+ // Actual bits spent
+ cpi->rc.total_actual_bits += cpi->rc.projected_frame_size;
+
+ // Debug stats
+ cpi->rc.total_target_vs_actual += (cpi->rc.this_frame_target -
+ cpi->rc.projected_frame_size);
+
+ cpi->rc.buffer_level = cpi->rc.bits_off_target;
+
+#ifndef DISABLE_RC_LONG_TERM_MEM
+ // Update bits left to the kf and gf groups to account for overshoot or
+ // undershoot on these frames
+ if (cm->frame_type == KEY_FRAME) {
+ cpi->twopass.kf_group_bits += cpi->rc.this_frame_target -
+ cpi->rc.projected_frame_size;
+
+ cpi->twopass.kf_group_bits = MAX(cpi->twopass.kf_group_bits, 0);
+ } else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame) {
+ cpi->twopass.gf_group_bits += cpi->rc.this_frame_target -
+ cpi->rc.projected_frame_size;
+
+ cpi->twopass.gf_group_bits = MAX(cpi->twopass.gf_group_bits, 0);
+ }
+#endif
+}
diff --git a/source/libvpx/vp9/encoder/vp9_ratectrl.h b/source/libvpx/vp9/encoder/vp9_ratectrl.h
index ddda713..063ac8f 100644
--- a/source/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/source/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -20,20 +20,47 @@ void vp9_save_coding_context(VP9_COMP *cpi);
void vp9_restore_coding_context(VP9_COMP *cpi);
void vp9_setup_key_frame(VP9_COMP *cpi);
-void vp9_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
-int vp9_regulate_q(VP9_COMP *cpi, int target_bits_per_frame);
-void vp9_adjust_key_frame_context(VP9_COMP *cpi);
-void vp9_compute_frame_size_bounds(VP9_COMP *cpi,
- int *frame_under_shoot_limit,
- int *frame_over_shoot_limit);
+void vp9_setup_inter_frame(VP9_COMP *cpi);
+
+double vp9_convert_qindex_to_q(int qindex);
+
+// Updates rate correction factors
+void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var);
+
+// initialize luts for minq
+void vp9_rc_init_minq_luts(void);
// return of 0 means drop frame
-int vp9_pick_frame_size(VP9_COMP *cpi);
+// Changes only rc.this_frame_target and rc.sb64_target_rate
+int vp9_rc_pick_frame_size_target(VP9_COMP *cpi);
-double vp9_convert_qindex_to_q(int qindex);
-int vp9_gfboost_qadjust(int qindex);
-int vp9_bits_per_mb(FRAME_TYPE frame_type, int qindex,
- double correction_factor);
-void vp9_setup_inter_frame(VP9_COMP *cpi);
+void vp9_rc_compute_frame_size_bounds(const VP9_COMP *cpi,
+ int this_frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit);
+
+// Picks q and q bounds given the target for bits
+int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi,
+ int *bottom_index,
+ int *top_index,
+ int *top_index_prop);
+
+// Estimates q to achieve a target bits per frame
+int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality);
+
+// Post encode update of the rate control parameters based
+// on bytes used and q used for the frame
+void vp9_rc_postencode_update(VP9_COMP *cpi,
+ uint64_t bytes_used,
+ int worst_q);
+
+// estimates bits per mb for a given qindex and correction factor
+int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor);
+
+// Post encode update of the rate control parameters for 2-pass
+void vp9_twopass_postencode_update(VP9_COMP *cpi,
+ uint64_t bytes_used);
#endif // VP9_ENCODER_VP9_RATECTRL_H_
diff --git a/source/libvpx/vp9/encoder/vp9_rdopt.c b/source/libvpx/vp9/encoder/vp9_rdopt.c
index f9de78b..5702e5a 100644
--- a/source/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/source/libvpx/vp9/encoder/vp9_rdopt.c
@@ -17,8 +17,6 @@
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_treewriter.h"
#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_modecosts.h"
-#include "vp9/encoder/vp9_encodeintra.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
@@ -115,23 +113,60 @@ static int rd_thresh_block_size_factor[BLOCK_SIZES] =
#define MV_COST_WEIGHT 108
#define MV_COST_WEIGHT_SUB 120
+static int raster_block_offset(BLOCK_SIZE plane_bsize,
+ int raster_block, int stride) {
+ const int bw = b_width_log2(plane_bsize);
+ const int y = 4 * (raster_block >> bw);
+ const int x = 4 * (raster_block & ((1 << bw) - 1));
+ return y * stride + x;
+}
+static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
+ int raster_block, int16_t *base) {
+ const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ return base + raster_block_offset(plane_bsize, raster_block, stride);
+}
+
+static void fill_mode_costs(VP9_COMP *c) {
+ VP9_COMMON *const cm = &c->common;
+ int i, j;
+
+ for (i = 0; i < INTRA_MODES; i++)
+ for (j = 0; j < INTRA_MODES; j++)
+ vp9_cost_tokens((int *)c->mb.y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
+ vp9_intra_mode_tree);
+
+ // TODO(rbultje) separate tables for superblock costing?
+ vp9_cost_tokens(c->mb.mbmode_cost, cm->fc.y_mode_prob[1],
+ vp9_intra_mode_tree);
+ vp9_cost_tokens(c->mb.intra_uv_mode_cost[1],
+ cm->fc.uv_mode_prob[INTRA_MODES - 1], vp9_intra_mode_tree);
+ vp9_cost_tokens(c->mb.intra_uv_mode_cost[0],
+ vp9_kf_uv_mode_prob[INTRA_MODES - 1],
+ vp9_intra_mode_tree);
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ vp9_cost_tokens((int *)c->mb.switchable_interp_costs[i],
+ cm->fc.switchable_interp_prob[i],
+ vp9_switchable_interp_tree);
+}
+
static void fill_token_costs(vp9_coeff_cost *c,
- vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
+ vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
int i, j, k, l;
TX_SIZE t;
- for (t = TX_4X4; t <= TX_32X32; t++)
- for (i = 0; i < BLOCK_TYPES; i++)
- for (j = 0; j < REF_TYPES; j++)
- for (k = 0; k < COEF_BANDS; k++)
- for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
+ for (t = TX_4X4; t <= TX_32X32; ++t)
+ for (i = 0; i < PLANE_TYPES; ++i)
+ for (j = 0; j < REF_TYPES; ++j)
+ for (k = 0; k < COEF_BANDS; ++k)
+ for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
vp9_prob probs[ENTROPY_NODES];
vp9_model_to_full_probs(p[t][i][j][k][l], probs);
vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
vp9_coef_tree);
vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
vp9_coef_tree);
- assert(c[t][i][j][k][0][l][DCT_EOB_TOKEN] ==
- c[t][i][j][k][1][l][DCT_EOB_TOKEN]);
+ assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
+ c[t][i][j][k][1][l][EOB_TOKEN]);
}
}
@@ -246,17 +281,20 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
vp9_set_speed_features(cpi);
+ cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+ cm->frame_type != KEY_FRAME) ?
+ 0 : 1;
+
set_block_thresholds(cpi);
fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
for (i = 0; i < PARTITION_CONTEXTS; i++)
- vp9_cost_tokens(cpi->mb.partition_cost[i],
- cm->fc.partition_prob[cm->frame_type][i],
+ vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i),
vp9_partition_tree);
/*rough estimate for costing*/
- vp9_init_mode_costs(cpi);
+ fill_mode_costs(cpi);
if (!frame_is_intra_only(cm)) {
vp9_build_nmv_cost_table(
@@ -265,15 +303,9 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi) {
&cm->fc.nmvc,
cm->allow_high_precision_mv, 1, 1);
- for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
- MB_PREDICTION_MODE m;
-
- for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
- cpi->mb.inter_mode_cost[i][inter_mode_offset(m)] =
- cost_token(vp9_inter_mode_tree,
- cm->fc.inter_mode_probs[i],
- &vp9_inter_mode_encodings[inter_mode_offset(m)]);
- }
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ vp9_cost_tokens((int *)cpi->mb.inter_mode_cost[i],
+ cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
}
}
@@ -489,13 +521,14 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
const int16_t *scan, const int16_t *nb) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
+ struct macroblock_plane *p = &x->plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
const PLANE_TYPE type = pd->plane_type;
const int16_t *band_count = &band_counts[tx_size][1];
- const int eob = pd->eobs[block];
- const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
+ const int eob = p->eobs[block];
+ const int16_t *const qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
- unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
+ unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
x->token_costs[tx_size][type][ref];
const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
uint8_t *p_tok = x->token_cache;
@@ -503,12 +536,12 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
int c, cost;
// Check for consistency of tx_size with mode info
- assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->tx_size == tx_size
+ assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
: get_uv_tx_size(mbmi) == tx_size);
if (eob == 0) {
// single eob token
- cost = token_costs[0][0][pt][DCT_EOB_TOKEN];
+ cost = token_costs[0][0][pt][EOB_TOKEN];
c = 0;
} else {
int band_left = *band_count++;
@@ -540,7 +573,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
// eob token
if (band_left) {
pt = get_coef_context(nb, p_tok, c);
- cost += (*token_costs)[0][pt][DCT_EOB_TOKEN];
+ cost += (*token_costs)[0][pt][EOB_TOKEN];
}
}
@@ -565,8 +598,7 @@ static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
&this_sse) >> shift;
args->sse = this_sse >> shift;
- if (x->skip_encode &&
- xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME) {
+ if (x->skip_encode && !is_inter_block(&xd->mi_8x8[0]->mbmi)) {
// TODO(jingning): tune the model to better capture the distortion.
int64_t p = (pd->dequant[1] * pd->dequant[1] *
(1 << ss_txfrm_size)) >> (shift + 2);
@@ -587,8 +619,8 @@ static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
args->scan, args->nb);
}
-static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
- TX_SIZE tx_size, void *arg) {
+static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
struct rdcost_block_args *args = arg;
MACROBLOCK *const x = args->x;
MACROBLOCKD *const xd = &x->e_mbd;
@@ -611,7 +643,8 @@ static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
// TODO(jingning): temporarily enabled only for luma component
rd = MIN(rd1, rd2);
if (plane == 0)
- x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
+ x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
+ (rd1 > rd2 && !xd->lossless);
args->this_rate += args->rate;
args->this_dist += args->dist;
@@ -653,7 +686,7 @@ void vp9_get_entropy_contexts(TX_SIZE tx_size,
t_left[i] = !!*(const uint64_t *)&left[i];
break;
default:
- assert(!"Invalid transform size.");
+ assert(0 && "Invalid transform size.");
}
}
@@ -680,6 +713,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
const int num_4x4_w = num_4x4_blocks_wide_lookup[bs];
const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
+ const scan_order *so;
init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h,
ref_best_rd, rd_stack);
@@ -690,10 +724,12 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
pd->above_context, pd->left_context,
num_4x4_w, num_4x4_h);
- get_scan(xd, tx_size, pd->plane_type, 0, &rd_stack->scan, &rd_stack->nb);
+ so = get_scan(xd, tx_size, pd->plane_type, 0);
+ rd_stack->scan = so->scan;
+ rd_stack->nb = so->neighbors;
foreach_transformed_block_in_plane(xd, bsize, plane,
- block_yrd_txfm, rd_stack);
+ block_rd_txfm, rd_stack);
if (rd_stack->skip) {
*rate = INT_MAX;
*distortion = INT64_MAX;
@@ -703,7 +739,7 @@ static void txfm_rd_in_plane(MACROBLOCK *x,
*distortion = rd_stack->this_dist;
*rate = rd_stack->this_rate;
*sse = rd_stack->this_sse;
- *skippable = vp9_is_skippable_in_plane(xd, bsize, plane);
+ *skippable = vp9_is_skippable_in_plane(x, bsize, plane);
}
}
@@ -736,63 +772,46 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
- vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
+ vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
int64_t rd[TX_SIZES][2];
int n, m;
int s0, s1;
+ const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+ int64_t best_rd = INT64_MAX;
+ TX_SIZE best_tx = TX_4X4;
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
-
- for (n = TX_4X4; n <= max_tx_size; n++) {
- r[n][1] = r[n][0];
- if (r[n][0] == INT_MAX)
- continue;
- for (m = 0; m <= n - (n == max_tx_size); m++) {
- if (m == n)
- r[n][1] += vp9_cost_zero(tx_probs[m]);
- else
- r[n][1] += vp9_cost_one(tx_probs[m]);
- }
- }
-
+ const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
assert(skip_prob > 0);
s0 = vp9_cost_bit(skip_prob, 0);
s1 = vp9_cost_bit(skip_prob, 1);
for (n = TX_4X4; n <= max_tx_size; n++) {
+ r[n][1] = r[n][0];
+ if (r[n][0] < INT_MAX) {
+ for (m = 0; m <= n - (n == max_tx_size); m++) {
+ if (m == n)
+ r[n][1] += vp9_cost_zero(tx_probs[m]);
+ else
+ r[n][1] += vp9_cost_one(tx_probs[m]);
+ }
+ }
if (d[n] == INT64_MAX) {
rd[n][0] = rd[n][1] = INT64_MAX;
- continue;
- }
- if (s[n]) {
+ } else if (s[n]) {
rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
} else {
rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
}
- }
- if (max_tx_size == TX_32X32 &&
- (cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT &&
- rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
- rd[TX_32X32][1] < rd[TX_4X4][1]))) {
- mbmi->tx_size = TX_32X32;
- } else if (max_tx_size >= TX_16X16 &&
- (cm->tx_mode == ALLOW_16X16 ||
- cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT &&
- rd[TX_16X16][1] < rd[TX_8X8][1] &&
- rd[TX_16X16][1] < rd[TX_4X4][1]))) {
- mbmi->tx_size = TX_16X16;
- } else if (cm->tx_mode == ALLOW_8X8 ||
- cm->tx_mode == ALLOW_16X16 ||
- cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
- mbmi->tx_size = TX_8X8;
- } else {
- mbmi->tx_size = TX_4X4;
+ if (rd[n][1] < best_rd) {
+ best_tx = n;
+ best_rd = rd[n][1];
+ }
}
+ mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
+ best_tx : MIN(max_tx_size, max_mode_tx_size);
+
*distortion = d[mbmi->tx_size];
*rate = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
@@ -802,29 +821,18 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
- if (max_tx_size == TX_32X32 &&
- rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
- rd[TX_32X32][1] < rd[TX_4X4][1])
- tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
- else if (max_tx_size >= TX_16X16 &&
- rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
- tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
- else
- tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
- rd[TX_4X4][1] : rd[TX_8X8][1];
- if (max_tx_size == TX_32X32 &&
- rd[TX_32X32][1] < rd[TX_16X16][1] &&
- rd[TX_32X32][1] < rd[TX_8X8][1] &&
- rd[TX_32X32][1] < rd[TX_4X4][1]) {
+ if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
+ tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
cpi->tx_stepdown_count[0]++;
- } else if (max_tx_size >= TX_16X16 &&
- rd[TX_16X16][1] < rd[TX_8X8][1] &&
- rd[TX_16X16][1] < rd[TX_4X4][1]) {
+ } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
+ tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
} else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
+ tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
} else {
+ tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
}
}
@@ -839,19 +847,22 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
- vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
+ vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
int64_t rd[TX_SIZES][2];
int n, m;
int s0, s1;
double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
- // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
-
- const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->mi_8x8[0]);
+ const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+ int64_t best_rd = INT64_MAX;
+ TX_SIZE best_tx = TX_4X4;
- // for (n = TX_4X4; n <= max_txfm_size; n++)
- // r[n][0] = (r[n][0] * scale_r[n]);
+ const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
+ assert(skip_prob > 0);
+ s0 = vp9_cost_bit(skip_prob, 0);
+ s1 = vp9_cost_bit(skip_prob, 1);
for (n = TX_4X4; n <= max_tx_size; n++) {
+ double scale = scale_rd[n];
r[n][1] = r[n][0];
for (m = 0; m <= n - (n == max_tx_size); m++) {
if (m == n)
@@ -859,62 +870,29 @@ static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
else
r[n][1] += vp9_cost_one(tx_probs[m]);
}
- }
-
- assert(skip_prob > 0);
- s0 = vp9_cost_bit(skip_prob, 0);
- s1 = vp9_cost_bit(skip_prob, 1);
-
- for (n = TX_4X4; n <= max_tx_size; n++) {
if (s[n]) {
- rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
+ rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]) * scale;
} else {
- rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
- rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
+ rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]) * scale;
+ rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]) * scale;
+ }
+ if (rd[n][1] < best_rd) {
+ best_rd = rd[n][1];
+ best_tx = n;
}
- }
- for (n = TX_4X4; n <= max_tx_size; n++) {
- rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]);
- rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]);
}
- if (max_tx_size == TX_32X32 &&
- (cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT &&
- rd[TX_32X32][1] <= rd[TX_16X16][1] &&
- rd[TX_32X32][1] <= rd[TX_8X8][1] &&
- rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
- mbmi->tx_size = TX_32X32;
- } else if (max_tx_size >= TX_16X16 &&
- (cm->tx_mode == ALLOW_16X16 ||
- cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT &&
- rd[TX_16X16][1] <= rd[TX_8X8][1] &&
- rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
- mbmi->tx_size = TX_16X16;
- } else if (cm->tx_mode == ALLOW_8X8 ||
- cm->tx_mode == ALLOW_16X16 ||
- cm->tx_mode == ALLOW_32X32 ||
- (cm->tx_mode == TX_MODE_SELECT &&
- rd[TX_8X8][1] <= rd[TX_4X4][1])) {
- mbmi->tx_size = TX_8X8;
- } else {
- mbmi->tx_size = TX_4X4;
- }
+ mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
+ best_tx : MIN(max_tx_size, max_mode_tx_size);
// Actually encode using the chosen mode if a model was used, but do not
// update the r, d costs
txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
&sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
- if (max_tx_size == TX_32X32 &&
- rd[TX_32X32][1] <= rd[TX_16X16][1] &&
- rd[TX_32X32][1] <= rd[TX_8X8][1] &&
- rd[TX_32X32][1] <= rd[TX_4X4][1]) {
+ if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
cpi->tx_stepdown_count[0]++;
- } else if (max_tx_size >= TX_16X16 &&
- rd[TX_16X16][1] <= rd[TX_8X8][1] &&
- rd[TX_16X16][1] <= rd[TX_4X4][1]) {
+ } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
} else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
@@ -934,6 +912,9 @@ static void super_block_yrd(VP9_COMP *cpi,
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
const int b_inter_mode = is_inter_block(mbmi);
+ const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+ TX_SIZE tx_size;
+
assert(bs == mbmi->sb_type);
if (b_inter_mode)
@@ -952,34 +933,16 @@ static void super_block_yrd(VP9_COMP *cpi,
if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
b_inter_mode) {
- if (bs >= BLOCK_32X32)
- model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
- &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
- if (bs >= BLOCK_16X16)
- model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
- &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
-
- model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
- &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
-
- model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
- &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
-
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
+ &r[tx_size][0], &d[tx_size], &s[tx_size]);
choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
skip, sse, ref_best_rd, bs);
} else {
- if (bs >= BLOCK_32X32)
- txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32],
- &s[TX_32X32], &sse[TX_32X32],
- ref_best_rd, 0, bs, TX_32X32);
- if (bs >= BLOCK_16X16)
- txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16],
- &s[TX_16X16], &sse[TX_16X16],
- ref_best_rd, 0, bs, TX_16X16);
- txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
- &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
- txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
- &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
+ for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+ txfm_rd_in_plane(x, rdcost_stack, &r[tx_size][0], &d[tx_size],
+ &s[tx_size], &sse[tx_size],
+ ref_best_rd, 0, bs, tx_size);
choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
skip, txfm_cache, bs);
}
@@ -1024,10 +987,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
struct macroblockd_plane *pd = &xd->plane[0];
const int src_stride = p->src.stride;
const int dst_stride = pd->dst.stride;
- uint8_t *src_init = raster_block_offset_uint8(BLOCK_8X8, ib,
- p->src.buf, src_stride);
- uint8_t *dst_init = raster_block_offset_uint8(BLOCK_8X8, ib,
- pd->dst.buf, dst_stride);
+ const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
+ src_stride)];
+ uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
+ dst_stride)];
int16_t *src_diff, *coeff;
ENTROPY_CONTEXT ta[2], tempa[2];
@@ -1067,9 +1030,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
int64_t ssz;
- const int16_t *scan;
- const int16_t *nb;
- uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
+ const scan_order *so;
+ const uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
const int block = ib + idy * 2 + idx;
TX_TYPE tx_type;
@@ -1085,18 +1047,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
src, src_stride,
dst, dst_stride);
- tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
- get_scan_nb_4x4(tx_type, &scan, &nb);
+ tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
+ so = &vp9_scan_orders[TX_4X4][tx_type];
if (tx_type != DCT_DCT)
vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
else
x->fwd_txm4x4(src_diff, coeff, 8);
- vp9_regular_quantize_b_4x4(x, 16, block, scan, get_iscan_4x4(tx_type));
+ vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
- ratey += cost_coeffs(x, 0, block,
- tempa + idx, templ + idy, TX_4X4, scan, nb);
+ ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+ so->scan, so->neighbors);
distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
16, &ssz) >> 2;
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
@@ -1327,22 +1289,18 @@ static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
}
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+ PICK_MODE_CONTEXT *ctx,
int *rate, int *rate_tokenonly,
int64_t *distortion, int *skippable,
- BLOCK_SIZE bsize) {
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
MB_PREDICTION_MODE mode;
MB_PREDICTION_MODE mode_selected = DC_PRED;
int64_t best_rd = INT64_MAX, this_rd;
int this_rate_tokenonly, this_rate, s;
int64_t this_distortion, this_sse;
- // int mode_mask = (bsize <= BLOCK_8X8)
- // ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
-
- for (mode = DC_PRED; mode <= TM_PRED; mode ++) {
- // if (!(mode_mask & (1 << mode)))
- if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]]
- & (1 << mode)))
+ for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
+ if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
continue;
x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
@@ -1362,11 +1320,31 @@ static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
*rate_tokenonly = this_rate_tokenonly;
*distortion = this_distortion;
*skippable = s;
+ if (!x->select_txfm_size) {
+ int i;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = x->e_mbd.plane;
+ for (i = 1; i < MAX_MB_PLANE; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][2];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+ p[i].eobs = ctx->eobs_pbuf[i][2];
+
+ ctx->coeff_pbuf[i][2] = ctx->coeff_pbuf[i][0];
+ ctx->qcoeff_pbuf[i][2] = ctx->qcoeff_pbuf[i][0];
+ ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0];
+ ctx->eobs_pbuf[i][2] = ctx->eobs_pbuf[i][0];
+
+ ctx->coeff_pbuf[i][0] = p[i].coeff;
+ ctx->qcoeff_pbuf[i][0] = p[i].qcoeff;
+ ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
+ ctx->eobs_pbuf[i][0] = p[i].eobs;
+ }
+ }
}
}
x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected;
-
return best_rd;
}
@@ -1387,7 +1365,8 @@ static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
return this_rd;
}
-static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
+static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size,
int *rate_uv, int *rate_uv_tokenonly,
int64_t *dist_uv, int *skip_uv,
MB_PREDICTION_MODE *mode_uv) {
@@ -1401,9 +1380,9 @@ static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
// Else do a proper rd search for each possible transform size that may
// be considered in the main rd loop.
} else {
- rd_pick_intra_sbuv_mode(cpi, x,
+ rd_pick_intra_sbuv_mode(cpi, x, ctx,
rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
- bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
+ bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
}
*mode_uv = x->e_mbd.mi_8x8[0]->mbmi.uv_mode;
}
@@ -1417,7 +1396,7 @@ static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
// Don't account for mode here if segment skip is enabled.
if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
assert(is_inter_mode(mode));
- return x->inter_mode_cost[mode_context][inter_mode_offset(mode)];
+ return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
} else {
return 0;
}
@@ -1523,21 +1502,22 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
struct macroblockd_plane *const pd = &xd->plane[0];
struct macroblock_plane *const p = &x->plane[0];
MODE_INFO *const mi = xd->mi_8x8[0];
- const BLOCK_SIZE bsize = mi->mbmi.sb_type;
- const int width = plane_block_width(bsize, pd);
- const int height = plane_block_height(bsize, pd);
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
+ const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+ const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
int idx, idy;
- uint8_t *const src = raster_block_offset_uint8(BLOCK_8X8, i,
- p->src.buf, p->src.stride);
- uint8_t *const dst = raster_block_offset_uint8(BLOCK_8X8, i,
- pd->dst.buf, pd->dst.stride);
+ const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
+ p->src.stride)];
+ uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
+ pd->dst.stride)];
int64_t thisdistortion = 0, thissse = 0;
int thisrate = 0, ref;
+ const scan_order *so = &vp9_default_scan_orders[TX_4X4];
const int is_compound = has_second_ref(&mi->mbmi);
for (ref = 0; ref < 1 + is_compound; ++ref) {
- const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i,
- pd->pre[ref].buf, pd->pre[ref].stride);
+ const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
+ pd->pre[ref].stride)];
vp9_build_inter_predictor(pre, pd->pre[ref].stride,
dst, pd->dst.stride,
&mi->bmi[i].as_mv[ref].as_mv,
@@ -1560,16 +1540,12 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
coeff = BLOCK_OFFSET(p->coeff, k);
x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
- vp9_regular_quantize_b_4x4(x, 16, k, get_scan_4x4(DCT_DCT),
- get_iscan_4x4(DCT_DCT));
+ vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
16, &ssz);
thissse += ssz;
- thisrate += cost_coeffs(x, 0, k,
- ta + (k & 1),
- tl + (k >> 1), TX_4X4,
- vp9_default_scan_4x4,
- vp9_default_scan_4x4_neighbors);
+ thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
+ so->scan, so->neighbors);
rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
rd = MIN(rd1, rd2);
@@ -1625,14 +1601,13 @@ static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
- p->src.buf = raster_block_offset_uint8(BLOCK_8X8, i, p->src.buf,
- p->src.stride);
+ p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
- pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf,
- pd->pre[0].stride);
+ pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
+ pd->pre[0].stride)];
if (has_second_ref(mbmi))
- pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf,
- pd->pre[1].stride);
+ pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
+ pd->pre[1].stride)];
}
static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
@@ -1654,6 +1629,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
MB_PREDICTION_MODE this_mode;
MODE_INFO *mi = x->e_mbd.mi_8x8[0];
MB_MODE_INFO *const mbmi = &mi->mbmi;
+ struct macroblock_plane *const p = &x->plane[0];
struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
const int label_count = 4;
int64_t this_segment_rd = 0;
@@ -1693,22 +1669,22 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0;
vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
+ i, 0, mi_row, mi_col,
&frame_mv[NEARESTMV][mbmi->ref_frame[0]],
- &frame_mv[NEARMV][mbmi->ref_frame[0]],
- i, 0, mi_row, mi_col);
+ &frame_mv[NEARMV][mbmi->ref_frame[0]]);
if (has_second_rf) {
frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
+ i, 1, mi_row, mi_col,
&frame_mv[NEARESTMV][mbmi->ref_frame[1]],
- &frame_mv[NEARMV][mbmi->ref_frame[1]],
- i, 1, mi_row, mi_col);
+ &frame_mv[NEARMV][mbmi->ref_frame[1]]);
}
// search for the best motion vector on this segment
for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
const struct buf_2d orig_src = x->plane[0].src;
struct buf_2d orig_pre[2];
- mode_idx = inter_mode_offset(this_mode);
+ mode_idx = INTER_OFFSET(this_mode);
bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
// if we're near/nearest and mv == 0,0, compare to zeromv
@@ -1804,6 +1780,9 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
// adjust src pointer for this block
mi_buf_shift(x, i);
+
+ vp9_set_mv_search_range(x, &bsi->ref_mv->as_mv);
+
if (cpi->sf.search_method == HEX) {
bestsme = vp9_hex_search(x, &mvp_full.as_mv,
step_param,
@@ -1834,10 +1813,10 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
clamp_mv(&mvp_full.as_mv, x->mv_col_min, x->mv_col_max,
x->mv_row_min, x->mv_row_max);
- thissme = cpi->full_search_sad(x, &mvp_full,
+ thissme = cpi->full_search_sad(x, &mvp_full.as_mv,
sadpb, 16, v_fn_ptr,
x->nmvjointcost, x->mvcost,
- bsi->ref_mv, i);
+ &bsi->ref_mv->as_mv, i);
if (thissme < bestsme) {
bestsme = thissme;
@@ -1872,12 +1851,14 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
mi_buf_restore(x, orig_src, orig_pre);
}
- if (has_second_rf && this_mode == NEWMV &&
- mbmi->interp_filter == EIGHTTAP) {
+ if (has_second_rf) {
if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
continue;
+ }
+ if (has_second_rf && this_mode == NEWMV &&
+ mbmi->interp_filter == EIGHTTAP) {
// adjust src pointers
mi_buf_shift(x, i);
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
@@ -1900,6 +1881,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
x->mvcost, cpi);
+
bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int;
if (num_4x4_blocks_wide > 1)
bsi->rdstat[i + 1][mode_idx].mvs[0].as_int =
@@ -1979,11 +1961,11 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
bsi->rdstat[i][mode_idx].brate, 0);
bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
- bsi->rdstat[i][mode_idx].eobs = pd->eobs[i];
+ bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
if (num_4x4_blocks_wide > 1)
- bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1];
+ bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
if (num_4x4_blocks_high > 1)
- bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2];
+ bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
}
if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
@@ -2001,7 +1983,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
return;
}
- mode_idx = inter_mode_offset(mode_selected);
+ mode_idx = INTER_OFFSET(mode_selected);
vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
@@ -2077,11 +2059,11 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
return INT64_MAX;
/* set it to the best */
for (i = 0; i < 4; i++) {
- mode_idx = inter_mode_offset(bsi->modes[i]);
+ mode_idx = INTER_OFFSET(bsi->modes[i]);
mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
if (has_second_ref(mbmi))
mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
- xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
+ x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
mi->bmi[i].as_mode = bsi->modes[i];
}
@@ -2091,7 +2073,7 @@ static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
*returntotrate = bsi->r;
*returndistortion = bsi->d;
*returnyrate = bsi->segment_yrate;
- *skippable = vp9_is_skippable_in_plane(&x->e_mbd, BLOCK_8X8, 0);
+ *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
*psse = bsi->sse;
mbmi->mode = bsi->modes[3];
@@ -2109,7 +2091,7 @@ static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
int best_index = 0;
int best_sad = INT_MAX;
int this_sad = INT_MAX;
- unsigned int max_mv = 0;
+ int max_mv = 0;
uint8_t *src_y_ptr = x->plane[0].src.buf;
uint8_t *ref_y_ptr;
@@ -2165,11 +2147,11 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
vpx_memset(ref_costs_comp, 0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
*comp_mode_p = 128;
} else {
- vp9_prob intra_inter_p = vp9_get_pred_prob_intra_inter(cm, xd);
+ vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
vp9_prob comp_inter_p = 128;
- if (cm->comp_pred_mode == HYBRID_PREDICTION) {
- comp_inter_p = vp9_get_pred_prob_comp_inter_inter(cm, xd);
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
*comp_mode_p = comp_inter_p;
} else {
*comp_mode_p = 128;
@@ -2177,12 +2159,12 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
- if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
+ if (cm->reference_mode != COMPOUND_REFERENCE) {
vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
base_cost += vp9_cost_bit(comp_inter_p, 0);
ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
@@ -2197,11 +2179,11 @@ static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
ref_costs_single[GOLDEN_FRAME] = 512;
ref_costs_single[ALTREF_FRAME] = 512;
}
- if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
+ if (cm->reference_mode != SINGLE_REFERENCE) {
vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
- if (cm->comp_pred_mode == HYBRID_PREDICTION)
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
base_cost += vp9_cost_bit(comp_inter_p, 1);
ref_costs_comp[LAST_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 0);
@@ -2217,7 +2199,7 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
int mode_index,
int_mv *ref_mv,
int_mv *second_ref_mv,
- int64_t comp_pred_diff[NB_PREDICTION_TYPES],
+ int64_t comp_pred_diff[REFERENCE_MODES],
int64_t tx_size_diff[TX_MODES],
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
MACROBLOCKD *const xd = &x->e_mbd;
@@ -2231,9 +2213,9 @@ static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
ctx->best_ref_mv.as_int = ref_mv->as_int;
ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
- ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_PREDICTION_ONLY];
- ctx->comp_pred_diff = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
- ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
+ ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
+ ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
+ ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
@@ -2343,7 +2325,6 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int_mv mvp_full;
int ref = mbmi->ref_frame[0];
int_mv ref_mv = mbmi->ref_mvs[ref][0];
- const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
int tmp_col_min = x->mv_col_min;
int tmp_col_max = x->mv_col_max;
@@ -2363,7 +2344,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
}
- vp9_clamp_mv_min_max(x, &ref_mv.as_mv);
+ vp9_set_mv_search_range(x, &ref_mv.as_mv);
// Adjust search parameters based on small partitions' result.
if (x->fast_ms) {
@@ -2417,24 +2398,24 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
bestsme = vp9_hex_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1,
- &cpi->fn_ptr[block_size], 1,
+ &cpi->fn_ptr[bsize], 1,
&ref_mv.as_mv, &tmp_mv->as_mv);
} else if (cpi->sf.search_method == SQUARE) {
bestsme = vp9_square_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1,
- &cpi->fn_ptr[block_size], 1,
+ &cpi->fn_ptr[bsize], 1,
&ref_mv.as_mv, &tmp_mv->as_mv);
} else if (cpi->sf.search_method == BIGDIA) {
bestsme = vp9_bigdia_search(x, &mvp_full.as_mv,
step_param,
sadpb, 1,
- &cpi->fn_ptr[block_size], 1,
+ &cpi->fn_ptr[bsize], 1,
&ref_mv.as_mv, &tmp_mv->as_mv);
} else {
bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
sadpb, further_steps, 1,
- &cpi->fn_ptr[block_size],
+ &cpi->fn_ptr[bsize],
&ref_mv, tmp_mv);
}
@@ -2449,7 +2430,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
cm->allow_high_precision_mv,
x->errorperbit,
- &cpi->fn_ptr[block_size],
+ &cpi->fn_ptr[bsize],
0, cpi->sf.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&dis, &sse);
@@ -2473,57 +2454,44 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
int *rate_mv) {
- int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
+ const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
- int refs[2] = { mbmi->ref_frame[0],
- (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ const int refs[2] = { mbmi->ref_frame[0],
+ mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
int_mv ref_mv[2];
- const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
- int ite;
+ int ite, ref;
// Prediction buffer from second frame.
uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
// Do joint motion search in compound mode to get more accurate mv.
- struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
- struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
- struct buf_2d scaled_first_yv12;
+ struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+ struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
int last_besterr[2] = {INT_MAX, INT_MAX};
- YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
- scaled_ref_frame[0] = get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
- scaled_ref_frame[1] = get_scaled_ref_frame(cpi, mbmi->ref_frame[1]);
-
- ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
- ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
-
- if (scaled_ref_frame[0]) {
- int i;
- // Swap out the reference frame for a version that's been scaled to
- // match the resolution of the current frame, allowing the existing
- // motion search code to be used without additional modifications.
- for (i = 0; i < MAX_MB_PLANE; i++)
- backup_yv12[i] = xd->plane[i].pre[0];
- setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL);
- }
+ YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+ get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
+ get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
+ };
- if (scaled_ref_frame[1]) {
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++)
- backup_second_yv12[i] = xd->plane[i].pre[1];
+ for (ref = 0; ref < 2; ++ref) {
+ ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
+
+ if (scaled_ref_frame[ref]) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[ref][i] = xd->plane[i].pre[ref];
+ setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, NULL);
+ }
- setup_pre_planes(xd, 1, scaled_ref_frame[1], mi_row, mi_col, NULL);
+ xd->scale_factor[ref].sfc->set_scaled_offsets(&xd->scale_factor[ref],
+ mi_row, mi_col);
+ frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
}
- xd->scale_factor[0].sfc->set_scaled_offsets(&xd->scale_factor[0],
- mi_row, mi_col);
- xd->scale_factor[1].sfc->set_scaled_offsets(&xd->scale_factor[1],
- mi_row, mi_col);
- scaled_first_yv12 = xd->plane[0].pre[0];
-
- // Initialize mv using single prediction mode result.
- frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
- frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
-
// Allow joint search multiple times iteratively for each ref frame
// and break out the search loop if it couldn't find better mv.
for (ite = 0; ite < 4; ite++) {
@@ -2555,7 +2523,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
// Compound motion search on first ref frame.
if (id)
xd->plane[0].pre[0] = ref_yv12[id];
- vp9_clamp_mv_min_max(x, &ref_mv[id].as_mv);
+ vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
// Use mv result from single mode as mvp.
tmp_mv.as_int = frame_mv[refs[id]].as_int;
@@ -2566,7 +2534,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
// Small-range full-pixel motion search
bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
search_range,
- &cpi->fn_ptr[block_size],
+ &cpi->fn_ptr[bsize],
x->nmvjointcost, x->mvcost,
&ref_mv[id], second_pred,
pw, ph);
@@ -2585,7 +2553,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
&ref_mv[id].as_mv,
cpi->common.allow_high_precision_mv,
x->errorperbit,
- &cpi->fn_ptr[block_size],
+ &cpi->fn_ptr[bsize],
0, cpi->sf.subpel_iters_per_step,
x->nmvjointcost, x->mvcost,
&dis, &sse, second_pred,
@@ -2603,28 +2571,34 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- // restore the predictor
- if (scaled_ref_frame[0]) {
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++)
- xd->plane[i].pre[0] = backup_yv12[i];
- }
+ *rate_mv = 0;
- if (scaled_ref_frame[1]) {
- int i;
- for (i = 0; i < MAX_MB_PLANE; i++)
- xd->plane[i].pre[1] = backup_second_yv12[i];
+ for (ref = 0; ref < 2; ++ref) {
+ if (scaled_ref_frame[ref]) {
+ // restore the predictor
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[ref] = backup_yv12[ref][i];
+ }
+
+ *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+ &mbmi->ref_mvs[refs[ref]][0].as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
- *rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
- &mbmi->ref_mvs[refs[0]][0].as_mv,
- x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
- *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
- &mbmi->ref_mvs[refs[1]][0].as_mv,
- x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
vpx_free(second_pred);
}
+static INLINE void restore_dst_buf(MACROBLOCKD *xd,
+ uint8_t *orig_dst[MAX_MB_PLANE],
+ int orig_dst_stride[MAX_MB_PLANE]) {
+ int i;
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ xd->plane[i].dst.buf = orig_dst[i];
+ xd->plane[i].dst.stride = orig_dst_stride[i];
+ }
+}
+
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
const TileInfo *const tile,
BLOCK_SIZE bsize,
@@ -2661,6 +2635,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int orig_dst_stride[MAX_MB_PLANE];
int rs = 0;
+ if (is_comp_pred) {
+ if (frame_mv[refs[0]].as_int == INVALID_MV ||
+ frame_mv[refs[1]].as_int == INVALID_MV)
+ return INT64_MAX;
+ }
+
if (this_mode == NEWMV) {
int rate_mv;
if (is_comp_pred) {
@@ -2679,9 +2659,6 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
&mbmi->ref_mvs[refs[1]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
- if (frame_mv[refs[0]].as_int == INVALID_MV ||
- frame_mv[refs[1]].as_int == INVALID_MV)
- return INT64_MAX;
*rate2 += rate_mv;
} else {
int_mv tmp_mv;
@@ -2758,13 +2735,10 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
*rate2 += cost_mv_ref(cpi, this_mode,
mbmi->mode_context[mbmi->ref_frame[0]]);
- if (!(*mode_excluded)) {
- if (is_comp_pred) {
- *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
- } else {
- *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
- }
- }
+ if (!(*mode_excluded))
+ *mode_excluded = is_comp_pred
+ ? cpi->common.reference_mode == SINGLE_REFERENCE
+ : cpi->common.reference_mode == COMPOUND_REFERENCE;
pred_exists = 0;
// Are all MVs integer pel for Y and UV
@@ -2773,6 +2747,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (is_comp_pred)
intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 &&
(mbmi->mv[1].as_mv.col & 15) == 0;
+
// Search for best switchable filter by checking the variance of
// pred error irrespective of whether the filter will be used
if (cm->mcomp_filter_type != BILINEAR) {
@@ -2812,10 +2787,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
(cm->mcomp_filter_type != SWITCHABLE &&
(cm->mcomp_filter_type == mbmi->interp_filter ||
(i == 0 && intpel_mv)))) {
- for (j = 0; j < MAX_MB_PLANE; j++) {
- xd->plane[j].dst.buf = orig_dst[j];
- xd->plane[j].dst.stride = orig_dst_stride[j];
- }
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
} else {
for (j = 0; j < MAX_MB_PLANE; j++) {
xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
@@ -2839,10 +2811,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
if (rd / 2 > ref_best_rd) {
- for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].dst.buf = orig_dst[i];
- xd->plane[i].dst.stride = orig_dst_stride[i];
- }
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
return INT64_MAX;
}
}
@@ -2861,11 +2830,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
pred_exists = 1;
}
}
-
- for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].dst.buf = orig_dst[i];
- xd->plane[i].dst.stride = orig_dst_stride[i];
- }
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
}
}
// Set the appropriate filter
@@ -2897,10 +2862,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
// if current pred_error modeled rd is substantially more than the best
// so far, do not bother doing full rd
if (rd / 2 > ref_best_rd) {
- for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].dst.buf = orig_dst[i];
- xd->plane[i].dst.stride = orig_dst_stride[i];
- }
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
return INT64_MAX;
}
}
@@ -2975,7 +2937,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
x->skip = 1;
// The cost of skip bit needs to be added.
- *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
+ *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
// Scaling factor for SSE from spatial domain to frequency domain
// is 16. Adjust distortion accordingly.
@@ -3003,10 +2965,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (*rate_y == INT_MAX) {
*rate2 = INT_MAX;
*distortion = INT64_MAX;
- for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].dst.buf = orig_dst[i];
- xd->plane[i].dst.stride = orig_dst_stride[i];
- }
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
return INT64_MAX;
}
@@ -3021,10 +2980,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (*rate_uv == INT_MAX) {
*rate2 = INT_MAX;
*distortion = INT64_MAX;
- for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].dst.buf = orig_dst[i];
- xd->plane[i].dst.stride = orig_dst_stride[i];
- }
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
return INT64_MAX;
}
@@ -3034,14 +2990,34 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
*skippable = skippable_y && skippable_uv;
}
- for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].dst.buf = orig_dst[i];
- xd->plane[i].dst.stride = orig_dst_stride[i];
- }
-
+ restore_dst_buf(xd, orig_dst, orig_dst_stride);
return this_rd; // if 0, this will be re-calculated by caller
}
+static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+ int max_plane) {
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = x->e_mbd.plane;
+ int i;
+
+ for (i = 0; i < max_plane; ++i) {
+ p[i].coeff = ctx->coeff_pbuf[i][1];
+ p[i].qcoeff = ctx->qcoeff_pbuf[i][1];
+ pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
+ p[i].eobs = ctx->eobs_pbuf[i][1];
+
+ ctx->coeff_pbuf[i][1] = ctx->coeff_pbuf[i][0];
+ ctx->qcoeff_pbuf[i][1] = ctx->qcoeff_pbuf[i][0];
+ ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0];
+ ctx->eobs_pbuf[i][1] = ctx->eobs_pbuf[i][0];
+
+ ctx->coeff_pbuf[i][0] = p[i].coeff;
+ ctx->qcoeff_pbuf[i][0] = p[i].qcoeff;
+ ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
+ ctx->eobs_pbuf[i][0] = p[i].eobs;
+ }
+}
+
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int *returnrate, int64_t *returndist,
BLOCK_SIZE bsize,
@@ -3051,9 +3027,11 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
int y_skip = 0, uv_skip = 0;
int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
+ TX_SIZE max_uv_tx_size;
x->skip_encode = 0;
ctx->skip = 0;
xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
+
if (bsize >= BLOCK_8X8) {
if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
&dist_y, &y_skip, bsize, tx_cache,
@@ -3061,8 +3039,9 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*returnrate = INT_MAX;
return;
}
- rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
- &dist_uv, &uv_skip, bsize);
+ max_uv_tx_size = get_uv_tx_size_impl(xd->mi_8x8[0]->mbmi.tx_size, bsize);
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
+ &dist_uv, &uv_skip, bsize, max_uv_tx_size);
} else {
y_skip = 0;
if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
@@ -3070,19 +3049,19 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*returnrate = INT_MAX;
return;
}
- rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
- &dist_uv, &uv_skip, BLOCK_8X8);
+ max_uv_tx_size = get_uv_tx_size_impl(xd->mi_8x8[0]->mbmi.tx_size, bsize);
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
+ &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size);
}
if (y_skip && uv_skip) {
*returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
- vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
+ vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
*returndist = dist_y + dist_uv;
vp9_zero(ctx->tx_rd_diff);
} else {
int i;
- *returnrate = rate_y + rate_uv +
- vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0);
+ *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
*returndist = dist_y + dist_uv;
if (cpi->sf.tx_size_search_method == USE_FULL_RD)
for (i = 0; i < TX_MODES; i++) {
@@ -3125,8 +3104,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_rd = best_rd_so_far;
int64_t best_tx_rd[TX_MODES];
int64_t best_tx_diff[TX_MODES];
- int64_t best_pred_diff[NB_PREDICTION_TYPES];
- int64_t best_pred_rd[NB_PREDICTION_TYPES];
+ int64_t best_pred_diff[REFERENCE_MODES];
+ int64_t best_pred_rd[REFERENCE_MODES];
int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode = { 0 };
@@ -3153,7 +3132,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
int best_skip2 = 0;
- x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
+ x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
// Everywhere the flag is set the error is much higher than its neighbors.
ctx->frames_with_high_error = 0;
@@ -3162,7 +3141,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
- for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ for (i = 0; i < REFERENCE_MODES; ++i)
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_MODES; i++)
best_tx_rd[i] = INT64_MAX;
@@ -3192,8 +3171,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
case BLOCK_32X32:
for (i = 0; i < 4; i++) {
ref_frame_mask |=
- x->mb_context[xd->sb_index][i].frames_with_high_error;
- mode_mask |= x->mb_context[xd->sb_index][i].modes_with_high_error;
+ x->mb_context[x->sb_index][i].frames_with_high_error;
+ mode_mask |= x->mb_context[x->sb_index][i].modes_with_high_error;
}
break;
default:
@@ -3258,7 +3237,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
break;
case NONE:
case MAX_REF_FRAMES:
- assert(!"Invalid Reference frame");
+ assert(0 && "Invalid Reference frame");
}
}
if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
@@ -3335,17 +3314,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (comp_pred) {
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
continue;
- set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
- mode_excluded = mode_excluded
- ? mode_excluded
- : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
+ mode_excluded = mode_excluded ? mode_excluded
+ : cm->reference_mode == SINGLE_REFERENCE;
} else {
- if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
- mode_excluded =
- mode_excluded ?
- mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
- }
+ if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME)
+ mode_excluded = mode_excluded ?
+ mode_excluded : cm->reference_mode == COMPOUND_REFERENCE;
}
// Select prediction reference frames.
@@ -3434,12 +3410,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (rate_y == INT_MAX)
continue;
- uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]);
+ uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize);
if (rate_uv_intra[uv_tx] == INT_MAX) {
- choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx],
- &rate_uv_tokenonly[uv_tx],
- &dist_uv[uv_tx], &skip_uv[uv_tx],
- &mode_uv[uv_tx]);
+ choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
+ &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
+ &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
}
rate_uv = rate_uv_tokenonly[uv_tx];
@@ -3467,9 +3442,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
continue;
}
- if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ if (cm->reference_mode == REFERENCE_MODE_SELECT)
rate2 += compmode_cost;
- }
// Estimate the reference frame signaling cost and add it
// to the rolling cost variable.
@@ -3498,9 +3472,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int prob_skip_cost;
// Cost the skip mb case
- vp9_prob skip_prob =
- vp9_get_pred_prob_mbskip(cm, xd);
-
+ vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
if (skip_prob) {
prob_skip_cost = vp9_cost_bit(skip_prob, 1);
rate2 += prob_skip_cost;
@@ -3510,14 +3482,10 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
// Add in the cost of the no skip flag.
- int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
- 0);
- rate2 += prob_skip_cost;
+ rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
} else {
// FIXME(rbultje) make this work for splitmv also
- int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
- 1);
- rate2 += prob_skip_cost;
+ rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
distortion2 = total_sse;
assert(total_sse >= 0);
rate2 -= (rate_y + rate_uv);
@@ -3527,9 +3495,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
} else if (mb_skip_allowed) {
// Add in the cost of the no skip flag.
- int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
- 0);
- rate2 += prob_skip_cost;
+ rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
}
// Calculate the final RD estimate for this mode.
@@ -3537,23 +3503,22 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
// Keep record of best intra rd
- if (xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME &&
- is_intra_mode(xd->mi_8x8[0]->mbmi.mode) &&
+ if (!is_inter_block(&xd->mi_8x8[0]->mbmi) &&
this_rd < best_intra_rd) {
best_intra_rd = this_rd;
best_intra_mode = xd->mi_8x8[0]->mbmi.mode;
}
+
// Keep record of best inter rd with single reference
- if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME &&
- xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE &&
- !mode_excluded &&
- this_rd < best_inter_rd) {
+ if (is_inter_block(&xd->mi_8x8[0]->mbmi) &&
+ !has_second_ref(&xd->mi_8x8[0]->mbmi) &&
+ !mode_excluded && this_rd < best_inter_rd) {
best_inter_rd = this_rd;
best_inter_ref_frame = ref_frame;
}
if (!disable_skip && ref_frame == INTRA_FRAME) {
- for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ for (i = 0; i < REFERENCE_MODES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
@@ -3571,6 +3536,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// Did this mode help.. i.e. is it the new best mode
if (this_rd < best_rd || x->skip) {
+ int max_plane = MAX_MB_PLANE;
if (!mode_excluded) {
// Note index of best mode so far
best_mode_index = mode_index;
@@ -3578,6 +3544,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (ref_frame == INTRA_FRAME) {
/* required for left and above block mv */
mbmi->mv[0].as_int = 0;
+ max_plane = 1;
}
*returnrate = rate2;
@@ -3585,6 +3552,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_rd = this_rd;
best_mbmode = *mbmi;
best_skip2 = this_skip2;
+ if (!x->select_txfm_size)
+ swap_block_ptr(x, ctx, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
sizeof(uint8_t) * ctx->num_4x4_blk);
@@ -3609,9 +3578,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
/* keep record of best compound/single-only prediction */
if (!disable_skip && ref_frame != INTRA_FRAME) {
- int single_rd, hybrid_rd, single_rate, hybrid_rate;
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
- if (cm->comp_pred_mode == HYBRID_PREDICTION) {
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
single_rate = rate2 - compmode_cost;
hybrid_rate = rate2;
} else {
@@ -3623,14 +3592,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
if (second_ref_frame <= INTRA_FRAME &&
- single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
- best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+ single_rd < best_pred_rd[SINGLE_REFERENCE]) {
+ best_pred_rd[SINGLE_REFERENCE] = single_rd;
} else if (second_ref_frame > INTRA_FRAME &&
- single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
- best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+ single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
+ best_pred_rd[COMPOUND_REFERENCE] = single_rd;
}
- if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
- best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+ if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+ best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
}
/* keep record of best filter type */
@@ -3690,12 +3659,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->sf.use_uv_intra_rd_estimate) {
// Do Intra UV best rd mode selection if best mode choice above was intra.
if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
- TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
- rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+ TX_SIZE uv_tx_size;
+ *mbmi = best_mbmode;
+ uv_tx_size = get_uv_tx_size(mbmi);
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
&rate_uv_tokenonly[uv_tx_size],
&dist_uv[uv_tx_size],
&skip_uv[uv_tx_size],
- bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
+ bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
+ uv_tx_size);
}
}
@@ -3725,7 +3697,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
assert((cm->mcomp_filter_type == SWITCHABLE) ||
(cm->mcomp_filter_type == best_mbmode.interp_filter) ||
- (best_mbmode.ref_frame[0] == INTRA_FRAME));
+ !is_inter_block(&best_mbmode));
// Updating rd_thresh_freq_fact[] here means that the different
// partition/block sizes are handled independently based on the best
@@ -3752,7 +3724,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
*mbmi = best_mbmode;
x->skip |= best_skip2;
- for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+ for (i = 0; i < REFERENCE_MODES; ++i) {
if (best_pred_rd[i] == INT64_MAX)
best_pred_diff[i] = INT_MIN;
else
@@ -3823,8 +3795,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_yrd = best_rd_so_far; // FIXME(rbultje) more precise
int64_t best_tx_rd[TX_MODES];
int64_t best_tx_diff[TX_MODES];
- int64_t best_pred_diff[NB_PREDICTION_TYPES];
- int64_t best_pred_rd[NB_PREDICTION_TYPES];
+ int64_t best_pred_diff[REFERENCE_MODES];
+ int64_t best_pred_rd[REFERENCE_MODES];
int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
MB_MODE_INFO best_mbmode = { 0 };
@@ -3839,15 +3811,13 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
int skip_uv[TX_SIZES];
MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 };
struct scale_factors scale_factor[4];
- unsigned int ref_frame_mask = 0;
- unsigned int mode_mask = 0;
int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
cpi->common.y_dc_delta_q);
int_mv seg_mvs[4][MAX_REF_FRAMES];
b_mode_info best_bmodes[4];
int best_skip2 = 0;
- x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
+ x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
for (i = 0; i < 4; i++) {
@@ -3859,7 +3829,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
&comp_mode_p);
- for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ for (i = 0; i < REFERENCE_MODES; ++i)
best_pred_rd[i] = INT64_MAX;
for (i = 0; i < TX_MODES; i++)
best_tx_rd[i] = INT64_MAX;
@@ -3870,15 +3840,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
*returnrate = INT_MAX;
- // Create a mask set to 1 for each reference frame used by a smaller
- // resolution.
- if (cpi->sf.use_avoid_tested_higherror) {
- ref_frame_mask = 0;
- mode_mask = 0;
- ref_frame_mask = ~ref_frame_mask;
- mode_mask = ~mode_mask;
- }
-
for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame,
@@ -3930,7 +3891,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
break;
case NONE:
case MAX_REF_FRAMES:
- assert(!"Invalid Reference frame");
+ assert(0 && "Invalid Reference frame");
}
}
if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
@@ -4001,14 +3962,12 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
continue;
set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
- mode_excluded = mode_excluded
- ? mode_excluded
- : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
+ mode_excluded = mode_excluded ? mode_excluded
+ : cm->reference_mode == SINGLE_REFERENCE;
} else {
if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
- mode_excluded =
- mode_excluded ?
- mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
+ mode_excluded = mode_excluded ?
+ mode_excluded : cm->reference_mode == COMPOUND_REFERENCE;
}
}
@@ -4060,7 +4019,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
distortion2 += distortion_y;
if (rate_uv_intra[TX_4X4] == INT_MAX) {
- choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
+ choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4,
+ &rate_uv_intra[TX_4X4],
&rate_uv_tokenonly[TX_4X4],
&dist_uv[TX_4X4], &skip_uv[TX_4X4],
&mode_uv[TX_4X4]);
@@ -4104,6 +4064,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
cpi->sf.disable_filter_search_var_thresh) {
tmp_best_filter = EIGHTTAP;
vp9_zero(cpi->rd_filter_cache);
+ } else if (cpi->sf.adaptive_pred_filter_type == 1 &&
+ ctx->pred_filter_type < SWITCHABLE) {
+ tmp_best_filter = ctx->pred_filter_type;
+ vp9_zero(cpi->rd_filter_cache);
+ } else if (cpi->sf.adaptive_pred_filter_type == 2) {
+ tmp_best_filter = ctx->pred_filter_type < SWITCHABLE ?
+ ctx->pred_filter_type : 0;
+ vp9_zero(cpi->rd_filter_cache);
} else {
for (switchable_filter_index = 0;
switchable_filter_index < SWITCHABLE_FILTERS;
@@ -4151,7 +4119,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
tmp_best_mbmode = *mbmi;
for (i = 0; i < 4; i++) {
tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
- x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i];
+ x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
}
pred_exists = 1;
if (switchable_filter_index == 0 &&
@@ -4170,7 +4138,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- if (tmp_best_rdu == INT64_MAX)
+ if (tmp_best_rdu == INT64_MAX && pred_exists)
continue;
mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
@@ -4212,12 +4180,11 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->common.mcomp_filter_type == SWITCHABLE)
rate2 += get_switchable_rate(x);
- if (!mode_excluded) {
- if (comp_pred)
- mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
- else
- mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
- }
+ if (!mode_excluded)
+ mode_excluded = comp_pred
+ ? cpi->common.reference_mode == SINGLE_REFERENCE
+ : cpi->common.reference_mode == COMPOUND_REFERENCE;
+
compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
tmp_best_rdu = best_rd -
@@ -4244,9 +4211,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
}
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ if (cpi->common.reference_mode == REFERENCE_MODE_SELECT)
rate2 += compmode_cost;
- }
// Estimate the reference frame signaling cost and add it
// to the rolling cost variable.
@@ -4269,14 +4235,10 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
// Add in the cost of the no skip flag.
- int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
- 0);
- rate2 += prob_skip_cost;
+ rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
} else {
// FIXME(rbultje) make this work for splitmv also
- int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
- 1);
- rate2 += prob_skip_cost;
+ rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
distortion2 = total_sse;
assert(total_sse >= 0);
rate2 -= (rate_y + rate_uv);
@@ -4286,9 +4248,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
} else if (mb_skip_allowed) {
// Add in the cost of the no skip flag.
- int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
- 0);
- rate2 += prob_skip_cost;
+ rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
}
// Calculate the final RD estimate for this mode.
@@ -4296,8 +4256,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
// Keep record of best inter rd with single reference
- if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME &&
- xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE &&
+ if (is_inter_block(&xd->mi_8x8[0]->mbmi) &&
+ !has_second_ref(&xd->mi_8x8[0]->mbmi) &&
!mode_excluded &&
this_rd < best_inter_rd) {
best_inter_rd = this_rd;
@@ -4305,7 +4265,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
}
if (!disable_skip && ref_frame == INTRA_FRAME) {
- for (i = 0; i < NB_PREDICTION_TYPES; ++i)
+ for (i = 0; i < REFERENCE_MODES; ++i)
best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
@@ -4314,12 +4274,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
// Did this mode help.. i.e. is it the new best mode
if (this_rd < best_rd || x->skip) {
if (!mode_excluded) {
+ int max_plane = MAX_MB_PLANE;
// Note index of best mode so far
best_mode_index = mode_index;
if (ref_frame == INTRA_FRAME) {
/* required for left and above block mv */
mbmi->mv[0].as_int = 0;
+ max_plane = 1;
}
*returnrate = rate2;
@@ -4329,6 +4291,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
best_mbmode = *mbmi;
best_skip2 = this_skip2;
+ if (!x->select_txfm_size)
+ swap_block_ptr(x, ctx, max_plane);
vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
sizeof(uint8_t) * ctx->num_4x4_blk);
@@ -4356,9 +4320,9 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
/* keep record of best compound/single-only prediction */
if (!disable_skip && ref_frame != INTRA_FRAME) {
- int single_rd, hybrid_rd, single_rate, hybrid_rate;
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
- if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
+ if (cpi->common.reference_mode == REFERENCE_MODE_SELECT) {
single_rate = rate2 - compmode_cost;
hybrid_rate = rate2;
} else {
@@ -4370,14 +4334,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
if (second_ref_frame <= INTRA_FRAME &&
- single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
- best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
+ single_rd < best_pred_rd[SINGLE_REFERENCE]) {
+ best_pred_rd[SINGLE_REFERENCE] = single_rd;
} else if (second_ref_frame > INTRA_FRAME &&
- single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
- best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
+ single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
+ best_pred_rd[COMPOUND_REFERENCE] = single_rd;
}
- if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
- best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
+ if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
+ best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
}
/* keep record of best filter type */
@@ -4434,12 +4398,14 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->sf.use_uv_intra_rd_estimate) {
// Do Intra UV best rd mode selection if best mode choice above was intra.
if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
- TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
- rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+ TX_SIZE uv_tx_size;
+ *mbmi = best_mbmode;
+ uv_tx_size = get_uv_tx_size(mbmi);
+ rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
&rate_uv_tokenonly[uv_tx_size],
&dist_uv[uv_tx_size],
&skip_uv[uv_tx_size],
- BLOCK_8X8);
+ BLOCK_8X8, uv_tx_size);
}
}
@@ -4456,7 +4422,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
assert((cm->mcomp_filter_type == SWITCHABLE) ||
(cm->mcomp_filter_type == best_mbmode.interp_filter) ||
- (best_mbmode.ref_frame[0] == INTRA_FRAME));
+ !is_inter_block(&best_mbmode));
// Updating rd_thresh_freq_fact[] here means that the different
// partition/block sizes are handled independently based on the best
@@ -4482,7 +4448,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
// macroblock modes
*mbmi = best_mbmode;
x->skip |= best_skip2;
- if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
+ if (!is_inter_block(&best_mbmode)) {
for (i = 0; i < 4; i++)
xd->mi_8x8[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
} else {
@@ -4493,7 +4459,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
mbmi->mv[1].as_int = xd->mi_8x8[0]->bmi[3].as_mv[1].as_int;
}
- for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
+ for (i = 0; i < REFERENCE_MODES; ++i) {
if (best_pred_rd[i] == INT64_MAX)
best_pred_diff[i] = INT_MIN;
else
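
The vp9_rdopt.c hunks above repeatedly collapse hand-rolled per-plane restore loops into the new restore_dst_buf() helper. A minimal sketch of the early-exit pattern the patch converges on inside handle_inter_mode(); the names are taken from the diff above and the surrounding rate-distortion bookkeeping is omitted:

    /* Illustrative sketch, not part of the patch: prune a mode once its
     * modeled rd cost is clearly worse than the best candidate so far. */
    if (rd / 2 > ref_best_rd) {
      restore_dst_buf(xd, orig_dst, orig_dst_stride);  /* undo the tmp_buf redirect */
      return INT64_MAX;                                /* caller discards this mode */
    }

Routing every early return through one helper keeps xd->plane[i].dst from silently staying pointed at the 64x64 scratch buffer when a mode is pruned.
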
diff --git a/source/libvpx/vp9/encoder/vp9_rdopt.h b/source/libvpx/vp9/encoder/vp9_rdopt.h
index 92fb235..f0e8849 100644
--- a/source/libvpx/vp9/encoder/vp9_rdopt.h
+++ b/source/libvpx/vp9/encoder/vp9_rdopt.h
@@ -8,10 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#ifndef VP9_ENCODER_VP9_RDOPT_H_
#define VP9_ENCODER_VP9_RDOPT_H_
+#include "vp9/encoder/vp9_onyx_int.h"
+
#define RDDIV_BITS 7
#define RDCOST(RM, DM, R, D) \
diff --git a/source/libvpx/vp9/encoder/vp9_sad_c.c b/source/libvpx/vp9/encoder/vp9_sad_c.c
index 42ddb21..55d595b 100644
--- a/source/libvpx/vp9/encoder/vp9_sad_c.c
+++ b/source/libvpx/vp9/encoder/vp9_sad_c.c
@@ -10,11 +10,11 @@
#include <stdlib.h>
-#include "vp9/common/vp9_sadmxn.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "./vp9_rtcd.h"
#include "./vpx_config.h"
+#include "vp9/encoder/vp9_sadmxn.h"
+#include "vp9/encoder/vp9_variance.h"
#include "vpx/vpx_integer.h"
-#include "./vp9_rtcd.h"
#define sad_mxn_func(m, n) \
unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \
diff --git a/source/libvpx/vp9/common/vp9_sadmxn.h b/source/libvpx/vp9/encoder/vp9_sadmxn.h
index b2dfd63..b2dfd63 100644
--- a/source/libvpx/vp9/common/vp9_sadmxn.h
+++ b/source/libvpx/vp9/encoder/vp9_sadmxn.h
diff --git a/source/libvpx/vp9/encoder/vp9_segmentation.c b/source/libvpx/vp9/encoder/vp9_segmentation.c
index 24f011f..a9cdc9a 100644
--- a/source/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/source/libvpx/vp9/encoder/vp9_segmentation.c
@@ -149,7 +149,7 @@ static void count_segs(VP9_COMP *cpi, const TileInfo *const tile,
// Store the prediction status for this mb and update counts
// as appropriate
- vp9_set_pred_flag_seg_id(xd, pred_flag);
+ xd->mi_8x8[0]->mbmi.seg_id_predicted = pred_flag;
temporal_predictor_count[pred_context][pred_flag]++;
if (!pred_flag)
diff --git a/source/libvpx/vp9/encoder/vp9_subexp.c b/source/libvpx/vp9/encoder/vp9_subexp.c
index eb864d9..f31e568 100644
--- a/source/libvpx/vp9/encoder/vp9_subexp.c
+++ b/source/libvpx/vp9/encoder/vp9_subexp.c
@@ -14,7 +14,6 @@
#include "vp9/encoder/vp9_boolhuff.h"
#include "vp9/encoder/vp9_treewriter.h"
-#define vp9_cost_upd ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
#define vp9_cost_upd256 ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
static int update_bits[255];
@@ -221,7 +220,7 @@ int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
}
void vp9_cond_prob_diff_update(vp9_writer *w, vp9_prob *oldp,
- unsigned int *ct) {
+ const unsigned int ct[2]) {
const vp9_prob upd = DIFF_UPDATE_PROB;
vp9_prob newp = get_binary_prob(ct[0], ct[1]);
const int savings = vp9_prob_diff_update_savings_search(ct, *oldp, &newp,
diff --git a/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/source/libvpx/vp9/encoder/vp9_temporal_filter.c
index 2cace03..6d4075e 100644
--- a/source/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/source/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -11,22 +11,22 @@
#include <math.h>
#include <limits.h>
+#include "vp9/common/vp9_alloccommon.h"
#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconinter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_alloccommon.h"
-#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_extend.h"
#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_psnr.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_extend.h"
+#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_ratectrl.h"
-#include "vp9/common/vp9_quant_common.h"
#include "vp9/encoder/vp9_segmentation.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
#define ALT_REF_MC_ENABLED 1 // dis/enable MC in AltRef filtering
#define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
@@ -469,7 +469,7 @@ void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
// cases where the filter extends beyond the end of clip.
// Note: this_frame->frame has been updated in the loop
// so it now points at the ARF frame.
- half_gf_int = cpi->baseline_gf_interval >> 1;
+ half_gf_int = cpi->rc.baseline_gf_interval >> 1;
frames_after_arf = (int)(cpi->twopass.total_stats.count - this_frame - 1);
switch (cpi->oxcf.arnr_type) {
@@ -507,7 +507,7 @@ void configure_arnr_filter(VP9_COMP *cpi, const unsigned int this_frame,
cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd;
// Adjust the strength based on active max q
- q = ((int)vp9_convert_qindex_to_q(cpi->active_worst_quality) >> 1);
+ q = ((int)vp9_convert_qindex_to_q(cpi->rc.active_worst_quality) >> 1);
if (q > 8) {
cpi->active_arnr_strength = cpi->oxcf.arnr_strength;
} else {
diff --git a/source/libvpx/vp9/encoder/vp9_tokenize.c b/source/libvpx/vp9/encoder/vp9_tokenize.c
index 550263a..970a27a 100644
--- a/source/libvpx/vp9/encoder/vp9_tokenize.c
+++ b/source/libvpx/vp9/encoder/vp9_tokenize.c
@@ -21,19 +21,91 @@
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_entropy.h"
-/* Global event counters used for accumulating statistics across several
- compressions, then generating vp9_context.c = initial stats. */
-
-#ifdef ENTROPY_STATS
-vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-extern vp9_coeff_stats tree_update_hist[TX_SIZES][BLOCK_TYPES];
-#endif /* ENTROPY_STATS */
-
static TOKENVALUE dct_value_tokens[DCT_MAX_VALUE * 2];
const TOKENVALUE *vp9_dct_value_tokens_ptr;
static int dct_value_cost[DCT_MAX_VALUE * 2];
const int *vp9_dct_value_cost_ptr;
+// Array indices are identical to previously-existing CONTEXT_NODE indices
+const vp9_tree_index vp9_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+ -EOB_TOKEN, 2, // 0 = EOB
+ -ZERO_TOKEN, 4, // 1 = ZERO
+ -ONE_TOKEN, 6, // 2 = ONE
+ 8, 12, // 3 = LOW_VAL
+ -TWO_TOKEN, 10, // 4 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 5 = THREE
+ 14, 16, // 6 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 7 = CAT_ONE
+ 18, 20, // 8 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 9 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE
+};
+
+// Unconstrained Node Tree
+const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+ 2, 6, // 0 = LOW_VAL
+ -TWO_TOKEN, 4, // 1 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE
+ 8, 10, // 3 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE
+ 12, 14, // 5 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
+};
+
+static const vp9_prob Pcat1[] = { 159};
+static const vp9_prob Pcat2[] = { 165, 145};
+static const vp9_prob Pcat3[] = { 173, 148, 140};
+static const vp9_prob Pcat4[] = { 176, 155, 140, 135};
+static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130};
+static const vp9_prob Pcat6[] = {
+ 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129
+};
+
+static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
+
+static void init_bit_tree(vp9_tree_index *p, int n) {
+ int i = 0;
+
+ while (++i < n) {
+ p[0] = p[1] = i << 1;
+ p += 2;
+ }
+
+ p[0] = p[1] = 0;
+}
+
+static void init_bit_trees() {
+ init_bit_tree(cat1, 1);
+ init_bit_tree(cat2, 2);
+ init_bit_tree(cat3, 3);
+ init_bit_tree(cat4, 4);
+ init_bit_tree(cat5, 5);
+ init_bit_tree(cat6, 14);
+}
+
+const vp9_extra_bit vp9_extra_bits[ENTROPY_TOKENS] = {
+ {0, 0, 0, 0}, // ZERO_TOKEN
+ {0, 0, 0, 1}, // ONE_TOKEN
+ {0, 0, 0, 2}, // TWO_TOKEN
+ {0, 0, 0, 3}, // THREE_TOKEN
+ {0, 0, 0, 4}, // FOUR_TOKEN
+ {cat1, Pcat1, 1, 5}, // CATEGORY1_TOKEN
+ {cat2, Pcat2, 2, 7}, // CATEGORY2_TOKEN
+ {cat3, Pcat3, 3, 11}, // CATEGORY3_TOKEN
+ {cat4, Pcat4, 4, 19}, // CATEGORY4_TOKEN
+ {cat5, Pcat5, 5, 35}, // CATEGORY5_TOKEN
+ {cat6, Pcat6, 14, 67}, // CATEGORY6_TOKEN
+ {0, 0, 0, 0} // EOB_TOKEN
+};
+
+struct vp9_token vp9_coef_encodings[ENTROPY_TOKENS];
+
+void vp9_coef_tree_initialize() {
+ init_bit_trees();
+ vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
+}
+
static void fill_value_tokens() {
TOKENVALUE *const t = dct_value_tokens + DCT_MAX_VALUE;
const vp9_extra_bit *const e = vp9_extra_bits;
@@ -65,7 +137,7 @@ static void fill_value_tokens() {
// initialize the cost for extra bits for all possible coefficient value.
{
int cost = 0;
- const vp9_extra_bit *p = vp9_extra_bits + t[i].token;
+ const vp9_extra_bit *p = &vp9_extra_bits[t[i].token];
if (p->base_val) {
const int extra = t[i].extra;
@@ -81,7 +153,7 @@ static void fill_value_tokens() {
} while (++i < DCT_MAX_VALUE);
vp9_dct_value_tokens_ptr = dct_value_tokens + DCT_MAX_VALUE;
- vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
+ vp9_dct_value_cost_ptr = dct_value_cost + DCT_MAX_VALUE;
}
struct tokenize_b_args {
@@ -89,16 +161,18 @@ struct tokenize_b_args {
MACROBLOCKD *xd;
TOKENEXTRA **tp;
TX_SIZE tx_size;
+ uint8_t *token_cache;
};
static void set_entropy_context_b(int plane, int block, BLOCK_SIZE plane_bsize,
TX_SIZE tx_size, void *arg) {
struct tokenize_b_args* const args = arg;
MACROBLOCKD *const xd = args->xd;
+ struct macroblock_plane *p = &args->cpi->mb.plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
- set_contexts(xd, pd, plane_bsize, tx_size, pd->eobs[block] > 0, aoff, loff);
+ set_contexts(xd, pd, plane_bsize, tx_size, p->eobs[block] > 0, aoff, loff);
}
static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
@@ -107,23 +181,25 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
VP9_COMP *cpi = args->cpi;
MACROBLOCKD *xd = args->xd;
TOKENEXTRA **tp = args->tp;
+ uint8_t *token_cache = args->token_cache;
+ struct macroblock_plane *p = &cpi->mb.plane[plane];
struct macroblockd_plane *pd = &xd->plane[plane];
MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
int pt; /* near block/prev token context index */
int c = 0, rc = 0;
TOKENEXTRA *t = *tp; /* store tokens starting here */
- const int eob = pd->eobs[block];
+ const int eob = p->eobs[block];
const PLANE_TYPE type = pd->plane_type;
- const int16_t *qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
-
+ const int16_t *qcoeff_ptr = BLOCK_OFFSET(p->qcoeff, block);
const int segment_id = mbmi->segment_id;
const int16_t *scan, *nb;
+ const scan_order *so;
vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
const int ref = is_inter_block(mbmi);
- uint8_t token_cache[1024];
const uint8_t *const band_translate = get_band_translate(tx_size);
const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+
int aoff, loff;
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);
@@ -131,10 +207,13 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
pt = get_entropy_context(tx_size, pd->above_context + aoff,
pd->left_context + loff);
- get_scan(xd, tx_size, type, block, &scan, &nb);
+ so = get_scan(xd, tx_size, type, block);
+ scan = so->scan;
+ nb = so->neighbors;
+
c = 0;
do {
- const int band = get_coef_band(band_translate, c);
+ const int band = band_translate[c];
int token;
int v = 0;
rc = scan[c];
@@ -147,7 +226,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
t->extra = vp9_dct_value_tokens_ptr[v].extra;
token = vp9_dct_value_tokens_ptr[v].token;
} else {
- token = DCT_EOB_TOKEN;
+ token = EOB_TOKEN;
}
t->token = token;
@@ -170,7 +249,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
}
struct is_skippable_args {
- MACROBLOCKD *xd;
+ MACROBLOCK *x;
int *skippable;
};
@@ -178,21 +257,21 @@ static void is_skippable(int plane, int block,
BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
void *argv) {
struct is_skippable_args *args = argv;
- args->skippable[0] &= (!args->xd->plane[plane].eobs[block]);
+ args->skippable[0] &= (!args->x->plane[plane].eobs[block]);
}
-int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
+static int sb_is_skippable(MACROBLOCK *x, BLOCK_SIZE bsize) {
int result = 1;
- struct is_skippable_args args = {xd, &result};
- foreach_transformed_block(xd, bsize, is_skippable, &args);
+ struct is_skippable_args args = {x, &result};
+ foreach_transformed_block(&x->e_mbd, bsize, is_skippable, &args);
return result;
}
-int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
- int plane) {
+int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
int result = 1;
- struct is_skippable_args args = {xd, &result};
- foreach_transformed_block_in_plane(xd, bsize, plane, is_skippable, &args);
+ struct is_skippable_args args = {x, &result};
+ foreach_transformed_block_in_plane(&x->e_mbd, bsize, plane, is_skippable,
+ &args);
return result;
}
@@ -202,15 +281,15 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
MACROBLOCKD *const xd = &cpi->mb.e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
TOKENEXTRA *t_backup = *t;
- const int mb_skip_context = vp9_get_pred_context_mbskip(xd);
+ const int ctx = vp9_get_skip_context(xd);
const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
- struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size};
+ struct tokenize_b_args arg = {cpi, xd, t, mbmi->tx_size, cpi->mb.token_cache};
- mbmi->skip_coeff = vp9_sb_is_skippable(xd, bsize);
+ mbmi->skip_coeff = sb_is_skippable(&cpi->mb, bsize);
if (mbmi->skip_coeff) {
if (!dry_run)
- cm->counts.mbskip[mb_skip_context][1] += skip_inc;
+ cm->counts.mbskip[ctx][1] += skip_inc;
reset_skip_context(xd, bsize);
if (dry_run)
*t = t_backup;
@@ -218,7 +297,7 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
}
if (!dry_run) {
- cm->counts.mbskip[mb_skip_context][0] += skip_inc;
+ cm->counts.mbskip[ctx][0] += skip_inc;
foreach_transformed_block(xd, bsize, tokenize_b, &arg);
} else {
foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
@@ -226,149 +305,6 @@ void vp9_tokenize_sb(VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
}
}
-#ifdef ENTROPY_STATS
-void init_context_counters(void) {
- FILE *f = fopen("context.bin", "rb");
- if (!f) {
- vp9_zero(context_counters);
- } else {
- fread(context_counters, sizeof(context_counters), 1, f);
- fclose(f);
- }
-
- f = fopen("treeupdate.bin", "rb");
- if (!f) {
- vpx_memset(tree_update_hist, 0, sizeof(tree_update_hist));
- } else {
- fread(tree_update_hist, sizeof(tree_update_hist), 1, f);
- fclose(f);
- }
-}
-
-static void print_counter(FILE *f, vp9_coeff_accum *context_counters,
- int block_types, const char *header) {
- int type, ref, band, pt, t;
-
- fprintf(f, "static const vp9_coeff_count %s = {\n", header);
-
-#define Comma(X) (X ? "," : "")
- type = 0;
- do {
- ref = 0;
- fprintf(f, "%s\n { /* block Type %d */", Comma(type), type);
- do {
- fprintf(f, "%s\n { /* %s */", Comma(type), ref ? "Inter" : "Intra");
- band = 0;
- do {
- fprintf(f, "%s\n { /* Coeff Band %d */", Comma(band), band);
- pt = 0;
- do {
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- const int64_t x = context_counters[type][ref][band][pt][t];
- const int y = (int) x;
-
- assert(x == (int64_t) y); /* no overflow handling yet */
- fprintf(f, "%s %d", Comma(t), y);
- } while (++t < 1 + MAX_ENTROPY_TOKENS);
- fprintf(f, "}");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++ref < REF_TYPES);
- fprintf(f, "\n }");
- } while (++type < block_types);
- fprintf(f, "\n};\n");
-}
-
-static void print_probs(FILE *f, vp9_coeff_accum *context_counters,
- int block_types, const char *header) {
- int type, ref, band, pt, t;
-
- fprintf(f, "static const vp9_coeff_probs %s = {", header);
-
- type = 0;
-#define Newline(x, spaces) (x ? " " : "\n" spaces)
- do {
- fprintf(f, "%s%s{ /* block Type %d */",
- Comma(type), Newline(type, " "), type);
- ref = 0;
- do {
- fprintf(f, "%s%s{ /* %s */",
- Comma(band), Newline(band, " "), ref ? "Inter" : "Intra");
- band = 0;
- do {
- fprintf(f, "%s%s{ /* Coeff Band %d */",
- Comma(band), Newline(band, " "), band);
- pt = 0;
- do {
- unsigned int branch_ct[ENTROPY_NODES][2];
- unsigned int coef_counts[MAX_ENTROPY_TOKENS + 1];
- vp9_prob coef_probs[ENTROPY_NODES];
-
- if (pt >= 3 && band == 0)
- break;
- for (t = 0; t < MAX_ENTROPY_TOKENS + 1; ++t)
- coef_counts[t] = context_counters[type][ref][band][pt][t];
- vp9_tree_probs_from_distribution(vp9_coef_tree, coef_probs,
- branch_ct, coef_counts, 0);
- branch_ct[0][1] = coef_counts[MAX_ENTROPY_TOKENS] - branch_ct[0][0];
- coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]);
- fprintf(f, "%s\n {", Comma(pt));
-
- t = 0;
- do {
- fprintf(f, "%s %3d", Comma(t), coef_probs[t]);
- } while (++t < ENTROPY_NODES);
-
- fprintf(f, " }");
- } while (++pt < PREV_COEF_CONTEXTS);
- fprintf(f, "\n }");
- } while (++band < COEF_BANDS);
- fprintf(f, "\n }");
- } while (++ref < REF_TYPES);
- fprintf(f, "\n }");
- } while (++type < block_types);
- fprintf(f, "\n};\n");
-}
-
-void print_context_counters() {
- FILE *f = fopen("vp9_context.c", "w");
-
- fprintf(f, "#include \"vp9_entropy.h\"\n");
- fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
-
- /* print counts */
- print_counter(f, context_counters[TX_4X4], BLOCK_TYPES,
- "vp9_default_coef_counts_4x4[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_8X8], BLOCK_TYPES,
- "vp9_default_coef_counts_8x8[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_16X16], BLOCK_TYPES,
- "vp9_default_coef_counts_16x16[BLOCK_TYPES]");
- print_counter(f, context_counters[TX_32X32], BLOCK_TYPES,
- "vp9_default_coef_counts_32x32[BLOCK_TYPES]");
-
- /* print coefficient probabilities */
- print_probs(f, context_counters[TX_4X4], BLOCK_TYPES,
- "default_coef_probs_4x4[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_8X8], BLOCK_TYPES,
- "default_coef_probs_8x8[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_16X16], BLOCK_TYPES,
- "default_coef_probs_16x16[BLOCK_TYPES]");
- print_probs(f, context_counters[TX_32X32], BLOCK_TYPES,
- "default_coef_probs_32x32[BLOCK_TYPES]");
-
- fclose(f);
-
- f = fopen("context.bin", "wb");
- fwrite(context_counters, sizeof(context_counters), 1, f);
- fclose(f);
-}
-#endif
-
void vp9_tokenize_initialize() {
fill_value_tokens();
}
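
The vp9_extra_bits[] table added above pairs each coefficient token with a base value and a count of literal extra bits (reading each entry as {tree, probs, len, base_val}), so a quantized coefficient magnitude is coded as a token plus a remainder, with the sign written separately. A few worked values derived from the table, for illustration only:

    magnitude  2 -> TWO_TOKEN        base  2, 0 extra bits
    magnitude  6 -> CATEGORY1_TOKEN  base  5, 1 extra bit,  remainder 1
    magnitude  9 -> CATEGORY2_TOKEN  base  7, 2 extra bits, remainder 2
    magnitude 20 -> CATEGORY4_TOKEN  base 19, 4 extra bits, remainder 1
    magnitude 70 -> CATEGORY6_TOKEN  base 67, 14 extra bits, remainder 3
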
diff --git a/source/libvpx/vp9/encoder/vp9_tokenize.h b/source/libvpx/vp9/encoder/vp9_tokenize.h
index b78e100..67e6c9d 100644
--- a/source/libvpx/vp9/encoder/vp9_tokenize.h
+++ b/source/libvpx/vp9/encoder/vp9_tokenize.h
@@ -12,10 +12,14 @@
#define VP9_ENCODER_VP9_TOKENIZE_H_
#include "vp9/common/vp9_entropy.h"
+
#include "vp9/encoder/vp9_block.h"
+#include "vp9/encoder/vp9_treewriter.h"
void vp9_tokenize_initialize();
+#define EOSB_TOKEN 127 // Not signalled, encoder only
+
typedef struct {
int16_t token;
int16_t extra;
@@ -28,24 +32,17 @@ typedef struct {
uint8_t skip_eob_node;
} TOKENEXTRA;
-typedef int64_t vp9_coeff_accum[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS]
- [MAX_ENTROPY_TOKENS + 1];
+extern const vp9_tree_index vp9_coef_tree[];
+extern const vp9_tree_index vp9_coef_con_tree[];
+extern struct vp9_token vp9_coef_encodings[];
+
+int vp9_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
-int vp9_sb_is_skippable(MACROBLOCKD *xd, BLOCK_SIZE bsize);
-int vp9_is_skippable_in_plane(MACROBLOCKD *xd, BLOCK_SIZE bsize,
- int plane);
struct VP9_COMP;
void vp9_tokenize_sb(struct VP9_COMP *cpi, TOKENEXTRA **t, int dry_run,
BLOCK_SIZE bsize);
-#ifdef ENTROPY_STATS
-void init_context_counters();
-void print_context_counters();
-
-extern vp9_coeff_accum context_counters[TX_SIZES][BLOCK_TYPES];
-#endif
-
extern const int *vp9_dct_value_cost_ptr;
/* TODO: The Token field should be broken out into a separate char array to
* improve cache locality, since it's needed for costing when the rest of the
diff --git a/source/libvpx/vp9/encoder/vp9_treewriter.c b/source/libvpx/vp9/encoder/vp9_treewriter.c
index e4aed53..35e5a8f 100644
--- a/source/libvpx/vp9/encoder/vp9_treewriter.c
+++ b/source/libvpx/vp9/encoder/vp9_treewriter.c
@@ -36,3 +36,50 @@ void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree) {
costs[-tree[0]] = vp9_cost_bit(probs[0], 0);
cost(costs, tree, probs, 2, 0);
}
+
+static void tree2tok(struct vp9_token *tokens, const vp9_tree_index *tree,
+ int i, int v, int l) {
+ v += v;
+ ++l;
+
+ do {
+ const vp9_tree_index j = tree[i++];
+ if (j <= 0) {
+ tokens[-j].value = v;
+ tokens[-j].len = l;
+ } else {
+ tree2tok(tokens, tree, j, v, l);
+ }
+ } while (++v & 1);
+}
+
+void vp9_tokens_from_tree(struct vp9_token *tokens,
+ const vp9_tree_index *tree) {
+ tree2tok(tokens, tree, 0, 0, 0);
+}
+
+static unsigned int convert_distribution(unsigned int i, vp9_tree tree,
+ unsigned int branch_ct[][2],
+ const unsigned int num_events[]) {
+ unsigned int left, right;
+
+ if (tree[i] <= 0)
+ left = num_events[-tree[i]];
+ else
+ left = convert_distribution(tree[i], tree, branch_ct, num_events);
+
+ if (tree[i + 1] <= 0)
+ right = num_events[-tree[i + 1]];
+ else
+ right = convert_distribution(tree[i + 1], tree, branch_ct, num_events);
+
+ branch_ct[i >> 1][0] = left;
+ branch_ct[i >> 1][1] = right;
+ return left + right;
+}
+
+void vp9_tree_probs_from_distribution(vp9_tree tree,
+ unsigned int branch_ct[/* n-1 */][2],
+ const unsigned int num_events[/* n */]) {
+ convert_distribution(0, tree, branch_ct, num_events);
+}
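
vp9_tokens_from_tree() and tree2tok() above walk a vp9_tree_index array in which a non-positive entry -t is a leaf for token t and a positive entry is the index of the next node pair. A small sketch with a hypothetical three-symbol alphabet (A_TOKEN, B_TOKEN and C_TOKEN are made-up names, not codec tokens):

    /* Hypothetical tree with A_TOKEN = 0, B_TOKEN = 1, C_TOKEN = 2:
     * node 0 splits A vs. the rest, the node at index 2 splits B vs. C. */
    static const vp9_tree_index abc_tree[4] = { -A_TOKEN, 2, -B_TOKEN, -C_TOKEN };
    struct vp9_token abc_enc[3];
    vp9_tokens_from_tree(abc_enc, abc_tree);
    /* Resulting codes: A -> value 0, len 1 ("0")
     *                  B -> value 2, len 2 ("10")
     *                  C -> value 3, len 2 ("11") */
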
diff --git a/source/libvpx/vp9/encoder/vp9_treewriter.h b/source/libvpx/vp9/encoder/vp9_treewriter.h
index eeda5cd..703272c 100644
--- a/source/libvpx/vp9/encoder/vp9_treewriter.h
+++ b/source/libvpx/vp9/encoder/vp9_treewriter.h
@@ -8,80 +8,66 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#ifndef VP9_ENCODER_VP9_TREEWRITER_H_
#define VP9_ENCODER_VP9_TREEWRITER_H_
-/* Trees map alphabets into huffman-like codes suitable for an arithmetic
- bit coder. Timothy S Murphy 11 October 2004 */
-
#include "vp9/common/vp9_treecoder.h"
-
#include "vp9/encoder/vp9_boolhuff.h" /* for now */
+#define vp9_cost_zero(prob) (vp9_prob_cost[prob])
-#define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8)
-
-/* Approximate length of an encoded bool in 256ths of a bit at given prob */
-
-#define vp9_cost_zero(x) (vp9_prob_cost[x])
-#define vp9_cost_one(x) vp9_cost_zero(vp9_complement(x))
+#define vp9_cost_one(prob) vp9_cost_zero(vp9_complement(prob))
-#define vp9_cost_bit(x, b) vp9_cost_zero((b) ? vp9_complement(x) : (x))
+#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? vp9_complement(prob) \
+ : (prob))
-/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
-
-
-/* Both of these return bits, not scaled bits. */
static INLINE unsigned int cost_branch256(const unsigned int ct[2],
vp9_prob p) {
return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
}
-static INLINE unsigned int cost_branch(const unsigned int ct[2],
- vp9_prob p) {
- return cost_branch256(ct, p) >> 8;
-}
-
-
-static INLINE void treed_write(vp9_writer *w,
- vp9_tree tree, const vp9_prob *probs,
- int bits, int len) {
+static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
+ int bits, int len) {
+ int cost = 0;
vp9_tree_index i = 0;
do {
const int bit = (bits >> --len) & 1;
- vp9_write(w, bit, probs[i >> 1]);
+ cost += vp9_cost_bit(probs[i >> 1], bit);
i = tree[i + bit];
} while (len);
-}
-static INLINE void write_token(vp9_writer *w, vp9_tree tree,
- const vp9_prob *probs,
- const struct vp9_token *token) {
- treed_write(w, tree, probs, token->value, token->len);
+ return cost;
}
-static INLINE int treed_cost(vp9_tree tree, const vp9_prob *probs,
- int bits, int len) {
- int cost = 0;
- vp9_tree_index i = 0;
+void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
+void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
+
+void vp9_tree_probs_from_distribution(vp9_tree tree,
+ unsigned int branch_ct[ /* n - 1 */ ][2],
+ const unsigned int num_events[ /* n */ ]);
+
+struct vp9_token {
+ int value;
+ int len;
+};
+void vp9_tokens_from_tree(struct vp9_token*, const vp9_tree_index *);
+
+static INLINE void vp9_write_tree(vp9_writer *w, const vp9_tree_index *tree,
+ const vp9_prob *probs, int bits, int len,
+ vp9_tree_index i) {
do {
const int bit = (bits >> --len) & 1;
- cost += vp9_cost_bit(probs[i >> 1], bit);
+ vp9_write(w, bit, probs[i >> 1]);
i = tree[i + bit];
} while (len);
-
- return cost;
}
-static INLINE int cost_token(vp9_tree tree, const vp9_prob *probs,
- const struct vp9_token *token) {
- return treed_cost(tree, probs, token->value, token->len);
+static INLINE void vp9_write_token(vp9_writer *w, const vp9_tree_index *tree,
+ const vp9_prob *probs,
+ const struct vp9_token *token) {
+ vp9_write_tree(w, tree, probs, token->value, token->len, 0);
}
-void vp9_cost_tokens(int *costs, const vp9_prob *probs, vp9_tree tree);
-void vp9_cost_tokens_skip(int *costs, const vp9_prob *probs, vp9_tree tree);
-
#endif // VP9_ENCODER_VP9_TREEWRITER_H_
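
The cost macros kept in vp9_treewriter.h measure code length in 1/256th-bit units (the comment removed above spells this out), so vp9_cost_zero(128) is roughly 256, i.e. one bit when a zero and a one are equally likely. A worked example for cost_branch256(); the probability and the rounded table values are illustrative:

    /* p = 192 means a zero is expected with probability 192/256 = 0.75. */
    const unsigned int ct[2] = { 3, 1 };   /* 3 zeros and 1 one observed */
    /* cost_branch256(ct, 192)
     *   = 3 * vp9_cost_zero(192) + 1 * vp9_cost_one(192)
     *  ~= 3 * 106 + 512 = 830, i.e. about 3.2 bits for the four symbols. */
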
diff --git a/source/libvpx/vp9/encoder/vp9_vaq.c b/source/libvpx/vp9/encoder/vp9_vaq.c
index 3179ae3..1f9cb87 100644
--- a/source/libvpx/vp9/encoder/vp9_vaq.c
+++ b/source/libvpx/vp9/encoder/vp9_vaq.c
@@ -118,8 +118,8 @@ static unsigned int block_variance(VP9_COMP *cpi, MACROBLOCK *x,
((-xd->mb_to_bottom_edge) >> 3) : 0;
if (right_overflow || bottom_overflow) {
- int bw = (1 << (mi_width_log2(bs) + 3)) - right_overflow;
- int bh = (1 << (mi_height_log2(bs) + 3)) - bottom_overflow;
+ const int bw = 8 * num_8x8_blocks_wide_lookup[bs] - right_overflow;
+ const int bh = 8 * num_8x8_blocks_high_lookup[bs] - bottom_overflow;
int avg;
variance(x->plane[0].src.buf, x->plane[0].src.stride,
vp9_64_zeros, 0, bw, bh, &sse, &avg);
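
The vp9_vaq.c change above swaps a log2-and-shift expression for a table lookup; the two forms are numerically identical, for example for a 32-pixel-wide block:

    8 * num_8x8_blocks_wide_lookup[BLOCK_32X16] = 8 * 4  = 32
    1 << (mi_width_log2(BLOCK_32X16) + 3)       = 1 << 5 = 32

The lookup simply states the block width directly instead of reconstructing it from its log2.
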
diff --git a/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c b/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
new file mode 100644
index 0000000..9ea22fe
--- /dev/null
+++ b/source/libvpx/vp9/encoder/x86/vp9_dct32x32_avx2.c
@@ -0,0 +1,2710 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vpx_ports/mem.h"
+
+#define pair256_set_epi16(a, b) \
+ _mm256_set_epi16(b, a, b, a, b, a, b, a, b, a, b, a, b, a, b, a)
+
+#define pair256_set_epi32(a, b) \
+ _mm256_set_epi32(b, a, b, a, b, a, b, a)
+
+
+
+
+#if FDCT32x32_HIGH_PRECISION
+static INLINE __m256i k_madd_epi32_avx2(__m256i a, __m256i b) {
+ __m256i buf0, buf1;
+ buf0 = _mm256_mul_epu32(a, b);
+ a = _mm256_srli_epi64(a, 32);
+ b = _mm256_srli_epi64(b, 32);
+ buf1 = _mm256_mul_epu32(a, b);
+ return _mm256_add_epi64(buf0, buf1);
+}
+
+static INLINE __m256i k_packs_epi64_avx2(__m256i a, __m256i b) {
+ __m256i buf0 = _mm256_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m256i buf1 = _mm256_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm256_unpacklo_epi64(buf0, buf1);
+}
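+// k_madd_epi32_avx2() emulates a 32-bit multiply-accumulate: _mm256_mul_epu32
+// multiplies the even 32-bit lanes, the 32-bit right shifts move the odd lanes
+// into even position for a second multiply, and the two 64-bit products are
+// summed, giving a[2i]*b[2i] + a[2i+1]*b[2i+1] in each 64-bit lane.
+// k_packs_epi64_avx2() keeps the low 32 bits of each 64-bit lane and packs the
+// results from both inputs back into one vector of 32-bit values.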
+#endif
+
+void FDCT32x32_2D_AVX2(const int16_t *input,
+ int16_t *output_org, int stride) {
+ // Calculate pre-multiplied strides
+ const int str1 = stride;
+ const int str2 = 2 * stride;
+ const int str3 = 2 * stride + str1;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED(32, int16_t, intermediate[32 * 32]);
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+  // it's a pair of them that we need to repeat eight times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(+cospi_16_64);
+ const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64);
+ const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m256i k__cospi_m24_m08 = pair256_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m256i k__cospi_p24_p08 = pair256_set_epi16(+cospi_24_64, cospi_8_64);
+ const __m256i k__cospi_p12_p20 = pair256_set_epi16(+cospi_12_64, cospi_20_64);
+ const __m256i k__cospi_m20_p12 = pair256_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m256i k__cospi_m04_p28 = pair256_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m256i k__cospi_p28_p04 = pair256_set_epi16(+cospi_28_64, cospi_4_64);
+ const __m256i k__cospi_m28_m04 = pair256_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m256i k__cospi_m12_m20 = pair256_set_epi16(-cospi_12_64, -cospi_20_64);
+ const __m256i k__cospi_p30_p02 = pair256_set_epi16(+cospi_30_64, cospi_2_64);
+ const __m256i k__cospi_p14_p18 = pair256_set_epi16(+cospi_14_64, cospi_18_64);
+ const __m256i k__cospi_p22_p10 = pair256_set_epi16(+cospi_22_64, cospi_10_64);
+ const __m256i k__cospi_p06_p26 = pair256_set_epi16(+cospi_6_64, cospi_26_64);
+ const __m256i k__cospi_m26_p06 = pair256_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m256i k__cospi_m10_p22 = pair256_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m256i k__cospi_m18_p14 = pair256_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m256i k__cospi_m02_p30 = pair256_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m256i k__cospi_p31_p01 = pair256_set_epi16(+cospi_31_64, cospi_1_64);
+ const __m256i k__cospi_p15_p17 = pair256_set_epi16(+cospi_15_64, cospi_17_64);
+ const __m256i k__cospi_p23_p09 = pair256_set_epi16(+cospi_23_64, cospi_9_64);
+ const __m256i k__cospi_p07_p25 = pair256_set_epi16(+cospi_7_64, cospi_25_64);
+ const __m256i k__cospi_m25_p07 = pair256_set_epi16(-cospi_25_64, cospi_7_64);
+ const __m256i k__cospi_m09_p23 = pair256_set_epi16(-cospi_9_64, cospi_23_64);
+ const __m256i k__cospi_m17_p15 = pair256_set_epi16(-cospi_17_64, cospi_15_64);
+ const __m256i k__cospi_m01_p31 = pair256_set_epi16(-cospi_1_64, cospi_31_64);
+ const __m256i k__cospi_p27_p05 = pair256_set_epi16(+cospi_27_64, cospi_5_64);
+ const __m256i k__cospi_p11_p21 = pair256_set_epi16(+cospi_11_64, cospi_21_64);
+ const __m256i k__cospi_p19_p13 = pair256_set_epi16(+cospi_19_64, cospi_13_64);
+ const __m256i k__cospi_p03_p29 = pair256_set_epi16(+cospi_3_64, cospi_29_64);
+ const __m256i k__cospi_m29_p03 = pair256_set_epi16(-cospi_29_64, cospi_3_64);
+ const __m256i k__cospi_m13_p19 = pair256_set_epi16(-cospi_13_64, cospi_19_64);
+ const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
+ const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
+ const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
+ const __m256i kZero = _mm256_set1_epi16(0);
+ const __m256i kOne = _mm256_set1_epi16(1);
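+  // Throughout the stages below, pairs of 16-bit terms are interleaved with
+  // _mm256_unpack{lo,hi}_epi16, multiplied against one of the pair256 cosine
+  // constants with _mm256_madd_epi16 (yielding x*cos_a + y*cos_b in each
+  // 32-bit lane), rounded with k__DCT_CONST_ROUNDING, shifted right by
+  // DCT_CONST_BITS and re-packed to 16 bits with _mm256_packs_epi32; this
+  // mirrors the dct_const_round_shift() rounding used by the C reference.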
+ // Do the two transform/transpose passes
+ int pass;
+ for (pass = 0; pass < 2; ++pass) {
+ // We process sixteen columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 32; column_start += 16) {
+ __m256i step1[32];
+ __m256i step2[32];
+ __m256i step3[32];
+ __m256i out[32];
+ // Stage 1
+ // Note: even though all the loads below are aligned, using the aligned
+      // intrinsic makes the code slightly slower.
+ if (0 == pass) {
+ const int16_t *in = &input[column_start];
+ // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ const int16_t *ina = in + 0 * str1;
+ const int16_t *inb = in + 31 * str1;
+ __m256i *step1a = &step1[ 0];
+ __m256i *step1b = &step1[31];
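+        // step1a/step1b address the two mirrored halves of step1: step1a[0..3]
+        // fill step1[0..3] while step1b[-3..-0] fill step1[28..31], so each
+        // pair of loads produces one sum and one difference term, both scaled
+        // by 4 (<< 2).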
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 4 * str1;
+ const int16_t *inb = in + 27 * str1;
+ __m256i *step1a = &step1[ 4];
+ __m256i *step1b = &step1[27];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 8 * str1;
+ const int16_t *inb = in + 23 * str1;
+ __m256i *step1a = &step1[ 8];
+ __m256i *step1b = &step1[23];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ {
+ const int16_t *ina = in + 12 * str1;
+ const int16_t *inb = in + 19 * str1;
+ __m256i *step1a = &step1[12];
+ __m256i *step1b = &step1[19];
+ const __m256i ina0 = _mm256_loadu_si256((const __m256i *)(ina));
+ const __m256i ina1 = _mm256_loadu_si256((const __m256i *)(ina + str1));
+ const __m256i ina2 = _mm256_loadu_si256((const __m256i *)(ina + str2));
+ const __m256i ina3 = _mm256_loadu_si256((const __m256i *)(ina + str3));
+ const __m256i inb3 = _mm256_loadu_si256((const __m256i *)(inb - str3));
+ const __m256i inb2 = _mm256_loadu_si256((const __m256i *)(inb - str2));
+ const __m256i inb1 = _mm256_loadu_si256((const __m256i *)(inb - str1));
+ const __m256i inb0 = _mm256_loadu_si256((const __m256i *)(inb));
+ step1a[ 0] = _mm256_add_epi16(ina0, inb0);
+ step1a[ 1] = _mm256_add_epi16(ina1, inb1);
+ step1a[ 2] = _mm256_add_epi16(ina2, inb2);
+ step1a[ 3] = _mm256_add_epi16(ina3, inb3);
+ step1b[-3] = _mm256_sub_epi16(ina3, inb3);
+ step1b[-2] = _mm256_sub_epi16(ina2, inb2);
+ step1b[-1] = _mm256_sub_epi16(ina1, inb1);
+ step1b[-0] = _mm256_sub_epi16(ina0, inb0);
+ step1a[ 0] = _mm256_slli_epi16(step1a[ 0], 2);
+ step1a[ 1] = _mm256_slli_epi16(step1a[ 1], 2);
+ step1a[ 2] = _mm256_slli_epi16(step1a[ 2], 2);
+ step1a[ 3] = _mm256_slli_epi16(step1a[ 3], 2);
+ step1b[-3] = _mm256_slli_epi16(step1b[-3], 2);
+ step1b[-2] = _mm256_slli_epi16(step1b[-2], 2);
+ step1b[-1] = _mm256_slli_epi16(step1b[-1], 2);
+ step1b[-0] = _mm256_slli_epi16(step1b[-0], 2);
+ }
+ } else {
+ int16_t *in = &intermediate[column_start];
+ // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
+        // Note: using the same approach as above to have a common offset is
+ // counter-productive as all offsets can be calculated at compile
+ // time.
+ // Note: the next four blocks could be in a loop. That would help the
+ // instruction cache but is actually slower.
+ {
+ __m256i in00 = _mm256_loadu_si256((const __m256i *)(in + 0 * 32));
+ __m256i in01 = _mm256_loadu_si256((const __m256i *)(in + 1 * 32));
+ __m256i in02 = _mm256_loadu_si256((const __m256i *)(in + 2 * 32));
+ __m256i in03 = _mm256_loadu_si256((const __m256i *)(in + 3 * 32));
+ __m256i in28 = _mm256_loadu_si256((const __m256i *)(in + 28 * 32));
+ __m256i in29 = _mm256_loadu_si256((const __m256i *)(in + 29 * 32));
+ __m256i in30 = _mm256_loadu_si256((const __m256i *)(in + 30 * 32));
+ __m256i in31 = _mm256_loadu_si256((const __m256i *)(in + 31 * 32));
+ step1[ 0] = _mm256_add_epi16(in00, in31);
+ step1[ 1] = _mm256_add_epi16(in01, in30);
+ step1[ 2] = _mm256_add_epi16(in02, in29);
+ step1[ 3] = _mm256_add_epi16(in03, in28);
+ step1[28] = _mm256_sub_epi16(in03, in28);
+ step1[29] = _mm256_sub_epi16(in02, in29);
+ step1[30] = _mm256_sub_epi16(in01, in30);
+ step1[31] = _mm256_sub_epi16(in00, in31);
+ }
+ {
+ __m256i in04 = _mm256_loadu_si256((const __m256i *)(in + 4 * 32));
+ __m256i in05 = _mm256_loadu_si256((const __m256i *)(in + 5 * 32));
+ __m256i in06 = _mm256_loadu_si256((const __m256i *)(in + 6 * 32));
+ __m256i in07 = _mm256_loadu_si256((const __m256i *)(in + 7 * 32));
+ __m256i in24 = _mm256_loadu_si256((const __m256i *)(in + 24 * 32));
+ __m256i in25 = _mm256_loadu_si256((const __m256i *)(in + 25 * 32));
+ __m256i in26 = _mm256_loadu_si256((const __m256i *)(in + 26 * 32));
+ __m256i in27 = _mm256_loadu_si256((const __m256i *)(in + 27 * 32));
+ step1[ 4] = _mm256_add_epi16(in04, in27);
+ step1[ 5] = _mm256_add_epi16(in05, in26);
+ step1[ 6] = _mm256_add_epi16(in06, in25);
+ step1[ 7] = _mm256_add_epi16(in07, in24);
+ step1[24] = _mm256_sub_epi16(in07, in24);
+ step1[25] = _mm256_sub_epi16(in06, in25);
+ step1[26] = _mm256_sub_epi16(in05, in26);
+ step1[27] = _mm256_sub_epi16(in04, in27);
+ }
+ {
+ __m256i in08 = _mm256_loadu_si256((const __m256i *)(in + 8 * 32));
+ __m256i in09 = _mm256_loadu_si256((const __m256i *)(in + 9 * 32));
+ __m256i in10 = _mm256_loadu_si256((const __m256i *)(in + 10 * 32));
+ __m256i in11 = _mm256_loadu_si256((const __m256i *)(in + 11 * 32));
+ __m256i in20 = _mm256_loadu_si256((const __m256i *)(in + 20 * 32));
+ __m256i in21 = _mm256_loadu_si256((const __m256i *)(in + 21 * 32));
+ __m256i in22 = _mm256_loadu_si256((const __m256i *)(in + 22 * 32));
+ __m256i in23 = _mm256_loadu_si256((const __m256i *)(in + 23 * 32));
+ step1[ 8] = _mm256_add_epi16(in08, in23);
+ step1[ 9] = _mm256_add_epi16(in09, in22);
+ step1[10] = _mm256_add_epi16(in10, in21);
+ step1[11] = _mm256_add_epi16(in11, in20);
+ step1[20] = _mm256_sub_epi16(in11, in20);
+ step1[21] = _mm256_sub_epi16(in10, in21);
+ step1[22] = _mm256_sub_epi16(in09, in22);
+ step1[23] = _mm256_sub_epi16(in08, in23);
+ }
+ {
+ __m256i in12 = _mm256_loadu_si256((const __m256i *)(in + 12 * 32));
+ __m256i in13 = _mm256_loadu_si256((const __m256i *)(in + 13 * 32));
+ __m256i in14 = _mm256_loadu_si256((const __m256i *)(in + 14 * 32));
+ __m256i in15 = _mm256_loadu_si256((const __m256i *)(in + 15 * 32));
+ __m256i in16 = _mm256_loadu_si256((const __m256i *)(in + 16 * 32));
+ __m256i in17 = _mm256_loadu_si256((const __m256i *)(in + 17 * 32));
+ __m256i in18 = _mm256_loadu_si256((const __m256i *)(in + 18 * 32));
+ __m256i in19 = _mm256_loadu_si256((const __m256i *)(in + 19 * 32));
+ step1[12] = _mm256_add_epi16(in12, in19);
+ step1[13] = _mm256_add_epi16(in13, in18);
+ step1[14] = _mm256_add_epi16(in14, in17);
+ step1[15] = _mm256_add_epi16(in15, in16);
+ step1[16] = _mm256_sub_epi16(in15, in16);
+ step1[17] = _mm256_sub_epi16(in14, in17);
+ step1[18] = _mm256_sub_epi16(in13, in18);
+ step1[19] = _mm256_sub_epi16(in12, in19);
+ }
+ }
+ // Stage 2
+ {
+ step2[ 0] = _mm256_add_epi16(step1[0], step1[15]);
+ step2[ 1] = _mm256_add_epi16(step1[1], step1[14]);
+ step2[ 2] = _mm256_add_epi16(step1[2], step1[13]);
+ step2[ 3] = _mm256_add_epi16(step1[3], step1[12]);
+ step2[ 4] = _mm256_add_epi16(step1[4], step1[11]);
+ step2[ 5] = _mm256_add_epi16(step1[5], step1[10]);
+ step2[ 6] = _mm256_add_epi16(step1[6], step1[ 9]);
+ step2[ 7] = _mm256_add_epi16(step1[7], step1[ 8]);
+ step2[ 8] = _mm256_sub_epi16(step1[7], step1[ 8]);
+ step2[ 9] = _mm256_sub_epi16(step1[6], step1[ 9]);
+ step2[10] = _mm256_sub_epi16(step1[5], step1[10]);
+ step2[11] = _mm256_sub_epi16(step1[4], step1[11]);
+ step2[12] = _mm256_sub_epi16(step1[3], step1[12]);
+ step2[13] = _mm256_sub_epi16(step1[2], step1[13]);
+ step2[14] = _mm256_sub_epi16(step1[1], step1[14]);
+ step2[15] = _mm256_sub_epi16(step1[0], step1[15]);
+ }
+ {
+ const __m256i s2_20_0 = _mm256_unpacklo_epi16(step1[27], step1[20]);
+ const __m256i s2_20_1 = _mm256_unpackhi_epi16(step1[27], step1[20]);
+ const __m256i s2_21_0 = _mm256_unpacklo_epi16(step1[26], step1[21]);
+ const __m256i s2_21_1 = _mm256_unpackhi_epi16(step1[26], step1[21]);
+ const __m256i s2_22_0 = _mm256_unpacklo_epi16(step1[25], step1[22]);
+ const __m256i s2_22_1 = _mm256_unpackhi_epi16(step1[25], step1[22]);
+ const __m256i s2_23_0 = _mm256_unpacklo_epi16(step1[24], step1[23]);
+ const __m256i s2_23_1 = _mm256_unpackhi_epi16(step1[24], step1[23]);
+ const __m256i s2_20_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_m16);
+ const __m256i s2_20_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_m16);
+ const __m256i s2_21_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_m16);
+ const __m256i s2_21_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_m16);
+ const __m256i s2_22_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_m16);
+ const __m256i s2_22_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_m16);
+ const __m256i s2_23_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_m16);
+ const __m256i s2_23_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_m16);
+ const __m256i s2_24_2 = _mm256_madd_epi16(s2_23_0, k__cospi_p16_p16);
+ const __m256i s2_24_3 = _mm256_madd_epi16(s2_23_1, k__cospi_p16_p16);
+ const __m256i s2_25_2 = _mm256_madd_epi16(s2_22_0, k__cospi_p16_p16);
+ const __m256i s2_25_3 = _mm256_madd_epi16(s2_22_1, k__cospi_p16_p16);
+ const __m256i s2_26_2 = _mm256_madd_epi16(s2_21_0, k__cospi_p16_p16);
+ const __m256i s2_26_3 = _mm256_madd_epi16(s2_21_1, k__cospi_p16_p16);
+ const __m256i s2_27_2 = _mm256_madd_epi16(s2_20_0, k__cospi_p16_p16);
+ const __m256i s2_27_3 = _mm256_madd_epi16(s2_20_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s2_20_4 = _mm256_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_5 = _mm256_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_4 = _mm256_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_21_5 = _mm256_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_4 = _mm256_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_22_5 = _mm256_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_4 = _mm256_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_23_5 = _mm256_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_4 = _mm256_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_24_5 = _mm256_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_4 = _mm256_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_25_5 = _mm256_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_4 = _mm256_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_26_5 = _mm256_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_4 = _mm256_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_27_5 = _mm256_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_20_6 = _mm256_srai_epi32(s2_20_4, DCT_CONST_BITS);
+ const __m256i s2_20_7 = _mm256_srai_epi32(s2_20_5, DCT_CONST_BITS);
+ const __m256i s2_21_6 = _mm256_srai_epi32(s2_21_4, DCT_CONST_BITS);
+ const __m256i s2_21_7 = _mm256_srai_epi32(s2_21_5, DCT_CONST_BITS);
+ const __m256i s2_22_6 = _mm256_srai_epi32(s2_22_4, DCT_CONST_BITS);
+ const __m256i s2_22_7 = _mm256_srai_epi32(s2_22_5, DCT_CONST_BITS);
+ const __m256i s2_23_6 = _mm256_srai_epi32(s2_23_4, DCT_CONST_BITS);
+ const __m256i s2_23_7 = _mm256_srai_epi32(s2_23_5, DCT_CONST_BITS);
+ const __m256i s2_24_6 = _mm256_srai_epi32(s2_24_4, DCT_CONST_BITS);
+ const __m256i s2_24_7 = _mm256_srai_epi32(s2_24_5, DCT_CONST_BITS);
+ const __m256i s2_25_6 = _mm256_srai_epi32(s2_25_4, DCT_CONST_BITS);
+ const __m256i s2_25_7 = _mm256_srai_epi32(s2_25_5, DCT_CONST_BITS);
+ const __m256i s2_26_6 = _mm256_srai_epi32(s2_26_4, DCT_CONST_BITS);
+ const __m256i s2_26_7 = _mm256_srai_epi32(s2_26_5, DCT_CONST_BITS);
+ const __m256i s2_27_6 = _mm256_srai_epi32(s2_27_4, DCT_CONST_BITS);
+ const __m256i s2_27_7 = _mm256_srai_epi32(s2_27_5, DCT_CONST_BITS);
+ // Combine
+ step2[20] = _mm256_packs_epi32(s2_20_6, s2_20_7);
+ step2[21] = _mm256_packs_epi32(s2_21_6, s2_21_7);
+ step2[22] = _mm256_packs_epi32(s2_22_6, s2_22_7);
+ step2[23] = _mm256_packs_epi32(s2_23_6, s2_23_7);
+ step2[24] = _mm256_packs_epi32(s2_24_6, s2_24_7);
+ step2[25] = _mm256_packs_epi32(s2_25_6, s2_25_7);
+ step2[26] = _mm256_packs_epi32(s2_26_6, s2_26_7);
+ step2[27] = _mm256_packs_epi32(s2_27_6, s2_27_7);
+ }
+
+#if !FDCT32x32_HIGH_PRECISION
+    // Reduce the magnitude of the intermediate values so that they stay
+    // within the range of 16 bits.
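+    // The sequence below rounds each term as (x + 1 + (x < 0)) >> 2:
+    // _mm256_cmpgt_epi16(kZero, x) is all-ones (-1) for negative lanes, so
+    // subtracting that mask adds 1 to negative values, and the kOne add plus
+    // the arithmetic shift right by 2 finish the signed rounding division
+    // by 4.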
+ if (1 == pass) {
+ __m256i s3_00_0 = _mm256_cmpgt_epi16(kZero,step2[ 0]);
+ __m256i s3_01_0 = _mm256_cmpgt_epi16(kZero,step2[ 1]);
+ __m256i s3_02_0 = _mm256_cmpgt_epi16(kZero,step2[ 2]);
+ __m256i s3_03_0 = _mm256_cmpgt_epi16(kZero,step2[ 3]);
+ __m256i s3_04_0 = _mm256_cmpgt_epi16(kZero,step2[ 4]);
+ __m256i s3_05_0 = _mm256_cmpgt_epi16(kZero,step2[ 5]);
+ __m256i s3_06_0 = _mm256_cmpgt_epi16(kZero,step2[ 6]);
+ __m256i s3_07_0 = _mm256_cmpgt_epi16(kZero,step2[ 7]);
+ __m256i s2_08_0 = _mm256_cmpgt_epi16(kZero,step2[ 8]);
+ __m256i s2_09_0 = _mm256_cmpgt_epi16(kZero,step2[ 9]);
+ __m256i s3_10_0 = _mm256_cmpgt_epi16(kZero,step2[10]);
+ __m256i s3_11_0 = _mm256_cmpgt_epi16(kZero,step2[11]);
+ __m256i s3_12_0 = _mm256_cmpgt_epi16(kZero,step2[12]);
+ __m256i s3_13_0 = _mm256_cmpgt_epi16(kZero,step2[13]);
+ __m256i s2_14_0 = _mm256_cmpgt_epi16(kZero,step2[14]);
+ __m256i s2_15_0 = _mm256_cmpgt_epi16(kZero,step2[15]);
+ __m256i s3_16_0 = _mm256_cmpgt_epi16(kZero,step1[16]);
+ __m256i s3_17_0 = _mm256_cmpgt_epi16(kZero,step1[17]);
+ __m256i s3_18_0 = _mm256_cmpgt_epi16(kZero,step1[18]);
+ __m256i s3_19_0 = _mm256_cmpgt_epi16(kZero,step1[19]);
+ __m256i s3_20_0 = _mm256_cmpgt_epi16(kZero,step2[20]);
+ __m256i s3_21_0 = _mm256_cmpgt_epi16(kZero,step2[21]);
+ __m256i s3_22_0 = _mm256_cmpgt_epi16(kZero,step2[22]);
+ __m256i s3_23_0 = _mm256_cmpgt_epi16(kZero,step2[23]);
+ __m256i s3_24_0 = _mm256_cmpgt_epi16(kZero,step2[24]);
+ __m256i s3_25_0 = _mm256_cmpgt_epi16(kZero,step2[25]);
+ __m256i s3_26_0 = _mm256_cmpgt_epi16(kZero,step2[26]);
+ __m256i s3_27_0 = _mm256_cmpgt_epi16(kZero,step2[27]);
+ __m256i s3_28_0 = _mm256_cmpgt_epi16(kZero,step1[28]);
+ __m256i s3_29_0 = _mm256_cmpgt_epi16(kZero,step1[29]);
+ __m256i s3_30_0 = _mm256_cmpgt_epi16(kZero,step1[30]);
+ __m256i s3_31_0 = _mm256_cmpgt_epi16(kZero,step1[31]);
+
+ step2[ 0] = _mm256_sub_epi16(step2[ 0], s3_00_0);
+ step2[ 1] = _mm256_sub_epi16(step2[ 1], s3_01_0);
+ step2[ 2] = _mm256_sub_epi16(step2[ 2], s3_02_0);
+ step2[ 3] = _mm256_sub_epi16(step2[ 3], s3_03_0);
+ step2[ 4] = _mm256_sub_epi16(step2[ 4], s3_04_0);
+ step2[ 5] = _mm256_sub_epi16(step2[ 5], s3_05_0);
+ step2[ 6] = _mm256_sub_epi16(step2[ 6], s3_06_0);
+ step2[ 7] = _mm256_sub_epi16(step2[ 7], s3_07_0);
+ step2[ 8] = _mm256_sub_epi16(step2[ 8], s2_08_0);
+ step2[ 9] = _mm256_sub_epi16(step2[ 9], s2_09_0);
+ step2[10] = _mm256_sub_epi16(step2[10], s3_10_0);
+ step2[11] = _mm256_sub_epi16(step2[11], s3_11_0);
+ step2[12] = _mm256_sub_epi16(step2[12], s3_12_0);
+ step2[13] = _mm256_sub_epi16(step2[13], s3_13_0);
+ step2[14] = _mm256_sub_epi16(step2[14], s2_14_0);
+ step2[15] = _mm256_sub_epi16(step2[15], s2_15_0);
+ step1[16] = _mm256_sub_epi16(step1[16], s3_16_0);
+ step1[17] = _mm256_sub_epi16(step1[17], s3_17_0);
+ step1[18] = _mm256_sub_epi16(step1[18], s3_18_0);
+ step1[19] = _mm256_sub_epi16(step1[19], s3_19_0);
+ step2[20] = _mm256_sub_epi16(step2[20], s3_20_0);
+ step2[21] = _mm256_sub_epi16(step2[21], s3_21_0);
+ step2[22] = _mm256_sub_epi16(step2[22], s3_22_0);
+ step2[23] = _mm256_sub_epi16(step2[23], s3_23_0);
+ step2[24] = _mm256_sub_epi16(step2[24], s3_24_0);
+ step2[25] = _mm256_sub_epi16(step2[25], s3_25_0);
+ step2[26] = _mm256_sub_epi16(step2[26], s3_26_0);
+ step2[27] = _mm256_sub_epi16(step2[27], s3_27_0);
+ step1[28] = _mm256_sub_epi16(step1[28], s3_28_0);
+ step1[29] = _mm256_sub_epi16(step1[29], s3_29_0);
+ step1[30] = _mm256_sub_epi16(step1[30], s3_30_0);
+ step1[31] = _mm256_sub_epi16(step1[31], s3_31_0);
+
+ step2[ 0] = _mm256_add_epi16(step2[ 0], kOne);
+ step2[ 1] = _mm256_add_epi16(step2[ 1], kOne);
+ step2[ 2] = _mm256_add_epi16(step2[ 2], kOne);
+ step2[ 3] = _mm256_add_epi16(step2[ 3], kOne);
+ step2[ 4] = _mm256_add_epi16(step2[ 4], kOne);
+ step2[ 5] = _mm256_add_epi16(step2[ 5], kOne);
+ step2[ 6] = _mm256_add_epi16(step2[ 6], kOne);
+ step2[ 7] = _mm256_add_epi16(step2[ 7], kOne);
+ step2[ 8] = _mm256_add_epi16(step2[ 8], kOne);
+ step2[ 9] = _mm256_add_epi16(step2[ 9], kOne);
+ step2[10] = _mm256_add_epi16(step2[10], kOne);
+ step2[11] = _mm256_add_epi16(step2[11], kOne);
+ step2[12] = _mm256_add_epi16(step2[12], kOne);
+ step2[13] = _mm256_add_epi16(step2[13], kOne);
+ step2[14] = _mm256_add_epi16(step2[14], kOne);
+ step2[15] = _mm256_add_epi16(step2[15], kOne);
+ step1[16] = _mm256_add_epi16(step1[16], kOne);
+ step1[17] = _mm256_add_epi16(step1[17], kOne);
+ step1[18] = _mm256_add_epi16(step1[18], kOne);
+ step1[19] = _mm256_add_epi16(step1[19], kOne);
+ step2[20] = _mm256_add_epi16(step2[20], kOne);
+ step2[21] = _mm256_add_epi16(step2[21], kOne);
+ step2[22] = _mm256_add_epi16(step2[22], kOne);
+ step2[23] = _mm256_add_epi16(step2[23], kOne);
+ step2[24] = _mm256_add_epi16(step2[24], kOne);
+ step2[25] = _mm256_add_epi16(step2[25], kOne);
+ step2[26] = _mm256_add_epi16(step2[26], kOne);
+ step2[27] = _mm256_add_epi16(step2[27], kOne);
+ step1[28] = _mm256_add_epi16(step1[28], kOne);
+ step1[29] = _mm256_add_epi16(step1[29], kOne);
+ step1[30] = _mm256_add_epi16(step1[30], kOne);
+ step1[31] = _mm256_add_epi16(step1[31], kOne);
+
+ step2[ 0] = _mm256_srai_epi16(step2[ 0], 2);
+ step2[ 1] = _mm256_srai_epi16(step2[ 1], 2);
+ step2[ 2] = _mm256_srai_epi16(step2[ 2], 2);
+ step2[ 3] = _mm256_srai_epi16(step2[ 3], 2);
+ step2[ 4] = _mm256_srai_epi16(step2[ 4], 2);
+ step2[ 5] = _mm256_srai_epi16(step2[ 5], 2);
+ step2[ 6] = _mm256_srai_epi16(step2[ 6], 2);
+ step2[ 7] = _mm256_srai_epi16(step2[ 7], 2);
+ step2[ 8] = _mm256_srai_epi16(step2[ 8], 2);
+ step2[ 9] = _mm256_srai_epi16(step2[ 9], 2);
+ step2[10] = _mm256_srai_epi16(step2[10], 2);
+ step2[11] = _mm256_srai_epi16(step2[11], 2);
+ step2[12] = _mm256_srai_epi16(step2[12], 2);
+ step2[13] = _mm256_srai_epi16(step2[13], 2);
+ step2[14] = _mm256_srai_epi16(step2[14], 2);
+ step2[15] = _mm256_srai_epi16(step2[15], 2);
+ step1[16] = _mm256_srai_epi16(step1[16], 2);
+ step1[17] = _mm256_srai_epi16(step1[17], 2);
+ step1[18] = _mm256_srai_epi16(step1[18], 2);
+ step1[19] = _mm256_srai_epi16(step1[19], 2);
+ step2[20] = _mm256_srai_epi16(step2[20], 2);
+ step2[21] = _mm256_srai_epi16(step2[21], 2);
+ step2[22] = _mm256_srai_epi16(step2[22], 2);
+ step2[23] = _mm256_srai_epi16(step2[23], 2);
+ step2[24] = _mm256_srai_epi16(step2[24], 2);
+ step2[25] = _mm256_srai_epi16(step2[25], 2);
+ step2[26] = _mm256_srai_epi16(step2[26], 2);
+ step2[27] = _mm256_srai_epi16(step2[27], 2);
+ step1[28] = _mm256_srai_epi16(step1[28], 2);
+ step1[29] = _mm256_srai_epi16(step1[29], 2);
+ step1[30] = _mm256_srai_epi16(step1[30], 2);
+ step1[31] = _mm256_srai_epi16(step1[31], 2);
+ }
+#endif
+
+#if FDCT32x32_HIGH_PRECISION
+ if (pass == 0) {
+#endif
+ // Stage 3
+ {
+ step3[0] = _mm256_add_epi16(step2[(8 - 1)], step2[0]);
+ step3[1] = _mm256_add_epi16(step2[(8 - 2)], step2[1]);
+ step3[2] = _mm256_add_epi16(step2[(8 - 3)], step2[2]);
+ step3[3] = _mm256_add_epi16(step2[(8 - 4)], step2[3]);
+ step3[4] = _mm256_sub_epi16(step2[(8 - 5)], step2[4]);
+ step3[5] = _mm256_sub_epi16(step2[(8 - 6)], step2[5]);
+ step3[6] = _mm256_sub_epi16(step2[(8 - 7)], step2[6]);
+ step3[7] = _mm256_sub_epi16(step2[(8 - 8)], step2[7]);
+ }
+ {
+ const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+ const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+ const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+ const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+ const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_6 = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ const __m256i s3_10_7 = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ const __m256i s3_11_6 = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ const __m256i s3_11_7 = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ const __m256i s3_12_6 = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ const __m256i s3_12_7 = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ const __m256i s3_13_6 = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ const __m256i s3_13_7 = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ // Combine
+ step3[10] = _mm256_packs_epi32(s3_10_6, s3_10_7);
+ step3[11] = _mm256_packs_epi32(s3_11_6, s3_11_7);
+ step3[12] = _mm256_packs_epi32(s3_12_6, s3_12_7);
+ step3[13] = _mm256_packs_epi32(s3_13_6, s3_13_7);
+ }
+ {
+ step3[16] = _mm256_add_epi16(step2[23], step1[16]);
+ step3[17] = _mm256_add_epi16(step2[22], step1[17]);
+ step3[18] = _mm256_add_epi16(step2[21], step1[18]);
+ step3[19] = _mm256_add_epi16(step2[20], step1[19]);
+ step3[20] = _mm256_sub_epi16(step1[19], step2[20]);
+ step3[21] = _mm256_sub_epi16(step1[18], step2[21]);
+ step3[22] = _mm256_sub_epi16(step1[17], step2[22]);
+ step3[23] = _mm256_sub_epi16(step1[16], step2[23]);
+ step3[24] = _mm256_sub_epi16(step1[31], step2[24]);
+ step3[25] = _mm256_sub_epi16(step1[30], step2[25]);
+ step3[26] = _mm256_sub_epi16(step1[29], step2[26]);
+ step3[27] = _mm256_sub_epi16(step1[28], step2[27]);
+ step3[28] = _mm256_add_epi16(step2[27], step1[28]);
+ step3[29] = _mm256_add_epi16(step2[26], step1[29]);
+ step3[30] = _mm256_add_epi16(step2[25], step1[30]);
+ step3[31] = _mm256_add_epi16(step2[24], step1[31]);
+ }
+
+ // Stage 4
+ {
+ step1[ 0] = _mm256_add_epi16(step3[ 3], step3[ 0]);
+ step1[ 1] = _mm256_add_epi16(step3[ 2], step3[ 1]);
+ step1[ 2] = _mm256_sub_epi16(step3[ 1], step3[ 2]);
+ step1[ 3] = _mm256_sub_epi16(step3[ 0], step3[ 3]);
+ step1[ 8] = _mm256_add_epi16(step3[11], step2[ 8]);
+ step1[ 9] = _mm256_add_epi16(step3[10], step2[ 9]);
+ step1[10] = _mm256_sub_epi16(step2[ 9], step3[10]);
+ step1[11] = _mm256_sub_epi16(step2[ 8], step3[11]);
+ step1[12] = _mm256_sub_epi16(step2[15], step3[12]);
+ step1[13] = _mm256_sub_epi16(step2[14], step3[13]);
+ step1[14] = _mm256_add_epi16(step3[13], step2[14]);
+ step1[15] = _mm256_add_epi16(step3[12], step2[15]);
+ }
+ {
+ const __m256i s1_05_0 = _mm256_unpacklo_epi16(step3[6], step3[5]);
+ const __m256i s1_05_1 = _mm256_unpackhi_epi16(step3[6], step3[5]);
+ const __m256i s1_05_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_m16);
+ const __m256i s1_05_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_m16);
+ const __m256i s1_06_2 = _mm256_madd_epi16(s1_05_0, k__cospi_p16_p16);
+ const __m256i s1_06_3 = _mm256_madd_epi16(s1_05_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s1_05_4 = _mm256_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_5 = _mm256_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_4 = _mm256_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_06_5 = _mm256_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_05_6 = _mm256_srai_epi32(s1_05_4, DCT_CONST_BITS);
+ const __m256i s1_05_7 = _mm256_srai_epi32(s1_05_5, DCT_CONST_BITS);
+ const __m256i s1_06_6 = _mm256_srai_epi32(s1_06_4, DCT_CONST_BITS);
+ const __m256i s1_06_7 = _mm256_srai_epi32(s1_06_5, DCT_CONST_BITS);
+ // Combine
+ step1[5] = _mm256_packs_epi32(s1_05_6, s1_05_7);
+ step1[6] = _mm256_packs_epi32(s1_06_6, s1_06_7);
+ }
+ {
+ const __m256i s1_18_0 = _mm256_unpacklo_epi16(step3[18], step3[29]);
+ const __m256i s1_18_1 = _mm256_unpackhi_epi16(step3[18], step3[29]);
+ const __m256i s1_19_0 = _mm256_unpacklo_epi16(step3[19], step3[28]);
+ const __m256i s1_19_1 = _mm256_unpackhi_epi16(step3[19], step3[28]);
+ const __m256i s1_20_0 = _mm256_unpacklo_epi16(step3[20], step3[27]);
+ const __m256i s1_20_1 = _mm256_unpackhi_epi16(step3[20], step3[27]);
+ const __m256i s1_21_0 = _mm256_unpacklo_epi16(step3[21], step3[26]);
+ const __m256i s1_21_1 = _mm256_unpackhi_epi16(step3[21], step3[26]);
+ const __m256i s1_18_2 = _mm256_madd_epi16(s1_18_0, k__cospi_m08_p24);
+ const __m256i s1_18_3 = _mm256_madd_epi16(s1_18_1, k__cospi_m08_p24);
+ const __m256i s1_19_2 = _mm256_madd_epi16(s1_19_0, k__cospi_m08_p24);
+ const __m256i s1_19_3 = _mm256_madd_epi16(s1_19_1, k__cospi_m08_p24);
+ const __m256i s1_20_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m24_m08);
+ const __m256i s1_20_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m24_m08);
+ const __m256i s1_21_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m24_m08);
+ const __m256i s1_21_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m24_m08);
+ const __m256i s1_26_2 = _mm256_madd_epi16(s1_21_0, k__cospi_m08_p24);
+ const __m256i s1_26_3 = _mm256_madd_epi16(s1_21_1, k__cospi_m08_p24);
+ const __m256i s1_27_2 = _mm256_madd_epi16(s1_20_0, k__cospi_m08_p24);
+ const __m256i s1_27_3 = _mm256_madd_epi16(s1_20_1, k__cospi_m08_p24);
+ const __m256i s1_28_2 = _mm256_madd_epi16(s1_19_0, k__cospi_p24_p08);
+ const __m256i s1_28_3 = _mm256_madd_epi16(s1_19_1, k__cospi_p24_p08);
+ const __m256i s1_29_2 = _mm256_madd_epi16(s1_18_0, k__cospi_p24_p08);
+ const __m256i s1_29_3 = _mm256_madd_epi16(s1_18_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s1_18_4 = _mm256_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_5 = _mm256_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_4 = _mm256_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_19_5 = _mm256_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_4 = _mm256_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_20_5 = _mm256_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_4 = _mm256_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_21_5 = _mm256_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_4 = _mm256_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_26_5 = _mm256_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_4 = _mm256_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_27_5 = _mm256_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_4 = _mm256_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_28_5 = _mm256_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_4 = _mm256_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s1_29_5 = _mm256_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s1_18_6 = _mm256_srai_epi32(s1_18_4, DCT_CONST_BITS);
+ const __m256i s1_18_7 = _mm256_srai_epi32(s1_18_5, DCT_CONST_BITS);
+ const __m256i s1_19_6 = _mm256_srai_epi32(s1_19_4, DCT_CONST_BITS);
+ const __m256i s1_19_7 = _mm256_srai_epi32(s1_19_5, DCT_CONST_BITS);
+ const __m256i s1_20_6 = _mm256_srai_epi32(s1_20_4, DCT_CONST_BITS);
+ const __m256i s1_20_7 = _mm256_srai_epi32(s1_20_5, DCT_CONST_BITS);
+ const __m256i s1_21_6 = _mm256_srai_epi32(s1_21_4, DCT_CONST_BITS);
+ const __m256i s1_21_7 = _mm256_srai_epi32(s1_21_5, DCT_CONST_BITS);
+ const __m256i s1_26_6 = _mm256_srai_epi32(s1_26_4, DCT_CONST_BITS);
+ const __m256i s1_26_7 = _mm256_srai_epi32(s1_26_5, DCT_CONST_BITS);
+ const __m256i s1_27_6 = _mm256_srai_epi32(s1_27_4, DCT_CONST_BITS);
+ const __m256i s1_27_7 = _mm256_srai_epi32(s1_27_5, DCT_CONST_BITS);
+ const __m256i s1_28_6 = _mm256_srai_epi32(s1_28_4, DCT_CONST_BITS);
+ const __m256i s1_28_7 = _mm256_srai_epi32(s1_28_5, DCT_CONST_BITS);
+ const __m256i s1_29_6 = _mm256_srai_epi32(s1_29_4, DCT_CONST_BITS);
+ const __m256i s1_29_7 = _mm256_srai_epi32(s1_29_5, DCT_CONST_BITS);
+ // Combine
+ step1[18] = _mm256_packs_epi32(s1_18_6, s1_18_7);
+ step1[19] = _mm256_packs_epi32(s1_19_6, s1_19_7);
+ step1[20] = _mm256_packs_epi32(s1_20_6, s1_20_7);
+ step1[21] = _mm256_packs_epi32(s1_21_6, s1_21_7);
+ step1[26] = _mm256_packs_epi32(s1_26_6, s1_26_7);
+ step1[27] = _mm256_packs_epi32(s1_27_6, s1_27_7);
+ step1[28] = _mm256_packs_epi32(s1_28_6, s1_28_7);
+ step1[29] = _mm256_packs_epi32(s1_29_6, s1_29_7);
+ }
+ // Stage 5
+ {
+ step2[4] = _mm256_add_epi16(step1[5], step3[4]);
+ step2[5] = _mm256_sub_epi16(step3[4], step1[5]);
+ step2[6] = _mm256_sub_epi16(step3[7], step1[6]);
+ step2[7] = _mm256_add_epi16(step1[6], step3[7]);
+ }
+ {
+ const __m256i out_00_0 = _mm256_unpacklo_epi16(step1[0], step1[1]);
+ const __m256i out_00_1 = _mm256_unpackhi_epi16(step1[0], step1[1]);
+ const __m256i out_08_0 = _mm256_unpacklo_epi16(step1[2], step1[3]);
+ const __m256i out_08_1 = _mm256_unpackhi_epi16(step1[2], step1[3]);
+ const __m256i out_00_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_p16);
+ const __m256i out_00_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_p16);
+ const __m256i out_16_2 = _mm256_madd_epi16(out_00_0, k__cospi_p16_m16);
+ const __m256i out_16_3 = _mm256_madd_epi16(out_00_1, k__cospi_p16_m16);
+ const __m256i out_08_2 = _mm256_madd_epi16(out_08_0, k__cospi_p24_p08);
+ const __m256i out_08_3 = _mm256_madd_epi16(out_08_1, k__cospi_p24_p08);
+ const __m256i out_24_2 = _mm256_madd_epi16(out_08_0, k__cospi_m08_p24);
+ const __m256i out_24_3 = _mm256_madd_epi16(out_08_1, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m256i out_00_4 = _mm256_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_5 = _mm256_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_4 = _mm256_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_16_5 = _mm256_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_4 = _mm256_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_08_5 = _mm256_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_4 = _mm256_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_24_5 = _mm256_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_00_6 = _mm256_srai_epi32(out_00_4, DCT_CONST_BITS);
+ const __m256i out_00_7 = _mm256_srai_epi32(out_00_5, DCT_CONST_BITS);
+ const __m256i out_16_6 = _mm256_srai_epi32(out_16_4, DCT_CONST_BITS);
+ const __m256i out_16_7 = _mm256_srai_epi32(out_16_5, DCT_CONST_BITS);
+ const __m256i out_08_6 = _mm256_srai_epi32(out_08_4, DCT_CONST_BITS);
+ const __m256i out_08_7 = _mm256_srai_epi32(out_08_5, DCT_CONST_BITS);
+ const __m256i out_24_6 = _mm256_srai_epi32(out_24_4, DCT_CONST_BITS);
+ const __m256i out_24_7 = _mm256_srai_epi32(out_24_5, DCT_CONST_BITS);
+ // Combine
+ out[ 0] = _mm256_packs_epi32(out_00_6, out_00_7);
+ out[16] = _mm256_packs_epi32(out_16_6, out_16_7);
+ out[ 8] = _mm256_packs_epi32(out_08_6, out_08_7);
+ out[24] = _mm256_packs_epi32(out_24_6, out_24_7);
+ }
+ {
+ const __m256i s2_09_0 = _mm256_unpacklo_epi16(step1[ 9], step1[14]);
+ const __m256i s2_09_1 = _mm256_unpackhi_epi16(step1[ 9], step1[14]);
+ const __m256i s2_10_0 = _mm256_unpacklo_epi16(step1[10], step1[13]);
+ const __m256i s2_10_1 = _mm256_unpackhi_epi16(step1[10], step1[13]);
+ const __m256i s2_09_2 = _mm256_madd_epi16(s2_09_0, k__cospi_m08_p24);
+ const __m256i s2_09_3 = _mm256_madd_epi16(s2_09_1, k__cospi_m08_p24);
+ const __m256i s2_10_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m24_m08);
+ const __m256i s2_10_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m24_m08);
+ const __m256i s2_13_2 = _mm256_madd_epi16(s2_10_0, k__cospi_m08_p24);
+ const __m256i s2_13_3 = _mm256_madd_epi16(s2_10_1, k__cospi_m08_p24);
+ const __m256i s2_14_2 = _mm256_madd_epi16(s2_09_0, k__cospi_p24_p08);
+ const __m256i s2_14_3 = _mm256_madd_epi16(s2_09_1, k__cospi_p24_p08);
+ // dct_const_round_shift
+ const __m256i s2_09_4 = _mm256_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_5 = _mm256_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_4 = _mm256_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_10_5 = _mm256_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_4 = _mm256_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_13_5 = _mm256_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_4 = _mm256_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i s2_14_5 = _mm256_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i s2_09_6 = _mm256_srai_epi32(s2_09_4, DCT_CONST_BITS);
+ const __m256i s2_09_7 = _mm256_srai_epi32(s2_09_5, DCT_CONST_BITS);
+ const __m256i s2_10_6 = _mm256_srai_epi32(s2_10_4, DCT_CONST_BITS);
+ const __m256i s2_10_7 = _mm256_srai_epi32(s2_10_5, DCT_CONST_BITS);
+ const __m256i s2_13_6 = _mm256_srai_epi32(s2_13_4, DCT_CONST_BITS);
+ const __m256i s2_13_7 = _mm256_srai_epi32(s2_13_5, DCT_CONST_BITS);
+ const __m256i s2_14_6 = _mm256_srai_epi32(s2_14_4, DCT_CONST_BITS);
+ const __m256i s2_14_7 = _mm256_srai_epi32(s2_14_5, DCT_CONST_BITS);
+ // Combine
+ step2[ 9] = _mm256_packs_epi32(s2_09_6, s2_09_7);
+ step2[10] = _mm256_packs_epi32(s2_10_6, s2_10_7);
+ step2[13] = _mm256_packs_epi32(s2_13_6, s2_13_7);
+ step2[14] = _mm256_packs_epi32(s2_14_6, s2_14_7);
+ }
+ {
+ step2[16] = _mm256_add_epi16(step1[19], step3[16]);
+ step2[17] = _mm256_add_epi16(step1[18], step3[17]);
+ step2[18] = _mm256_sub_epi16(step3[17], step1[18]);
+ step2[19] = _mm256_sub_epi16(step3[16], step1[19]);
+ step2[20] = _mm256_sub_epi16(step3[23], step1[20]);
+ step2[21] = _mm256_sub_epi16(step3[22], step1[21]);
+ step2[22] = _mm256_add_epi16(step1[21], step3[22]);
+ step2[23] = _mm256_add_epi16(step1[20], step3[23]);
+ step2[24] = _mm256_add_epi16(step1[27], step3[24]);
+ step2[25] = _mm256_add_epi16(step1[26], step3[25]);
+ step2[26] = _mm256_sub_epi16(step3[25], step1[26]);
+ step2[27] = _mm256_sub_epi16(step3[24], step1[27]);
+ step2[28] = _mm256_sub_epi16(step3[31], step1[28]);
+ step2[29] = _mm256_sub_epi16(step3[30], step1[29]);
+ step2[30] = _mm256_add_epi16(step1[29], step3[30]);
+ step2[31] = _mm256_add_epi16(step1[28], step3[31]);
+ }
+ // Stage 6
+ {
+ const __m256i out_04_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_04_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_20_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_20_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_12_0 = _mm256_unpacklo_epi16(step2[5], step2[6]);
+ const __m256i out_12_1 = _mm256_unpackhi_epi16(step2[5], step2[6]);
+ const __m256i out_28_0 = _mm256_unpacklo_epi16(step2[4], step2[7]);
+ const __m256i out_28_1 = _mm256_unpackhi_epi16(step2[4], step2[7]);
+ const __m256i out_04_2 = _mm256_madd_epi16(out_04_0, k__cospi_p28_p04);
+ const __m256i out_04_3 = _mm256_madd_epi16(out_04_1, k__cospi_p28_p04);
+ const __m256i out_20_2 = _mm256_madd_epi16(out_20_0, k__cospi_p12_p20);
+ const __m256i out_20_3 = _mm256_madd_epi16(out_20_1, k__cospi_p12_p20);
+ const __m256i out_12_2 = _mm256_madd_epi16(out_12_0, k__cospi_m20_p12);
+ const __m256i out_12_3 = _mm256_madd_epi16(out_12_1, k__cospi_m20_p12);
+ const __m256i out_28_2 = _mm256_madd_epi16(out_28_0, k__cospi_m04_p28);
+ const __m256i out_28_3 = _mm256_madd_epi16(out_28_1, k__cospi_m04_p28);
+ // dct_const_round_shift
+ const __m256i out_04_4 = _mm256_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_5 = _mm256_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_4 = _mm256_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_20_5 = _mm256_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_4 = _mm256_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_12_5 = _mm256_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_4 = _mm256_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_28_5 = _mm256_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_04_6 = _mm256_srai_epi32(out_04_4, DCT_CONST_BITS);
+ const __m256i out_04_7 = _mm256_srai_epi32(out_04_5, DCT_CONST_BITS);
+ const __m256i out_20_6 = _mm256_srai_epi32(out_20_4, DCT_CONST_BITS);
+ const __m256i out_20_7 = _mm256_srai_epi32(out_20_5, DCT_CONST_BITS);
+ const __m256i out_12_6 = _mm256_srai_epi32(out_12_4, DCT_CONST_BITS);
+ const __m256i out_12_7 = _mm256_srai_epi32(out_12_5, DCT_CONST_BITS);
+ const __m256i out_28_6 = _mm256_srai_epi32(out_28_4, DCT_CONST_BITS);
+ const __m256i out_28_7 = _mm256_srai_epi32(out_28_5, DCT_CONST_BITS);
+ // Combine
+ out[ 4] = _mm256_packs_epi32(out_04_6, out_04_7);
+ out[20] = _mm256_packs_epi32(out_20_6, out_20_7);
+ out[12] = _mm256_packs_epi32(out_12_6, out_12_7);
+ out[28] = _mm256_packs_epi32(out_28_6, out_28_7);
+ }
+ {
+ step3[ 8] = _mm256_add_epi16(step2[ 9], step1[ 8]);
+ step3[ 9] = _mm256_sub_epi16(step1[ 8], step2[ 9]);
+ step3[10] = _mm256_sub_epi16(step1[11], step2[10]);
+ step3[11] = _mm256_add_epi16(step2[10], step1[11]);
+ step3[12] = _mm256_add_epi16(step2[13], step1[12]);
+ step3[13] = _mm256_sub_epi16(step1[12], step2[13]);
+ step3[14] = _mm256_sub_epi16(step1[15], step2[14]);
+ step3[15] = _mm256_add_epi16(step2[14], step1[15]);
+ }
+ {
+ const __m256i s3_17_0 = _mm256_unpacklo_epi16(step2[17], step2[30]);
+ const __m256i s3_17_1 = _mm256_unpackhi_epi16(step2[17], step2[30]);
+ const __m256i s3_18_0 = _mm256_unpacklo_epi16(step2[18], step2[29]);
+ const __m256i s3_18_1 = _mm256_unpackhi_epi16(step2[18], step2[29]);
+ const __m256i s3_21_0 = _mm256_unpacklo_epi16(step2[21], step2[26]);
+ const __m256i s3_21_1 = _mm256_unpackhi_epi16(step2[21], step2[26]);
+ const __m256i s3_22_0 = _mm256_unpacklo_epi16(step2[22], step2[25]);
+ const __m256i s3_22_1 = _mm256_unpackhi_epi16(step2[22], step2[25]);
+ const __m256i s3_17_2 = _mm256_madd_epi16(s3_17_0, k__cospi_m04_p28);
+ const __m256i s3_17_3 = _mm256_madd_epi16(s3_17_1, k__cospi_m04_p28);
+ const __m256i s3_18_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m28_m04);
+ const __m256i s3_18_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m28_m04);
+ const __m256i s3_21_2 = _mm256_madd_epi16(s3_21_0, k__cospi_m20_p12);
+ const __m256i s3_21_3 = _mm256_madd_epi16(s3_21_1, k__cospi_m20_p12);
+ const __m256i s3_22_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m12_m20);
+ const __m256i s3_22_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m12_m20);
+ const __m256i s3_25_2 = _mm256_madd_epi16(s3_22_0, k__cospi_m20_p12);
+ const __m256i s3_25_3 = _mm256_madd_epi16(s3_22_1, k__cospi_m20_p12);
+ const __m256i s3_26_2 = _mm256_madd_epi16(s3_21_0, k__cospi_p12_p20);
+ const __m256i s3_26_3 = _mm256_madd_epi16(s3_21_1, k__cospi_p12_p20);
+ const __m256i s3_29_2 = _mm256_madd_epi16(s3_18_0, k__cospi_m04_p28);
+ const __m256i s3_29_3 = _mm256_madd_epi16(s3_18_1, k__cospi_m04_p28);
+ const __m256i s3_30_2 = _mm256_madd_epi16(s3_17_0, k__cospi_p28_p04);
+ const __m256i s3_30_3 = _mm256_madd_epi16(s3_17_1, k__cospi_p28_p04);
+ // dct_const_round_shift
+ const __m256i s3_17_4 = _mm256_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_5 = _mm256_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_4 = _mm256_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_18_5 = _mm256_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_4 = _mm256_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_21_5 = _mm256_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_4 = _mm256_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_22_5 = _mm256_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_17_6 = _mm256_srai_epi32(s3_17_4, DCT_CONST_BITS);
+ const __m256i s3_17_7 = _mm256_srai_epi32(s3_17_5, DCT_CONST_BITS);
+ const __m256i s3_18_6 = _mm256_srai_epi32(s3_18_4, DCT_CONST_BITS);
+ const __m256i s3_18_7 = _mm256_srai_epi32(s3_18_5, DCT_CONST_BITS);
+ const __m256i s3_21_6 = _mm256_srai_epi32(s3_21_4, DCT_CONST_BITS);
+ const __m256i s3_21_7 = _mm256_srai_epi32(s3_21_5, DCT_CONST_BITS);
+ const __m256i s3_22_6 = _mm256_srai_epi32(s3_22_4, DCT_CONST_BITS);
+ const __m256i s3_22_7 = _mm256_srai_epi32(s3_22_5, DCT_CONST_BITS);
+ const __m256i s3_25_4 = _mm256_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_5 = _mm256_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_4 = _mm256_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_26_5 = _mm256_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_4 = _mm256_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_29_5 = _mm256_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_4 = _mm256_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_30_5 = _mm256_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_25_6 = _mm256_srai_epi32(s3_25_4, DCT_CONST_BITS);
+ const __m256i s3_25_7 = _mm256_srai_epi32(s3_25_5, DCT_CONST_BITS);
+ const __m256i s3_26_6 = _mm256_srai_epi32(s3_26_4, DCT_CONST_BITS);
+ const __m256i s3_26_7 = _mm256_srai_epi32(s3_26_5, DCT_CONST_BITS);
+ const __m256i s3_29_6 = _mm256_srai_epi32(s3_29_4, DCT_CONST_BITS);
+ const __m256i s3_29_7 = _mm256_srai_epi32(s3_29_5, DCT_CONST_BITS);
+ const __m256i s3_30_6 = _mm256_srai_epi32(s3_30_4, DCT_CONST_BITS);
+ const __m256i s3_30_7 = _mm256_srai_epi32(s3_30_5, DCT_CONST_BITS);
+ // Combine
+ step3[17] = _mm256_packs_epi32(s3_17_6, s3_17_7);
+ step3[18] = _mm256_packs_epi32(s3_18_6, s3_18_7);
+ step3[21] = _mm256_packs_epi32(s3_21_6, s3_21_7);
+ step3[22] = _mm256_packs_epi32(s3_22_6, s3_22_7);
+ // Combine
+ step3[25] = _mm256_packs_epi32(s3_25_6, s3_25_7);
+ step3[26] = _mm256_packs_epi32(s3_26_6, s3_26_7);
+ step3[29] = _mm256_packs_epi32(s3_29_6, s3_29_7);
+ step3[30] = _mm256_packs_epi32(s3_30_6, s3_30_7);
+ }
+ // Stage 7
+ {
+ const __m256i out_02_0 = _mm256_unpacklo_epi16(step3[ 8], step3[15]);
+ const __m256i out_02_1 = _mm256_unpackhi_epi16(step3[ 8], step3[15]);
+ const __m256i out_18_0 = _mm256_unpacklo_epi16(step3[ 9], step3[14]);
+ const __m256i out_18_1 = _mm256_unpackhi_epi16(step3[ 9], step3[14]);
+ const __m256i out_10_0 = _mm256_unpacklo_epi16(step3[10], step3[13]);
+ const __m256i out_10_1 = _mm256_unpackhi_epi16(step3[10], step3[13]);
+ const __m256i out_26_0 = _mm256_unpacklo_epi16(step3[11], step3[12]);
+ const __m256i out_26_1 = _mm256_unpackhi_epi16(step3[11], step3[12]);
+ const __m256i out_02_2 = _mm256_madd_epi16(out_02_0, k__cospi_p30_p02);
+ const __m256i out_02_3 = _mm256_madd_epi16(out_02_1, k__cospi_p30_p02);
+ const __m256i out_18_2 = _mm256_madd_epi16(out_18_0, k__cospi_p14_p18);
+ const __m256i out_18_3 = _mm256_madd_epi16(out_18_1, k__cospi_p14_p18);
+ const __m256i out_10_2 = _mm256_madd_epi16(out_10_0, k__cospi_p22_p10);
+ const __m256i out_10_3 = _mm256_madd_epi16(out_10_1, k__cospi_p22_p10);
+ const __m256i out_26_2 = _mm256_madd_epi16(out_26_0, k__cospi_p06_p26);
+ const __m256i out_26_3 = _mm256_madd_epi16(out_26_1, k__cospi_p06_p26);
+ const __m256i out_06_2 = _mm256_madd_epi16(out_26_0, k__cospi_m26_p06);
+ const __m256i out_06_3 = _mm256_madd_epi16(out_26_1, k__cospi_m26_p06);
+ const __m256i out_22_2 = _mm256_madd_epi16(out_10_0, k__cospi_m10_p22);
+ const __m256i out_22_3 = _mm256_madd_epi16(out_10_1, k__cospi_m10_p22);
+ const __m256i out_14_2 = _mm256_madd_epi16(out_18_0, k__cospi_m18_p14);
+ const __m256i out_14_3 = _mm256_madd_epi16(out_18_1, k__cospi_m18_p14);
+ const __m256i out_30_2 = _mm256_madd_epi16(out_02_0, k__cospi_m02_p30);
+ const __m256i out_30_3 = _mm256_madd_epi16(out_02_1, k__cospi_m02_p30);
+ // dct_const_round_shift
+ const __m256i out_02_4 = _mm256_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_5 = _mm256_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_4 = _mm256_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_18_5 = _mm256_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_4 = _mm256_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_10_5 = _mm256_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_4 = _mm256_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_26_5 = _mm256_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_4 = _mm256_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_06_5 = _mm256_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_4 = _mm256_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_22_5 = _mm256_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_4 = _mm256_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_14_5 = _mm256_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_4 = _mm256_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_30_5 = _mm256_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_02_6 = _mm256_srai_epi32(out_02_4, DCT_CONST_BITS);
+ const __m256i out_02_7 = _mm256_srai_epi32(out_02_5, DCT_CONST_BITS);
+ const __m256i out_18_6 = _mm256_srai_epi32(out_18_4, DCT_CONST_BITS);
+ const __m256i out_18_7 = _mm256_srai_epi32(out_18_5, DCT_CONST_BITS);
+ const __m256i out_10_6 = _mm256_srai_epi32(out_10_4, DCT_CONST_BITS);
+ const __m256i out_10_7 = _mm256_srai_epi32(out_10_5, DCT_CONST_BITS);
+ const __m256i out_26_6 = _mm256_srai_epi32(out_26_4, DCT_CONST_BITS);
+ const __m256i out_26_7 = _mm256_srai_epi32(out_26_5, DCT_CONST_BITS);
+ const __m256i out_06_6 = _mm256_srai_epi32(out_06_4, DCT_CONST_BITS);
+ const __m256i out_06_7 = _mm256_srai_epi32(out_06_5, DCT_CONST_BITS);
+ const __m256i out_22_6 = _mm256_srai_epi32(out_22_4, DCT_CONST_BITS);
+ const __m256i out_22_7 = _mm256_srai_epi32(out_22_5, DCT_CONST_BITS);
+ const __m256i out_14_6 = _mm256_srai_epi32(out_14_4, DCT_CONST_BITS);
+ const __m256i out_14_7 = _mm256_srai_epi32(out_14_5, DCT_CONST_BITS);
+ const __m256i out_30_6 = _mm256_srai_epi32(out_30_4, DCT_CONST_BITS);
+ const __m256i out_30_7 = _mm256_srai_epi32(out_30_5, DCT_CONST_BITS);
+ // Combine
+ out[ 2] = _mm256_packs_epi32(out_02_6, out_02_7);
+ out[18] = _mm256_packs_epi32(out_18_6, out_18_7);
+ out[10] = _mm256_packs_epi32(out_10_6, out_10_7);
+ out[26] = _mm256_packs_epi32(out_26_6, out_26_7);
+ out[ 6] = _mm256_packs_epi32(out_06_6, out_06_7);
+ out[22] = _mm256_packs_epi32(out_22_6, out_22_7);
+ out[14] = _mm256_packs_epi32(out_14_6, out_14_7);
+ out[30] = _mm256_packs_epi32(out_30_6, out_30_7);
+ }
+ {
+ step1[16] = _mm256_add_epi16(step3[17], step2[16]);
+ step1[17] = _mm256_sub_epi16(step2[16], step3[17]);
+ step1[18] = _mm256_sub_epi16(step2[19], step3[18]);
+ step1[19] = _mm256_add_epi16(step3[18], step2[19]);
+ step1[20] = _mm256_add_epi16(step3[21], step2[20]);
+ step1[21] = _mm256_sub_epi16(step2[20], step3[21]);
+ step1[22] = _mm256_sub_epi16(step2[23], step3[22]);
+ step1[23] = _mm256_add_epi16(step3[22], step2[23]);
+ step1[24] = _mm256_add_epi16(step3[25], step2[24]);
+ step1[25] = _mm256_sub_epi16(step2[24], step3[25]);
+ step1[26] = _mm256_sub_epi16(step2[27], step3[26]);
+ step1[27] = _mm256_add_epi16(step3[26], step2[27]);
+ step1[28] = _mm256_add_epi16(step3[29], step2[28]);
+ step1[29] = _mm256_sub_epi16(step2[28], step3[29]);
+ step1[30] = _mm256_sub_epi16(step2[31], step3[30]);
+ step1[31] = _mm256_add_epi16(step3[30], step2[31]);
+ }
+      // Final stage: output indices are bit-reversed.
+ {
+ const __m256i out_01_0 = _mm256_unpacklo_epi16(step1[16], step1[31]);
+ const __m256i out_01_1 = _mm256_unpackhi_epi16(step1[16], step1[31]);
+ const __m256i out_17_0 = _mm256_unpacklo_epi16(step1[17], step1[30]);
+ const __m256i out_17_1 = _mm256_unpackhi_epi16(step1[17], step1[30]);
+ const __m256i out_09_0 = _mm256_unpacklo_epi16(step1[18], step1[29]);
+ const __m256i out_09_1 = _mm256_unpackhi_epi16(step1[18], step1[29]);
+ const __m256i out_25_0 = _mm256_unpacklo_epi16(step1[19], step1[28]);
+ const __m256i out_25_1 = _mm256_unpackhi_epi16(step1[19], step1[28]);
+ const __m256i out_01_2 = _mm256_madd_epi16(out_01_0, k__cospi_p31_p01);
+ const __m256i out_01_3 = _mm256_madd_epi16(out_01_1, k__cospi_p31_p01);
+ const __m256i out_17_2 = _mm256_madd_epi16(out_17_0, k__cospi_p15_p17);
+ const __m256i out_17_3 = _mm256_madd_epi16(out_17_1, k__cospi_p15_p17);
+ const __m256i out_09_2 = _mm256_madd_epi16(out_09_0, k__cospi_p23_p09);
+ const __m256i out_09_3 = _mm256_madd_epi16(out_09_1, k__cospi_p23_p09);
+ const __m256i out_25_2 = _mm256_madd_epi16(out_25_0, k__cospi_p07_p25);
+ const __m256i out_25_3 = _mm256_madd_epi16(out_25_1, k__cospi_p07_p25);
+ const __m256i out_07_2 = _mm256_madd_epi16(out_25_0, k__cospi_m25_p07);
+ const __m256i out_07_3 = _mm256_madd_epi16(out_25_1, k__cospi_m25_p07);
+ const __m256i out_23_2 = _mm256_madd_epi16(out_09_0, k__cospi_m09_p23);
+ const __m256i out_23_3 = _mm256_madd_epi16(out_09_1, k__cospi_m09_p23);
+ const __m256i out_15_2 = _mm256_madd_epi16(out_17_0, k__cospi_m17_p15);
+ const __m256i out_15_3 = _mm256_madd_epi16(out_17_1, k__cospi_m17_p15);
+ const __m256i out_31_2 = _mm256_madd_epi16(out_01_0, k__cospi_m01_p31);
+ const __m256i out_31_3 = _mm256_madd_epi16(out_01_1, k__cospi_m01_p31);
+ // dct_const_round_shift
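+      // Adding k__DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) and shifting
+      // right arithmetically by DCT_CONST_BITS rounds each product to the nearest
+      // integer, as the scalar dct_const_round_shift() does.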
+ const __m256i out_01_4 = _mm256_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_5 = _mm256_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_4 = _mm256_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_17_5 = _mm256_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_4 = _mm256_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_09_5 = _mm256_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_4 = _mm256_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_25_5 = _mm256_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_4 = _mm256_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_07_5 = _mm256_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_4 = _mm256_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_23_5 = _mm256_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_4 = _mm256_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_15_5 = _mm256_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_4 = _mm256_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_31_5 = _mm256_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_01_6 = _mm256_srai_epi32(out_01_4, DCT_CONST_BITS);
+ const __m256i out_01_7 = _mm256_srai_epi32(out_01_5, DCT_CONST_BITS);
+ const __m256i out_17_6 = _mm256_srai_epi32(out_17_4, DCT_CONST_BITS);
+ const __m256i out_17_7 = _mm256_srai_epi32(out_17_5, DCT_CONST_BITS);
+ const __m256i out_09_6 = _mm256_srai_epi32(out_09_4, DCT_CONST_BITS);
+ const __m256i out_09_7 = _mm256_srai_epi32(out_09_5, DCT_CONST_BITS);
+ const __m256i out_25_6 = _mm256_srai_epi32(out_25_4, DCT_CONST_BITS);
+ const __m256i out_25_7 = _mm256_srai_epi32(out_25_5, DCT_CONST_BITS);
+ const __m256i out_07_6 = _mm256_srai_epi32(out_07_4, DCT_CONST_BITS);
+ const __m256i out_07_7 = _mm256_srai_epi32(out_07_5, DCT_CONST_BITS);
+ const __m256i out_23_6 = _mm256_srai_epi32(out_23_4, DCT_CONST_BITS);
+ const __m256i out_23_7 = _mm256_srai_epi32(out_23_5, DCT_CONST_BITS);
+ const __m256i out_15_6 = _mm256_srai_epi32(out_15_4, DCT_CONST_BITS);
+ const __m256i out_15_7 = _mm256_srai_epi32(out_15_5, DCT_CONST_BITS);
+ const __m256i out_31_6 = _mm256_srai_epi32(out_31_4, DCT_CONST_BITS);
+ const __m256i out_31_7 = _mm256_srai_epi32(out_31_5, DCT_CONST_BITS);
+ // Combine
+ out[ 1] = _mm256_packs_epi32(out_01_6, out_01_7);
+ out[17] = _mm256_packs_epi32(out_17_6, out_17_7);
+ out[ 9] = _mm256_packs_epi32(out_09_6, out_09_7);
+ out[25] = _mm256_packs_epi32(out_25_6, out_25_7);
+ out[ 7] = _mm256_packs_epi32(out_07_6, out_07_7);
+ out[23] = _mm256_packs_epi32(out_23_6, out_23_7);
+ out[15] = _mm256_packs_epi32(out_15_6, out_15_7);
+ out[31] = _mm256_packs_epi32(out_31_6, out_31_7);
+ }
+ {
+ const __m256i out_05_0 = _mm256_unpacklo_epi16(step1[20], step1[27]);
+ const __m256i out_05_1 = _mm256_unpackhi_epi16(step1[20], step1[27]);
+ const __m256i out_21_0 = _mm256_unpacklo_epi16(step1[21], step1[26]);
+ const __m256i out_21_1 = _mm256_unpackhi_epi16(step1[21], step1[26]);
+ const __m256i out_13_0 = _mm256_unpacklo_epi16(step1[22], step1[25]);
+ const __m256i out_13_1 = _mm256_unpackhi_epi16(step1[22], step1[25]);
+ const __m256i out_29_0 = _mm256_unpacklo_epi16(step1[23], step1[24]);
+ const __m256i out_29_1 = _mm256_unpackhi_epi16(step1[23], step1[24]);
+ const __m256i out_05_2 = _mm256_madd_epi16(out_05_0, k__cospi_p27_p05);
+ const __m256i out_05_3 = _mm256_madd_epi16(out_05_1, k__cospi_p27_p05);
+ const __m256i out_21_2 = _mm256_madd_epi16(out_21_0, k__cospi_p11_p21);
+ const __m256i out_21_3 = _mm256_madd_epi16(out_21_1, k__cospi_p11_p21);
+ const __m256i out_13_2 = _mm256_madd_epi16(out_13_0, k__cospi_p19_p13);
+ const __m256i out_13_3 = _mm256_madd_epi16(out_13_1, k__cospi_p19_p13);
+ const __m256i out_29_2 = _mm256_madd_epi16(out_29_0, k__cospi_p03_p29);
+ const __m256i out_29_3 = _mm256_madd_epi16(out_29_1, k__cospi_p03_p29);
+ const __m256i out_03_2 = _mm256_madd_epi16(out_29_0, k__cospi_m29_p03);
+ const __m256i out_03_3 = _mm256_madd_epi16(out_29_1, k__cospi_m29_p03);
+ const __m256i out_19_2 = _mm256_madd_epi16(out_13_0, k__cospi_m13_p19);
+ const __m256i out_19_3 = _mm256_madd_epi16(out_13_1, k__cospi_m13_p19);
+ const __m256i out_11_2 = _mm256_madd_epi16(out_21_0, k__cospi_m21_p11);
+ const __m256i out_11_3 = _mm256_madd_epi16(out_21_1, k__cospi_m21_p11);
+ const __m256i out_27_2 = _mm256_madd_epi16(out_05_0, k__cospi_m05_p27);
+ const __m256i out_27_3 = _mm256_madd_epi16(out_05_1, k__cospi_m05_p27);
+ // dct_const_round_shift
+ const __m256i out_05_4 = _mm256_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_5 = _mm256_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_4 = _mm256_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_21_5 = _mm256_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_4 = _mm256_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_13_5 = _mm256_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_4 = _mm256_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_29_5 = _mm256_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_4 = _mm256_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_03_5 = _mm256_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_4 = _mm256_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_19_5 = _mm256_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_4 = _mm256_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_11_5 = _mm256_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_4 = _mm256_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
+ const __m256i out_27_5 = _mm256_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
+ const __m256i out_05_6 = _mm256_srai_epi32(out_05_4, DCT_CONST_BITS);
+ const __m256i out_05_7 = _mm256_srai_epi32(out_05_5, DCT_CONST_BITS);
+ const __m256i out_21_6 = _mm256_srai_epi32(out_21_4, DCT_CONST_BITS);
+ const __m256i out_21_7 = _mm256_srai_epi32(out_21_5, DCT_CONST_BITS);
+ const __m256i out_13_6 = _mm256_srai_epi32(out_13_4, DCT_CONST_BITS);
+ const __m256i out_13_7 = _mm256_srai_epi32(out_13_5, DCT_CONST_BITS);
+ const __m256i out_29_6 = _mm256_srai_epi32(out_29_4, DCT_CONST_BITS);
+ const __m256i out_29_7 = _mm256_srai_epi32(out_29_5, DCT_CONST_BITS);
+ const __m256i out_03_6 = _mm256_srai_epi32(out_03_4, DCT_CONST_BITS);
+ const __m256i out_03_7 = _mm256_srai_epi32(out_03_5, DCT_CONST_BITS);
+ const __m256i out_19_6 = _mm256_srai_epi32(out_19_4, DCT_CONST_BITS);
+ const __m256i out_19_7 = _mm256_srai_epi32(out_19_5, DCT_CONST_BITS);
+ const __m256i out_11_6 = _mm256_srai_epi32(out_11_4, DCT_CONST_BITS);
+ const __m256i out_11_7 = _mm256_srai_epi32(out_11_5, DCT_CONST_BITS);
+ const __m256i out_27_6 = _mm256_srai_epi32(out_27_4, DCT_CONST_BITS);
+ const __m256i out_27_7 = _mm256_srai_epi32(out_27_5, DCT_CONST_BITS);
+ // Combine
+ out[ 5] = _mm256_packs_epi32(out_05_6, out_05_7);
+ out[21] = _mm256_packs_epi32(out_21_6, out_21_7);
+ out[13] = _mm256_packs_epi32(out_13_6, out_13_7);
+ out[29] = _mm256_packs_epi32(out_29_6, out_29_7);
+ out[ 3] = _mm256_packs_epi32(out_03_6, out_03_7);
+ out[19] = _mm256_packs_epi32(out_19_6, out_19_7);
+ out[11] = _mm256_packs_epi32(out_11_6, out_11_7);
+ out[27] = _mm256_packs_epi32(out_27_6, out_27_7);
+ }
+#if FDCT32x32_HIGH_PRECISION
+ } else {
+ __m256i lstep1[64], lstep2[64], lstep3[64];
+ __m256i u[32], v[32], sign[16];
+ const __m256i K32One = _mm256_set_epi32(1, 1, 1, 1, 1, 1, 1, 1);
+ // start using 32-bit operations
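+    // In the high-precision path the remaining stages run on 32-bit lanes
+    // (lstep1/lstep2/lstep3) so the un-rounded intermediates keep full precision
+    // instead of being forced back into 16 bits.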
+ // stage 3
+ {
+      // expanding to 32-bit length prior to addition operations
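+      // Unpacking against kZero and multiply-adding by kOne sign-extends each
+      // 16-bit value into a full 32-bit lane (x * 1 + 0 * 1 with signed madd).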
+ lstep2[ 0] = _mm256_unpacklo_epi16(step2[ 0], kZero);
+ lstep2[ 1] = _mm256_unpackhi_epi16(step2[ 0], kZero);
+ lstep2[ 2] = _mm256_unpacklo_epi16(step2[ 1], kZero);
+ lstep2[ 3] = _mm256_unpackhi_epi16(step2[ 1], kZero);
+ lstep2[ 4] = _mm256_unpacklo_epi16(step2[ 2], kZero);
+ lstep2[ 5] = _mm256_unpackhi_epi16(step2[ 2], kZero);
+ lstep2[ 6] = _mm256_unpacklo_epi16(step2[ 3], kZero);
+ lstep2[ 7] = _mm256_unpackhi_epi16(step2[ 3], kZero);
+ lstep2[ 8] = _mm256_unpacklo_epi16(step2[ 4], kZero);
+ lstep2[ 9] = _mm256_unpackhi_epi16(step2[ 4], kZero);
+ lstep2[10] = _mm256_unpacklo_epi16(step2[ 5], kZero);
+ lstep2[11] = _mm256_unpackhi_epi16(step2[ 5], kZero);
+ lstep2[12] = _mm256_unpacklo_epi16(step2[ 6], kZero);
+ lstep2[13] = _mm256_unpackhi_epi16(step2[ 6], kZero);
+ lstep2[14] = _mm256_unpacklo_epi16(step2[ 7], kZero);
+ lstep2[15] = _mm256_unpackhi_epi16(step2[ 7], kZero);
+ lstep2[ 0] = _mm256_madd_epi16(lstep2[ 0], kOne);
+ lstep2[ 1] = _mm256_madd_epi16(lstep2[ 1], kOne);
+ lstep2[ 2] = _mm256_madd_epi16(lstep2[ 2], kOne);
+ lstep2[ 3] = _mm256_madd_epi16(lstep2[ 3], kOne);
+ lstep2[ 4] = _mm256_madd_epi16(lstep2[ 4], kOne);
+ lstep2[ 5] = _mm256_madd_epi16(lstep2[ 5], kOne);
+ lstep2[ 6] = _mm256_madd_epi16(lstep2[ 6], kOne);
+ lstep2[ 7] = _mm256_madd_epi16(lstep2[ 7], kOne);
+ lstep2[ 8] = _mm256_madd_epi16(lstep2[ 8], kOne);
+ lstep2[ 9] = _mm256_madd_epi16(lstep2[ 9], kOne);
+ lstep2[10] = _mm256_madd_epi16(lstep2[10], kOne);
+ lstep2[11] = _mm256_madd_epi16(lstep2[11], kOne);
+ lstep2[12] = _mm256_madd_epi16(lstep2[12], kOne);
+ lstep2[13] = _mm256_madd_epi16(lstep2[13], kOne);
+ lstep2[14] = _mm256_madd_epi16(lstep2[14], kOne);
+ lstep2[15] = _mm256_madd_epi16(lstep2[15], kOne);
+
+ lstep3[ 0] = _mm256_add_epi32(lstep2[14], lstep2[ 0]);
+ lstep3[ 1] = _mm256_add_epi32(lstep2[15], lstep2[ 1]);
+ lstep3[ 2] = _mm256_add_epi32(lstep2[12], lstep2[ 2]);
+ lstep3[ 3] = _mm256_add_epi32(lstep2[13], lstep2[ 3]);
+ lstep3[ 4] = _mm256_add_epi32(lstep2[10], lstep2[ 4]);
+ lstep3[ 5] = _mm256_add_epi32(lstep2[11], lstep2[ 5]);
+ lstep3[ 6] = _mm256_add_epi32(lstep2[ 8], lstep2[ 6]);
+ lstep3[ 7] = _mm256_add_epi32(lstep2[ 9], lstep2[ 7]);
+ lstep3[ 8] = _mm256_sub_epi32(lstep2[ 6], lstep2[ 8]);
+ lstep3[ 9] = _mm256_sub_epi32(lstep2[ 7], lstep2[ 9]);
+ lstep3[10] = _mm256_sub_epi32(lstep2[ 4], lstep2[10]);
+ lstep3[11] = _mm256_sub_epi32(lstep2[ 5], lstep2[11]);
+ lstep3[12] = _mm256_sub_epi32(lstep2[ 2], lstep2[12]);
+ lstep3[13] = _mm256_sub_epi32(lstep2[ 3], lstep2[13]);
+ lstep3[14] = _mm256_sub_epi32(lstep2[ 0], lstep2[14]);
+ lstep3[15] = _mm256_sub_epi32(lstep2[ 1], lstep2[15]);
+ }
+ {
+ const __m256i s3_10_0 = _mm256_unpacklo_epi16(step2[13], step2[10]);
+ const __m256i s3_10_1 = _mm256_unpackhi_epi16(step2[13], step2[10]);
+ const __m256i s3_11_0 = _mm256_unpacklo_epi16(step2[12], step2[11]);
+ const __m256i s3_11_1 = _mm256_unpackhi_epi16(step2[12], step2[11]);
+ const __m256i s3_10_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_m16);
+ const __m256i s3_10_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_m16);
+ const __m256i s3_11_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_m16);
+ const __m256i s3_11_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_m16);
+ const __m256i s3_12_2 = _mm256_madd_epi16(s3_11_0, k__cospi_p16_p16);
+ const __m256i s3_12_3 = _mm256_madd_epi16(s3_11_1, k__cospi_p16_p16);
+ const __m256i s3_13_2 = _mm256_madd_epi16(s3_10_0, k__cospi_p16_p16);
+ const __m256i s3_13_3 = _mm256_madd_epi16(s3_10_1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m256i s3_10_4 = _mm256_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_10_5 = _mm256_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_4 = _mm256_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_11_5 = _mm256_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_4 = _mm256_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_12_5 = _mm256_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_4 = _mm256_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
+ const __m256i s3_13_5 = _mm256_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
+ lstep3[20] = _mm256_srai_epi32(s3_10_4, DCT_CONST_BITS);
+ lstep3[21] = _mm256_srai_epi32(s3_10_5, DCT_CONST_BITS);
+ lstep3[22] = _mm256_srai_epi32(s3_11_4, DCT_CONST_BITS);
+ lstep3[23] = _mm256_srai_epi32(s3_11_5, DCT_CONST_BITS);
+ lstep3[24] = _mm256_srai_epi32(s3_12_4, DCT_CONST_BITS);
+ lstep3[25] = _mm256_srai_epi32(s3_12_5, DCT_CONST_BITS);
+ lstep3[26] = _mm256_srai_epi32(s3_13_4, DCT_CONST_BITS);
+ lstep3[27] = _mm256_srai_epi32(s3_13_5, DCT_CONST_BITS);
+ }
+ {
+ lstep2[40] = _mm256_unpacklo_epi16(step2[20], kZero);
+ lstep2[41] = _mm256_unpackhi_epi16(step2[20], kZero);
+ lstep2[42] = _mm256_unpacklo_epi16(step2[21], kZero);
+ lstep2[43] = _mm256_unpackhi_epi16(step2[21], kZero);
+ lstep2[44] = _mm256_unpacklo_epi16(step2[22], kZero);
+ lstep2[45] = _mm256_unpackhi_epi16(step2[22], kZero);
+ lstep2[46] = _mm256_unpacklo_epi16(step2[23], kZero);
+ lstep2[47] = _mm256_unpackhi_epi16(step2[23], kZero);
+ lstep2[48] = _mm256_unpacklo_epi16(step2[24], kZero);
+ lstep2[49] = _mm256_unpackhi_epi16(step2[24], kZero);
+ lstep2[50] = _mm256_unpacklo_epi16(step2[25], kZero);
+ lstep2[51] = _mm256_unpackhi_epi16(step2[25], kZero);
+ lstep2[52] = _mm256_unpacklo_epi16(step2[26], kZero);
+ lstep2[53] = _mm256_unpackhi_epi16(step2[26], kZero);
+ lstep2[54] = _mm256_unpacklo_epi16(step2[27], kZero);
+ lstep2[55] = _mm256_unpackhi_epi16(step2[27], kZero);
+ lstep2[40] = _mm256_madd_epi16(lstep2[40], kOne);
+ lstep2[41] = _mm256_madd_epi16(lstep2[41], kOne);
+ lstep2[42] = _mm256_madd_epi16(lstep2[42], kOne);
+ lstep2[43] = _mm256_madd_epi16(lstep2[43], kOne);
+ lstep2[44] = _mm256_madd_epi16(lstep2[44], kOne);
+ lstep2[45] = _mm256_madd_epi16(lstep2[45], kOne);
+ lstep2[46] = _mm256_madd_epi16(lstep2[46], kOne);
+ lstep2[47] = _mm256_madd_epi16(lstep2[47], kOne);
+ lstep2[48] = _mm256_madd_epi16(lstep2[48], kOne);
+ lstep2[49] = _mm256_madd_epi16(lstep2[49], kOne);
+ lstep2[50] = _mm256_madd_epi16(lstep2[50], kOne);
+ lstep2[51] = _mm256_madd_epi16(lstep2[51], kOne);
+ lstep2[52] = _mm256_madd_epi16(lstep2[52], kOne);
+ lstep2[53] = _mm256_madd_epi16(lstep2[53], kOne);
+ lstep2[54] = _mm256_madd_epi16(lstep2[54], kOne);
+ lstep2[55] = _mm256_madd_epi16(lstep2[55], kOne);
+
+ lstep1[32] = _mm256_unpacklo_epi16(step1[16], kZero);
+ lstep1[33] = _mm256_unpackhi_epi16(step1[16], kZero);
+ lstep1[34] = _mm256_unpacklo_epi16(step1[17], kZero);
+ lstep1[35] = _mm256_unpackhi_epi16(step1[17], kZero);
+ lstep1[36] = _mm256_unpacklo_epi16(step1[18], kZero);
+ lstep1[37] = _mm256_unpackhi_epi16(step1[18], kZero);
+ lstep1[38] = _mm256_unpacklo_epi16(step1[19], kZero);
+ lstep1[39] = _mm256_unpackhi_epi16(step1[19], kZero);
+ lstep1[56] = _mm256_unpacklo_epi16(step1[28], kZero);
+ lstep1[57] = _mm256_unpackhi_epi16(step1[28], kZero);
+ lstep1[58] = _mm256_unpacklo_epi16(step1[29], kZero);
+ lstep1[59] = _mm256_unpackhi_epi16(step1[29], kZero);
+ lstep1[60] = _mm256_unpacklo_epi16(step1[30], kZero);
+ lstep1[61] = _mm256_unpackhi_epi16(step1[30], kZero);
+ lstep1[62] = _mm256_unpacklo_epi16(step1[31], kZero);
+ lstep1[63] = _mm256_unpackhi_epi16(step1[31], kZero);
+ lstep1[32] = _mm256_madd_epi16(lstep1[32], kOne);
+ lstep1[33] = _mm256_madd_epi16(lstep1[33], kOne);
+ lstep1[34] = _mm256_madd_epi16(lstep1[34], kOne);
+ lstep1[35] = _mm256_madd_epi16(lstep1[35], kOne);
+ lstep1[36] = _mm256_madd_epi16(lstep1[36], kOne);
+ lstep1[37] = _mm256_madd_epi16(lstep1[37], kOne);
+ lstep1[38] = _mm256_madd_epi16(lstep1[38], kOne);
+ lstep1[39] = _mm256_madd_epi16(lstep1[39], kOne);
+ lstep1[56] = _mm256_madd_epi16(lstep1[56], kOne);
+ lstep1[57] = _mm256_madd_epi16(lstep1[57], kOne);
+ lstep1[58] = _mm256_madd_epi16(lstep1[58], kOne);
+ lstep1[59] = _mm256_madd_epi16(lstep1[59], kOne);
+ lstep1[60] = _mm256_madd_epi16(lstep1[60], kOne);
+ lstep1[61] = _mm256_madd_epi16(lstep1[61], kOne);
+ lstep1[62] = _mm256_madd_epi16(lstep1[62], kOne);
+ lstep1[63] = _mm256_madd_epi16(lstep1[63], kOne);
+
+ lstep3[32] = _mm256_add_epi32(lstep2[46], lstep1[32]);
+ lstep3[33] = _mm256_add_epi32(lstep2[47], lstep1[33]);
+
+ lstep3[34] = _mm256_add_epi32(lstep2[44], lstep1[34]);
+ lstep3[35] = _mm256_add_epi32(lstep2[45], lstep1[35]);
+ lstep3[36] = _mm256_add_epi32(lstep2[42], lstep1[36]);
+ lstep3[37] = _mm256_add_epi32(lstep2[43], lstep1[37]);
+ lstep3[38] = _mm256_add_epi32(lstep2[40], lstep1[38]);
+ lstep3[39] = _mm256_add_epi32(lstep2[41], lstep1[39]);
+ lstep3[40] = _mm256_sub_epi32(lstep1[38], lstep2[40]);
+ lstep3[41] = _mm256_sub_epi32(lstep1[39], lstep2[41]);
+ lstep3[42] = _mm256_sub_epi32(lstep1[36], lstep2[42]);
+ lstep3[43] = _mm256_sub_epi32(lstep1[37], lstep2[43]);
+ lstep3[44] = _mm256_sub_epi32(lstep1[34], lstep2[44]);
+ lstep3[45] = _mm256_sub_epi32(lstep1[35], lstep2[45]);
+ lstep3[46] = _mm256_sub_epi32(lstep1[32], lstep2[46]);
+ lstep3[47] = _mm256_sub_epi32(lstep1[33], lstep2[47]);
+ lstep3[48] = _mm256_sub_epi32(lstep1[62], lstep2[48]);
+ lstep3[49] = _mm256_sub_epi32(lstep1[63], lstep2[49]);
+ lstep3[50] = _mm256_sub_epi32(lstep1[60], lstep2[50]);
+ lstep3[51] = _mm256_sub_epi32(lstep1[61], lstep2[51]);
+ lstep3[52] = _mm256_sub_epi32(lstep1[58], lstep2[52]);
+ lstep3[53] = _mm256_sub_epi32(lstep1[59], lstep2[53]);
+ lstep3[54] = _mm256_sub_epi32(lstep1[56], lstep2[54]);
+ lstep3[55] = _mm256_sub_epi32(lstep1[57], lstep2[55]);
+ lstep3[56] = _mm256_add_epi32(lstep2[54], lstep1[56]);
+ lstep3[57] = _mm256_add_epi32(lstep2[55], lstep1[57]);
+ lstep3[58] = _mm256_add_epi32(lstep2[52], lstep1[58]);
+ lstep3[59] = _mm256_add_epi32(lstep2[53], lstep1[59]);
+ lstep3[60] = _mm256_add_epi32(lstep2[50], lstep1[60]);
+ lstep3[61] = _mm256_add_epi32(lstep2[51], lstep1[61]);
+ lstep3[62] = _mm256_add_epi32(lstep2[48], lstep1[62]);
+ lstep3[63] = _mm256_add_epi32(lstep2[49], lstep1[63]);
+ }
+
+ // stage 4
+ {
+      // expanding to 32-bit length prior to addition operations
+ lstep2[16] = _mm256_unpacklo_epi16(step2[ 8], kZero);
+ lstep2[17] = _mm256_unpackhi_epi16(step2[ 8], kZero);
+ lstep2[18] = _mm256_unpacklo_epi16(step2[ 9], kZero);
+ lstep2[19] = _mm256_unpackhi_epi16(step2[ 9], kZero);
+ lstep2[28] = _mm256_unpacklo_epi16(step2[14], kZero);
+ lstep2[29] = _mm256_unpackhi_epi16(step2[14], kZero);
+ lstep2[30] = _mm256_unpacklo_epi16(step2[15], kZero);
+ lstep2[31] = _mm256_unpackhi_epi16(step2[15], kZero);
+ lstep2[16] = _mm256_madd_epi16(lstep2[16], kOne);
+ lstep2[17] = _mm256_madd_epi16(lstep2[17], kOne);
+ lstep2[18] = _mm256_madd_epi16(lstep2[18], kOne);
+ lstep2[19] = _mm256_madd_epi16(lstep2[19], kOne);
+ lstep2[28] = _mm256_madd_epi16(lstep2[28], kOne);
+ lstep2[29] = _mm256_madd_epi16(lstep2[29], kOne);
+ lstep2[30] = _mm256_madd_epi16(lstep2[30], kOne);
+ lstep2[31] = _mm256_madd_epi16(lstep2[31], kOne);
+
+ lstep1[ 0] = _mm256_add_epi32(lstep3[ 6], lstep3[ 0]);
+ lstep1[ 1] = _mm256_add_epi32(lstep3[ 7], lstep3[ 1]);
+ lstep1[ 2] = _mm256_add_epi32(lstep3[ 4], lstep3[ 2]);
+ lstep1[ 3] = _mm256_add_epi32(lstep3[ 5], lstep3[ 3]);
+ lstep1[ 4] = _mm256_sub_epi32(lstep3[ 2], lstep3[ 4]);
+ lstep1[ 5] = _mm256_sub_epi32(lstep3[ 3], lstep3[ 5]);
+ lstep1[ 6] = _mm256_sub_epi32(lstep3[ 0], lstep3[ 6]);
+ lstep1[ 7] = _mm256_sub_epi32(lstep3[ 1], lstep3[ 7]);
+ lstep1[16] = _mm256_add_epi32(lstep3[22], lstep2[16]);
+ lstep1[17] = _mm256_add_epi32(lstep3[23], lstep2[17]);
+ lstep1[18] = _mm256_add_epi32(lstep3[20], lstep2[18]);
+ lstep1[19] = _mm256_add_epi32(lstep3[21], lstep2[19]);
+ lstep1[20] = _mm256_sub_epi32(lstep2[18], lstep3[20]);
+ lstep1[21] = _mm256_sub_epi32(lstep2[19], lstep3[21]);
+ lstep1[22] = _mm256_sub_epi32(lstep2[16], lstep3[22]);
+ lstep1[23] = _mm256_sub_epi32(lstep2[17], lstep3[23]);
+ lstep1[24] = _mm256_sub_epi32(lstep2[30], lstep3[24]);
+ lstep1[25] = _mm256_sub_epi32(lstep2[31], lstep3[25]);
+ lstep1[26] = _mm256_sub_epi32(lstep2[28], lstep3[26]);
+ lstep1[27] = _mm256_sub_epi32(lstep2[29], lstep3[27]);
+ lstep1[28] = _mm256_add_epi32(lstep3[26], lstep2[28]);
+ lstep1[29] = _mm256_add_epi32(lstep3[27], lstep2[29]);
+ lstep1[30] = _mm256_add_epi32(lstep3[24], lstep2[30]);
+ lstep1[31] = _mm256_add_epi32(lstep3[25], lstep2[31]);
+ }
+ {
+      // Butterfly: lstep1[10..11] = (lstep3[12..13] - lstep3[10..11]) * cospi_16_64
+      //            lstep1[12..13] = (lstep3[12..13] + lstep3[10..11]) * cospi_16_64
+ const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep3[12], lstep3[10]);
+ u[1] = _mm256_unpackhi_epi32(lstep3[12], lstep3[10]);
+ u[2] = _mm256_unpacklo_epi32(lstep3[13], lstep3[11]);
+ u[3] = _mm256_unpackhi_epi32(lstep3[13], lstep3[11]);
+
+      // TODO(jingning): manually inline k_madd_epi32_avx2() to further hide
+      // instruction latency.
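+      // k_madd_epi32_avx2()/k_packs_epi64_avx2() mirror the SSE2 k_madd_epi32()/
+      // k_packs_epi64() helpers: each interleaved 32-bit pair is dot-multiplied
+      // with the cosine constants in 64-bit precision, then narrowed back to
+      // packed 32-bit lanes.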
+ v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+
+ lstep1[10] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ lstep1[11] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ lstep1[12] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ lstep1[13] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ }
+ {
+ const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[ 0] = _mm256_unpacklo_epi32(lstep3[36], lstep3[58]);
+ u[ 1] = _mm256_unpackhi_epi32(lstep3[36], lstep3[58]);
+ u[ 2] = _mm256_unpacklo_epi32(lstep3[37], lstep3[59]);
+ u[ 3] = _mm256_unpackhi_epi32(lstep3[37], lstep3[59]);
+ u[ 4] = _mm256_unpacklo_epi32(lstep3[38], lstep3[56]);
+ u[ 5] = _mm256_unpackhi_epi32(lstep3[38], lstep3[56]);
+ u[ 6] = _mm256_unpacklo_epi32(lstep3[39], lstep3[57]);
+ u[ 7] = _mm256_unpackhi_epi32(lstep3[39], lstep3[57]);
+ u[ 8] = _mm256_unpacklo_epi32(lstep3[40], lstep3[54]);
+ u[ 9] = _mm256_unpackhi_epi32(lstep3[40], lstep3[54]);
+ u[10] = _mm256_unpacklo_epi32(lstep3[41], lstep3[55]);
+ u[11] = _mm256_unpackhi_epi32(lstep3[41], lstep3[55]);
+ u[12] = _mm256_unpacklo_epi32(lstep3[42], lstep3[52]);
+ u[13] = _mm256_unpackhi_epi32(lstep3[42], lstep3[52]);
+ u[14] = _mm256_unpacklo_epi32(lstep3[43], lstep3[53]);
+ u[15] = _mm256_unpackhi_epi32(lstep3[43], lstep3[53]);
+
+ v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m08_p24);
+ v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m08_p24);
+ v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m08_p24);
+ v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m08_p24);
+ v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m08_p24);
+ v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m08_p24);
+ v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m08_p24);
+ v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m08_p24);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m24_m08);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m24_m08);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m24_m08);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m24_m08);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m24_m08);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m24_m08);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m24_m08);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m24_m08);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m08_p24);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m08_p24);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m08_p24);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m08_p24);
+ v[20] = k_madd_epi32_avx2(u[ 8], k32_m08_p24);
+ v[21] = k_madd_epi32_avx2(u[ 9], k32_m08_p24);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m08_p24);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m08_p24);
+ v[24] = k_madd_epi32_avx2(u[ 4], k32_p24_p08);
+ v[25] = k_madd_epi32_avx2(u[ 5], k32_p24_p08);
+ v[26] = k_madd_epi32_avx2(u[ 6], k32_p24_p08);
+ v[27] = k_madd_epi32_avx2(u[ 7], k32_p24_p08);
+ v[28] = k_madd_epi32_avx2(u[ 0], k32_p24_p08);
+ v[29] = k_madd_epi32_avx2(u[ 1], k32_p24_p08);
+ v[30] = k_madd_epi32_avx2(u[ 2], k32_p24_p08);
+ v[31] = k_madd_epi32_avx2(u[ 3], k32_p24_p08);
+
+ u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep1[36] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+ lstep1[37] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+ lstep1[38] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+ lstep1[39] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+ lstep1[40] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+ lstep1[41] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+ lstep1[42] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+ lstep1[43] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+ lstep1[52] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+ lstep1[53] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep1[54] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep1[55] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep1[56] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep1[57] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep1[58] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep1[59] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 5
+ {
+ lstep2[ 8] = _mm256_add_epi32(lstep1[10], lstep3[ 8]);
+ lstep2[ 9] = _mm256_add_epi32(lstep1[11], lstep3[ 9]);
+ lstep2[10] = _mm256_sub_epi32(lstep3[ 8], lstep1[10]);
+ lstep2[11] = _mm256_sub_epi32(lstep3[ 9], lstep1[11]);
+ lstep2[12] = _mm256_sub_epi32(lstep3[14], lstep1[12]);
+ lstep2[13] = _mm256_sub_epi32(lstep3[15], lstep1[13]);
+ lstep2[14] = _mm256_add_epi32(lstep1[12], lstep3[14]);
+ lstep2[15] = _mm256_add_epi32(lstep1[13], lstep3[15]);
+ }
+ {
+ const __m256i k32_p16_p16 = pair256_set_epi32(cospi_16_64, cospi_16_64);
+ const __m256i k32_p16_m16 = pair256_set_epi32(cospi_16_64, -cospi_16_64);
+ const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+ const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[0], lstep1[2]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[0], lstep1[2]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[1], lstep1[3]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[1], lstep1[3]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[4], lstep1[6]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[4], lstep1[6]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[5], lstep1[7]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[5], lstep1[7]);
+
+      // TODO(jingning): manually inline k_madd_epi32_avx2() to further hide
+      // instruction latency.
+ v[ 0] = k_madd_epi32_avx2(u[0], k32_p16_p16);
+ v[ 1] = k_madd_epi32_avx2(u[1], k32_p16_p16);
+ v[ 2] = k_madd_epi32_avx2(u[2], k32_p16_p16);
+ v[ 3] = k_madd_epi32_avx2(u[3], k32_p16_p16);
+ v[ 4] = k_madd_epi32_avx2(u[0], k32_p16_m16);
+ v[ 5] = k_madd_epi32_avx2(u[1], k32_p16_m16);
+ v[ 6] = k_madd_epi32_avx2(u[2], k32_p16_m16);
+ v[ 7] = k_madd_epi32_avx2(u[3], k32_p16_m16);
+ v[ 8] = k_madd_epi32_avx2(u[4], k32_p24_p08);
+ v[ 9] = k_madd_epi32_avx2(u[5], k32_p24_p08);
+ v[10] = k_madd_epi32_avx2(u[6], k32_p24_p08);
+ v[11] = k_madd_epi32_avx2(u[7], k32_p24_p08);
+ v[12] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[13] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[14] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[15] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
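+      // Final rounding for this pass: u = (u + 1 + (u < 0)) >> 2. The compare
+      // below yields -1 for negative lanes, so subtracting it supplies the
+      // extra +1 before the shift.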
+      sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+      sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+      sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+      sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+      sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+      sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+      sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+      sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+
+ u[0] = _mm256_sub_epi32(u[0], sign[0]);
+ u[1] = _mm256_sub_epi32(u[1], sign[1]);
+ u[2] = _mm256_sub_epi32(u[2], sign[2]);
+ u[3] = _mm256_sub_epi32(u[3], sign[3]);
+ u[4] = _mm256_sub_epi32(u[4], sign[4]);
+ u[5] = _mm256_sub_epi32(u[5], sign[5]);
+ u[6] = _mm256_sub_epi32(u[6], sign[6]);
+ u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm256_add_epi32(u[0], K32One);
+ u[1] = _mm256_add_epi32(u[1], K32One);
+ u[2] = _mm256_add_epi32(u[2], K32One);
+ u[3] = _mm256_add_epi32(u[3], K32One);
+ u[4] = _mm256_add_epi32(u[4], K32One);
+ u[5] = _mm256_add_epi32(u[5], K32One);
+ u[6] = _mm256_add_epi32(u[6], K32One);
+ u[7] = _mm256_add_epi32(u[7], K32One);
+
+ u[0] = _mm256_srai_epi32(u[0], 2);
+ u[1] = _mm256_srai_epi32(u[1], 2);
+ u[2] = _mm256_srai_epi32(u[2], 2);
+ u[3] = _mm256_srai_epi32(u[3], 2);
+ u[4] = _mm256_srai_epi32(u[4], 2);
+ u[5] = _mm256_srai_epi32(u[5], 2);
+ u[6] = _mm256_srai_epi32(u[6], 2);
+ u[7] = _mm256_srai_epi32(u[7], 2);
+
+ // Combine
+ out[ 0] = _mm256_packs_epi32(u[0], u[1]);
+ out[16] = _mm256_packs_epi32(u[2], u[3]);
+ out[ 8] = _mm256_packs_epi32(u[4], u[5]);
+ out[24] = _mm256_packs_epi32(u[6], u[7]);
+ }
+ {
+ const __m256i k32_m08_p24 = pair256_set_epi32(-cospi_8_64, cospi_24_64);
+ const __m256i k32_m24_m08 = pair256_set_epi32(-cospi_24_64, -cospi_8_64);
+ const __m256i k32_p24_p08 = pair256_set_epi32(cospi_24_64, cospi_8_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep1[18], lstep1[28]);
+ u[1] = _mm256_unpackhi_epi32(lstep1[18], lstep1[28]);
+ u[2] = _mm256_unpacklo_epi32(lstep1[19], lstep1[29]);
+ u[3] = _mm256_unpackhi_epi32(lstep1[19], lstep1[29]);
+ u[4] = _mm256_unpacklo_epi32(lstep1[20], lstep1[26]);
+ u[5] = _mm256_unpackhi_epi32(lstep1[20], lstep1[26]);
+ u[6] = _mm256_unpacklo_epi32(lstep1[21], lstep1[27]);
+ u[7] = _mm256_unpackhi_epi32(lstep1[21], lstep1[27]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_m08_p24);
+ v[1] = k_madd_epi32_avx2(u[1], k32_m08_p24);
+ v[2] = k_madd_epi32_avx2(u[2], k32_m08_p24);
+ v[3] = k_madd_epi32_avx2(u[3], k32_m08_p24);
+ v[4] = k_madd_epi32_avx2(u[4], k32_m24_m08);
+ v[5] = k_madd_epi32_avx2(u[5], k32_m24_m08);
+ v[6] = k_madd_epi32_avx2(u[6], k32_m24_m08);
+ v[7] = k_madd_epi32_avx2(u[7], k32_m24_m08);
+ v[ 8] = k_madd_epi32_avx2(u[4], k32_m08_p24);
+ v[ 9] = k_madd_epi32_avx2(u[5], k32_m08_p24);
+ v[10] = k_madd_epi32_avx2(u[6], k32_m08_p24);
+ v[11] = k_madd_epi32_avx2(u[7], k32_m08_p24);
+ v[12] = k_madd_epi32_avx2(u[0], k32_p24_p08);
+ v[13] = k_madd_epi32_avx2(u[1], k32_p24_p08);
+ v[14] = k_madd_epi32_avx2(u[2], k32_p24_p08);
+ v[15] = k_madd_epi32_avx2(u[3], k32_p24_p08);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ u[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ lstep2[18] = _mm256_srai_epi32(u[0], DCT_CONST_BITS);
+ lstep2[19] = _mm256_srai_epi32(u[1], DCT_CONST_BITS);
+ lstep2[20] = _mm256_srai_epi32(u[2], DCT_CONST_BITS);
+ lstep2[21] = _mm256_srai_epi32(u[3], DCT_CONST_BITS);
+ lstep2[26] = _mm256_srai_epi32(u[4], DCT_CONST_BITS);
+ lstep2[27] = _mm256_srai_epi32(u[5], DCT_CONST_BITS);
+ lstep2[28] = _mm256_srai_epi32(u[6], DCT_CONST_BITS);
+ lstep2[29] = _mm256_srai_epi32(u[7], DCT_CONST_BITS);
+ }
+ {
+ lstep2[32] = _mm256_add_epi32(lstep1[38], lstep3[32]);
+ lstep2[33] = _mm256_add_epi32(lstep1[39], lstep3[33]);
+ lstep2[34] = _mm256_add_epi32(lstep1[36], lstep3[34]);
+ lstep2[35] = _mm256_add_epi32(lstep1[37], lstep3[35]);
+ lstep2[36] = _mm256_sub_epi32(lstep3[34], lstep1[36]);
+ lstep2[37] = _mm256_sub_epi32(lstep3[35], lstep1[37]);
+ lstep2[38] = _mm256_sub_epi32(lstep3[32], lstep1[38]);
+ lstep2[39] = _mm256_sub_epi32(lstep3[33], lstep1[39]);
+ lstep2[40] = _mm256_sub_epi32(lstep3[46], lstep1[40]);
+ lstep2[41] = _mm256_sub_epi32(lstep3[47], lstep1[41]);
+ lstep2[42] = _mm256_sub_epi32(lstep3[44], lstep1[42]);
+ lstep2[43] = _mm256_sub_epi32(lstep3[45], lstep1[43]);
+ lstep2[44] = _mm256_add_epi32(lstep1[42], lstep3[44]);
+ lstep2[45] = _mm256_add_epi32(lstep1[43], lstep3[45]);
+ lstep2[46] = _mm256_add_epi32(lstep1[40], lstep3[46]);
+ lstep2[47] = _mm256_add_epi32(lstep1[41], lstep3[47]);
+ lstep2[48] = _mm256_add_epi32(lstep1[54], lstep3[48]);
+ lstep2[49] = _mm256_add_epi32(lstep1[55], lstep3[49]);
+ lstep2[50] = _mm256_add_epi32(lstep1[52], lstep3[50]);
+ lstep2[51] = _mm256_add_epi32(lstep1[53], lstep3[51]);
+ lstep2[52] = _mm256_sub_epi32(lstep3[50], lstep1[52]);
+ lstep2[53] = _mm256_sub_epi32(lstep3[51], lstep1[53]);
+ lstep2[54] = _mm256_sub_epi32(lstep3[48], lstep1[54]);
+ lstep2[55] = _mm256_sub_epi32(lstep3[49], lstep1[55]);
+ lstep2[56] = _mm256_sub_epi32(lstep3[62], lstep1[56]);
+ lstep2[57] = _mm256_sub_epi32(lstep3[63], lstep1[57]);
+ lstep2[58] = _mm256_sub_epi32(lstep3[60], lstep1[58]);
+ lstep2[59] = _mm256_sub_epi32(lstep3[61], lstep1[59]);
+ lstep2[60] = _mm256_add_epi32(lstep1[58], lstep3[60]);
+ lstep2[61] = _mm256_add_epi32(lstep1[59], lstep3[61]);
+ lstep2[62] = _mm256_add_epi32(lstep1[56], lstep3[62]);
+ lstep2[63] = _mm256_add_epi32(lstep1[57], lstep3[63]);
+ }
+ // stage 6
+ {
+ const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+ const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+
+ u[0] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+ u[1] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+ u[2] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+ u[3] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+ u[4] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[5] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[6] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[7] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[8] = _mm256_unpacklo_epi32(lstep2[10], lstep2[12]);
+ u[9] = _mm256_unpackhi_epi32(lstep2[10], lstep2[12]);
+ u[10] = _mm256_unpacklo_epi32(lstep2[11], lstep2[13]);
+ u[11] = _mm256_unpackhi_epi32(lstep2[11], lstep2[13]);
+ u[12] = _mm256_unpacklo_epi32(lstep2[ 8], lstep2[14]);
+ u[13] = _mm256_unpackhi_epi32(lstep2[ 8], lstep2[14]);
+ u[14] = _mm256_unpacklo_epi32(lstep2[ 9], lstep2[15]);
+ u[15] = _mm256_unpackhi_epi32(lstep2[ 9], lstep2[15]);
+
+ v[0] = k_madd_epi32_avx2(u[0], k32_p28_p04);
+ v[1] = k_madd_epi32_avx2(u[1], k32_p28_p04);
+ v[2] = k_madd_epi32_avx2(u[2], k32_p28_p04);
+ v[3] = k_madd_epi32_avx2(u[3], k32_p28_p04);
+ v[4] = k_madd_epi32_avx2(u[4], k32_p12_p20);
+ v[5] = k_madd_epi32_avx2(u[5], k32_p12_p20);
+ v[6] = k_madd_epi32_avx2(u[6], k32_p12_p20);
+ v[7] = k_madd_epi32_avx2(u[7], k32_p12_p20);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m04_p28);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m04_p28);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m04_p28);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m04_p28);
+
+ u[0] = k_packs_epi64_avx2(v[0], v[1]);
+ u[1] = k_packs_epi64_avx2(v[2], v[3]);
+ u[2] = k_packs_epi64_avx2(v[4], v[5]);
+ u[3] = k_packs_epi64_avx2(v[6], v[7]);
+ u[4] = k_packs_epi64_avx2(v[8], v[9]);
+ u[5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[7] = k_packs_epi64_avx2(v[14], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm256_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm256_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm256_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm256_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm256_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm256_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm256_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm256_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm256_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm256_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm256_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm256_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm256_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm256_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm256_srai_epi32(v[7], DCT_CONST_BITS);
+
+      sign[0] = _mm256_cmpgt_epi32(kZero, u[0]);
+      sign[1] = _mm256_cmpgt_epi32(kZero, u[1]);
+      sign[2] = _mm256_cmpgt_epi32(kZero, u[2]);
+      sign[3] = _mm256_cmpgt_epi32(kZero, u[3]);
+      sign[4] = _mm256_cmpgt_epi32(kZero, u[4]);
+      sign[5] = _mm256_cmpgt_epi32(kZero, u[5]);
+      sign[6] = _mm256_cmpgt_epi32(kZero, u[6]);
+      sign[7] = _mm256_cmpgt_epi32(kZero, u[7]);
+
+ u[0] = _mm256_sub_epi32(u[0], sign[0]);
+ u[1] = _mm256_sub_epi32(u[1], sign[1]);
+ u[2] = _mm256_sub_epi32(u[2], sign[2]);
+ u[3] = _mm256_sub_epi32(u[3], sign[3]);
+ u[4] = _mm256_sub_epi32(u[4], sign[4]);
+ u[5] = _mm256_sub_epi32(u[5], sign[5]);
+ u[6] = _mm256_sub_epi32(u[6], sign[6]);
+ u[7] = _mm256_sub_epi32(u[7], sign[7]);
+
+ u[0] = _mm256_add_epi32(u[0], K32One);
+ u[1] = _mm256_add_epi32(u[1], K32One);
+ u[2] = _mm256_add_epi32(u[2], K32One);
+ u[3] = _mm256_add_epi32(u[3], K32One);
+ u[4] = _mm256_add_epi32(u[4], K32One);
+ u[5] = _mm256_add_epi32(u[5], K32One);
+ u[6] = _mm256_add_epi32(u[6], K32One);
+ u[7] = _mm256_add_epi32(u[7], K32One);
+
+ u[0] = _mm256_srai_epi32(u[0], 2);
+ u[1] = _mm256_srai_epi32(u[1], 2);
+ u[2] = _mm256_srai_epi32(u[2], 2);
+ u[3] = _mm256_srai_epi32(u[3], 2);
+ u[4] = _mm256_srai_epi32(u[4], 2);
+ u[5] = _mm256_srai_epi32(u[5], 2);
+ u[6] = _mm256_srai_epi32(u[6], 2);
+ u[7] = _mm256_srai_epi32(u[7], 2);
+
+ out[ 4] = _mm256_packs_epi32(u[0], u[1]);
+ out[20] = _mm256_packs_epi32(u[2], u[3]);
+ out[12] = _mm256_packs_epi32(u[4], u[5]);
+ out[28] = _mm256_packs_epi32(u[6], u[7]);
+ }
+ {
+ lstep3[16] = _mm256_add_epi32(lstep2[18], lstep1[16]);
+ lstep3[17] = _mm256_add_epi32(lstep2[19], lstep1[17]);
+ lstep3[18] = _mm256_sub_epi32(lstep1[16], lstep2[18]);
+ lstep3[19] = _mm256_sub_epi32(lstep1[17], lstep2[19]);
+ lstep3[20] = _mm256_sub_epi32(lstep1[22], lstep2[20]);
+ lstep3[21] = _mm256_sub_epi32(lstep1[23], lstep2[21]);
+ lstep3[22] = _mm256_add_epi32(lstep2[20], lstep1[22]);
+ lstep3[23] = _mm256_add_epi32(lstep2[21], lstep1[23]);
+ lstep3[24] = _mm256_add_epi32(lstep2[26], lstep1[24]);
+ lstep3[25] = _mm256_add_epi32(lstep2[27], lstep1[25]);
+ lstep3[26] = _mm256_sub_epi32(lstep1[24], lstep2[26]);
+ lstep3[27] = _mm256_sub_epi32(lstep1[25], lstep2[27]);
+ lstep3[28] = _mm256_sub_epi32(lstep1[30], lstep2[28]);
+ lstep3[29] = _mm256_sub_epi32(lstep1[31], lstep2[29]);
+ lstep3[30] = _mm256_add_epi32(lstep2[28], lstep1[30]);
+ lstep3[31] = _mm256_add_epi32(lstep2[29], lstep1[31]);
+ }
+ {
+ const __m256i k32_m04_p28 = pair256_set_epi32(-cospi_4_64, cospi_28_64);
+ const __m256i k32_m28_m04 = pair256_set_epi32(-cospi_28_64, -cospi_4_64);
+ const __m256i k32_m20_p12 = pair256_set_epi32(-cospi_20_64, cospi_12_64);
+ const __m256i k32_m12_m20 = pair256_set_epi32(-cospi_12_64,
+ -cospi_20_64);
+ const __m256i k32_p12_p20 = pair256_set_epi32(cospi_12_64, cospi_20_64);
+ const __m256i k32_p28_p04 = pair256_set_epi32(cospi_28_64, cospi_4_64);
+
+ u[ 0] = _mm256_unpacklo_epi32(lstep2[34], lstep2[60]);
+ u[ 1] = _mm256_unpackhi_epi32(lstep2[34], lstep2[60]);
+ u[ 2] = _mm256_unpacklo_epi32(lstep2[35], lstep2[61]);
+ u[ 3] = _mm256_unpackhi_epi32(lstep2[35], lstep2[61]);
+ u[ 4] = _mm256_unpacklo_epi32(lstep2[36], lstep2[58]);
+ u[ 5] = _mm256_unpackhi_epi32(lstep2[36], lstep2[58]);
+ u[ 6] = _mm256_unpacklo_epi32(lstep2[37], lstep2[59]);
+ u[ 7] = _mm256_unpackhi_epi32(lstep2[37], lstep2[59]);
+ u[ 8] = _mm256_unpacklo_epi32(lstep2[42], lstep2[52]);
+ u[ 9] = _mm256_unpackhi_epi32(lstep2[42], lstep2[52]);
+ u[10] = _mm256_unpacklo_epi32(lstep2[43], lstep2[53]);
+ u[11] = _mm256_unpackhi_epi32(lstep2[43], lstep2[53]);
+ u[12] = _mm256_unpacklo_epi32(lstep2[44], lstep2[50]);
+ u[13] = _mm256_unpackhi_epi32(lstep2[44], lstep2[50]);
+ u[14] = _mm256_unpacklo_epi32(lstep2[45], lstep2[51]);
+ u[15] = _mm256_unpackhi_epi32(lstep2[45], lstep2[51]);
+
+ v[ 0] = k_madd_epi32_avx2(u[ 0], k32_m04_p28);
+ v[ 1] = k_madd_epi32_avx2(u[ 1], k32_m04_p28);
+ v[ 2] = k_madd_epi32_avx2(u[ 2], k32_m04_p28);
+ v[ 3] = k_madd_epi32_avx2(u[ 3], k32_m04_p28);
+ v[ 4] = k_madd_epi32_avx2(u[ 4], k32_m28_m04);
+ v[ 5] = k_madd_epi32_avx2(u[ 5], k32_m28_m04);
+ v[ 6] = k_madd_epi32_avx2(u[ 6], k32_m28_m04);
+ v[ 7] = k_madd_epi32_avx2(u[ 7], k32_m28_m04);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_m20_p12);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_m20_p12);
+ v[10] = k_madd_epi32_avx2(u[10], k32_m20_p12);
+ v[11] = k_madd_epi32_avx2(u[11], k32_m20_p12);
+ v[12] = k_madd_epi32_avx2(u[12], k32_m12_m20);
+ v[13] = k_madd_epi32_avx2(u[13], k32_m12_m20);
+ v[14] = k_madd_epi32_avx2(u[14], k32_m12_m20);
+ v[15] = k_madd_epi32_avx2(u[15], k32_m12_m20);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m20_p12);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m20_p12);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m20_p12);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m20_p12);
+ v[20] = k_madd_epi32_avx2(u[ 8], k32_p12_p20);
+ v[21] = k_madd_epi32_avx2(u[ 9], k32_p12_p20);
+ v[22] = k_madd_epi32_avx2(u[10], k32_p12_p20);
+ v[23] = k_madd_epi32_avx2(u[11], k32_p12_p20);
+ v[24] = k_madd_epi32_avx2(u[ 4], k32_m04_p28);
+ v[25] = k_madd_epi32_avx2(u[ 5], k32_m04_p28);
+ v[26] = k_madd_epi32_avx2(u[ 6], k32_m04_p28);
+ v[27] = k_madd_epi32_avx2(u[ 7], k32_m04_p28);
+ v[28] = k_madd_epi32_avx2(u[ 0], k32_p28_p04);
+ v[29] = k_madd_epi32_avx2(u[ 1], k32_p28_p04);
+ v[30] = k_madd_epi32_avx2(u[ 2], k32_p28_p04);
+ v[31] = k_madd_epi32_avx2(u[ 3], k32_p28_p04);
+
+ u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ lstep3[34] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+ lstep3[35] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+ lstep3[36] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+ lstep3[37] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+ lstep3[42] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+ lstep3[43] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+ lstep3[44] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+ lstep3[45] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+ lstep3[50] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+ lstep3[51] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ lstep3[52] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ lstep3[53] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ lstep3[58] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ lstep3[59] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ lstep3[60] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ lstep3[61] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+ }
+ // stage 7
+ {
+ const __m256i k32_p30_p02 = pair256_set_epi32(cospi_30_64, cospi_2_64);
+ const __m256i k32_p14_p18 = pair256_set_epi32(cospi_14_64, cospi_18_64);
+ const __m256i k32_p22_p10 = pair256_set_epi32(cospi_22_64, cospi_10_64);
+ const __m256i k32_p06_p26 = pair256_set_epi32(cospi_6_64, cospi_26_64);
+ const __m256i k32_m26_p06 = pair256_set_epi32(-cospi_26_64, cospi_6_64);
+ const __m256i k32_m10_p22 = pair256_set_epi32(-cospi_10_64, cospi_22_64);
+ const __m256i k32_m18_p14 = pair256_set_epi32(-cospi_18_64, cospi_14_64);
+ const __m256i k32_m02_p30 = pair256_set_epi32(-cospi_2_64, cospi_30_64);
+
+ u[ 0] = _mm256_unpacklo_epi32(lstep3[16], lstep3[30]);
+ u[ 1] = _mm256_unpackhi_epi32(lstep3[16], lstep3[30]);
+ u[ 2] = _mm256_unpacklo_epi32(lstep3[17], lstep3[31]);
+ u[ 3] = _mm256_unpackhi_epi32(lstep3[17], lstep3[31]);
+ u[ 4] = _mm256_unpacklo_epi32(lstep3[18], lstep3[28]);
+ u[ 5] = _mm256_unpackhi_epi32(lstep3[18], lstep3[28]);
+ u[ 6] = _mm256_unpacklo_epi32(lstep3[19], lstep3[29]);
+ u[ 7] = _mm256_unpackhi_epi32(lstep3[19], lstep3[29]);
+ u[ 8] = _mm256_unpacklo_epi32(lstep3[20], lstep3[26]);
+ u[ 9] = _mm256_unpackhi_epi32(lstep3[20], lstep3[26]);
+ u[10] = _mm256_unpacklo_epi32(lstep3[21], lstep3[27]);
+ u[11] = _mm256_unpackhi_epi32(lstep3[21], lstep3[27]);
+ u[12] = _mm256_unpacklo_epi32(lstep3[22], lstep3[24]);
+ u[13] = _mm256_unpackhi_epi32(lstep3[22], lstep3[24]);
+ u[14] = _mm256_unpacklo_epi32(lstep3[23], lstep3[25]);
+ u[15] = _mm256_unpackhi_epi32(lstep3[23], lstep3[25]);
+
+ v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p30_p02);
+ v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p30_p02);
+ v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p30_p02);
+ v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p30_p02);
+ v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p14_p18);
+ v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p14_p18);
+ v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p14_p18);
+ v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p14_p18);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p22_p10);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p22_p10);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p22_p10);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p22_p10);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p06_p26);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p06_p26);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p06_p26);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p06_p26);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m26_p06);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m26_p06);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m26_p06);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m26_p06);
+ v[20] = k_madd_epi32_avx2(u[ 8], k32_m10_p22);
+ v[21] = k_madd_epi32_avx2(u[ 9], k32_m10_p22);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m10_p22);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m10_p22);
+ v[24] = k_madd_epi32_avx2(u[ 4], k32_m18_p14);
+ v[25] = k_madd_epi32_avx2(u[ 5], k32_m18_p14);
+ v[26] = k_madd_epi32_avx2(u[ 6], k32_m18_p14);
+ v[27] = k_madd_epi32_avx2(u[ 7], k32_m18_p14);
+ v[28] = k_madd_epi32_avx2(u[ 0], k32_m02_p30);
+ v[29] = k_madd_epi32_avx2(u[ 1], k32_m02_p30);
+ v[30] = k_madd_epi32_avx2(u[ 2], k32_m02_p30);
+ v[31] = k_madd_epi32_avx2(u[ 3], k32_m02_p30);
+
+ u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+      v[ 0] = _mm256_cmpgt_epi32(kZero, u[ 0]);
+      v[ 1] = _mm256_cmpgt_epi32(kZero, u[ 1]);
+      v[ 2] = _mm256_cmpgt_epi32(kZero, u[ 2]);
+      v[ 3] = _mm256_cmpgt_epi32(kZero, u[ 3]);
+      v[ 4] = _mm256_cmpgt_epi32(kZero, u[ 4]);
+      v[ 5] = _mm256_cmpgt_epi32(kZero, u[ 5]);
+      v[ 6] = _mm256_cmpgt_epi32(kZero, u[ 6]);
+      v[ 7] = _mm256_cmpgt_epi32(kZero, u[ 7]);
+      v[ 8] = _mm256_cmpgt_epi32(kZero, u[ 8]);
+      v[ 9] = _mm256_cmpgt_epi32(kZero, u[ 9]);
+      v[10] = _mm256_cmpgt_epi32(kZero, u[10]);
+      v[11] = _mm256_cmpgt_epi32(kZero, u[11]);
+      v[12] = _mm256_cmpgt_epi32(kZero, u[12]);
+      v[13] = _mm256_cmpgt_epi32(kZero, u[13]);
+      v[14] = _mm256_cmpgt_epi32(kZero, u[14]);
+      v[15] = _mm256_cmpgt_epi32(kZero, u[15]);
+
+ u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[ 0] = _mm256_add_epi32(u[ 0], K32One);
+ v[ 1] = _mm256_add_epi32(u[ 1], K32One);
+ v[ 2] = _mm256_add_epi32(u[ 2], K32One);
+ v[ 3] = _mm256_add_epi32(u[ 3], K32One);
+ v[ 4] = _mm256_add_epi32(u[ 4], K32One);
+ v[ 5] = _mm256_add_epi32(u[ 5], K32One);
+ v[ 6] = _mm256_add_epi32(u[ 6], K32One);
+ v[ 7] = _mm256_add_epi32(u[ 7], K32One);
+ v[ 8] = _mm256_add_epi32(u[ 8], K32One);
+ v[ 9] = _mm256_add_epi32(u[ 9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[ 0] = _mm256_srai_epi32(v[ 0], 2);
+ u[ 1] = _mm256_srai_epi32(v[ 1], 2);
+ u[ 2] = _mm256_srai_epi32(v[ 2], 2);
+ u[ 3] = _mm256_srai_epi32(v[ 3], 2);
+ u[ 4] = _mm256_srai_epi32(v[ 4], 2);
+ u[ 5] = _mm256_srai_epi32(v[ 5], 2);
+ u[ 6] = _mm256_srai_epi32(v[ 6], 2);
+ u[ 7] = _mm256_srai_epi32(v[ 7], 2);
+ u[ 8] = _mm256_srai_epi32(v[ 8], 2);
+ u[ 9] = _mm256_srai_epi32(v[ 9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[ 2] = _mm256_packs_epi32(u[0], u[1]);
+ out[18] = _mm256_packs_epi32(u[2], u[3]);
+ out[10] = _mm256_packs_epi32(u[4], u[5]);
+ out[26] = _mm256_packs_epi32(u[6], u[7]);
+ out[ 6] = _mm256_packs_epi32(u[8], u[9]);
+ out[22] = _mm256_packs_epi32(u[10], u[11]);
+ out[14] = _mm256_packs_epi32(u[12], u[13]);
+ out[30] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ {
+ lstep1[32] = _mm256_add_epi32(lstep3[34], lstep2[32]);
+ lstep1[33] = _mm256_add_epi32(lstep3[35], lstep2[33]);
+ lstep1[34] = _mm256_sub_epi32(lstep2[32], lstep3[34]);
+ lstep1[35] = _mm256_sub_epi32(lstep2[33], lstep3[35]);
+ lstep1[36] = _mm256_sub_epi32(lstep2[38], lstep3[36]);
+ lstep1[37] = _mm256_sub_epi32(lstep2[39], lstep3[37]);
+ lstep1[38] = _mm256_add_epi32(lstep3[36], lstep2[38]);
+ lstep1[39] = _mm256_add_epi32(lstep3[37], lstep2[39]);
+ lstep1[40] = _mm256_add_epi32(lstep3[42], lstep2[40]);
+ lstep1[41] = _mm256_add_epi32(lstep3[43], lstep2[41]);
+ lstep1[42] = _mm256_sub_epi32(lstep2[40], lstep3[42]);
+ lstep1[43] = _mm256_sub_epi32(lstep2[41], lstep3[43]);
+ lstep1[44] = _mm256_sub_epi32(lstep2[46], lstep3[44]);
+ lstep1[45] = _mm256_sub_epi32(lstep2[47], lstep3[45]);
+ lstep1[46] = _mm256_add_epi32(lstep3[44], lstep2[46]);
+ lstep1[47] = _mm256_add_epi32(lstep3[45], lstep2[47]);
+ lstep1[48] = _mm256_add_epi32(lstep3[50], lstep2[48]);
+ lstep1[49] = _mm256_add_epi32(lstep3[51], lstep2[49]);
+ lstep1[50] = _mm256_sub_epi32(lstep2[48], lstep3[50]);
+ lstep1[51] = _mm256_sub_epi32(lstep2[49], lstep3[51]);
+ lstep1[52] = _mm256_sub_epi32(lstep2[54], lstep3[52]);
+ lstep1[53] = _mm256_sub_epi32(lstep2[55], lstep3[53]);
+ lstep1[54] = _mm256_add_epi32(lstep3[52], lstep2[54]);
+ lstep1[55] = _mm256_add_epi32(lstep3[53], lstep2[55]);
+ lstep1[56] = _mm256_add_epi32(lstep3[58], lstep2[56]);
+ lstep1[57] = _mm256_add_epi32(lstep3[59], lstep2[57]);
+ lstep1[58] = _mm256_sub_epi32(lstep2[56], lstep3[58]);
+ lstep1[59] = _mm256_sub_epi32(lstep2[57], lstep3[59]);
+ lstep1[60] = _mm256_sub_epi32(lstep2[62], lstep3[60]);
+ lstep1[61] = _mm256_sub_epi32(lstep2[63], lstep3[61]);
+ lstep1[62] = _mm256_add_epi32(lstep3[60], lstep2[62]);
+ lstep1[63] = _mm256_add_epi32(lstep3[61], lstep2[63]);
+ }
+ // stage 8
+ {
+ const __m256i k32_p31_p01 = pair256_set_epi32(cospi_31_64, cospi_1_64);
+ const __m256i k32_p15_p17 = pair256_set_epi32(cospi_15_64, cospi_17_64);
+ const __m256i k32_p23_p09 = pair256_set_epi32(cospi_23_64, cospi_9_64);
+ const __m256i k32_p07_p25 = pair256_set_epi32(cospi_7_64, cospi_25_64);
+ const __m256i k32_m25_p07 = pair256_set_epi32(-cospi_25_64, cospi_7_64);
+ const __m256i k32_m09_p23 = pair256_set_epi32(-cospi_9_64, cospi_23_64);
+ const __m256i k32_m17_p15 = pair256_set_epi32(-cospi_17_64, cospi_15_64);
+ const __m256i k32_m01_p31 = pair256_set_epi32(-cospi_1_64, cospi_31_64);
+
+ u[ 0] = _mm256_unpacklo_epi32(lstep1[32], lstep1[62]);
+ u[ 1] = _mm256_unpackhi_epi32(lstep1[32], lstep1[62]);
+ u[ 2] = _mm256_unpacklo_epi32(lstep1[33], lstep1[63]);
+ u[ 3] = _mm256_unpackhi_epi32(lstep1[33], lstep1[63]);
+ u[ 4] = _mm256_unpacklo_epi32(lstep1[34], lstep1[60]);
+ u[ 5] = _mm256_unpackhi_epi32(lstep1[34], lstep1[60]);
+ u[ 6] = _mm256_unpacklo_epi32(lstep1[35], lstep1[61]);
+ u[ 7] = _mm256_unpackhi_epi32(lstep1[35], lstep1[61]);
+ u[ 8] = _mm256_unpacklo_epi32(lstep1[36], lstep1[58]);
+ u[ 9] = _mm256_unpackhi_epi32(lstep1[36], lstep1[58]);
+ u[10] = _mm256_unpacklo_epi32(lstep1[37], lstep1[59]);
+ u[11] = _mm256_unpackhi_epi32(lstep1[37], lstep1[59]);
+ u[12] = _mm256_unpacklo_epi32(lstep1[38], lstep1[56]);
+ u[13] = _mm256_unpackhi_epi32(lstep1[38], lstep1[56]);
+ u[14] = _mm256_unpacklo_epi32(lstep1[39], lstep1[57]);
+ u[15] = _mm256_unpackhi_epi32(lstep1[39], lstep1[57]);
+
+ v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p31_p01);
+ v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p31_p01);
+ v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p31_p01);
+ v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p31_p01);
+ v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p15_p17);
+ v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p15_p17);
+ v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p15_p17);
+ v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p15_p17);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p23_p09);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p23_p09);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p23_p09);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p23_p09);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p07_p25);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p07_p25);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p07_p25);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p07_p25);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m25_p07);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m25_p07);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m25_p07);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m25_p07);
+ v[20] = k_madd_epi32_avx2(u[ 8], k32_m09_p23);
+ v[21] = k_madd_epi32_avx2(u[ 9], k32_m09_p23);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m09_p23);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m09_p23);
+ v[24] = k_madd_epi32_avx2(u[ 4], k32_m17_p15);
+ v[25] = k_madd_epi32_avx2(u[ 5], k32_m17_p15);
+ v[26] = k_madd_epi32_avx2(u[ 6], k32_m17_p15);
+ v[27] = k_madd_epi32_avx2(u[ 7], k32_m17_p15);
+ v[28] = k_madd_epi32_avx2(u[ 0], k32_m01_p31);
+ v[29] = k_madd_epi32_avx2(u[ 1], k32_m01_p31);
+ v[30] = k_madd_epi32_avx2(u[ 2], k32_m01_p31);
+ v[31] = k_madd_epi32_avx2(u[ 3], k32_m01_p31);
+
+ u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+ v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+ v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+ v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+ v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+ v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+ v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+ v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+ v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+ v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+ v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+ u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[ 1] = _mm256_packs_epi32(u[0], u[1]);
+ out[17] = _mm256_packs_epi32(u[2], u[3]);
+ out[ 9] = _mm256_packs_epi32(u[4], u[5]);
+ out[25] = _mm256_packs_epi32(u[6], u[7]);
+ out[ 7] = _mm256_packs_epi32(u[8], u[9]);
+ out[23] = _mm256_packs_epi32(u[10], u[11]);
+ out[15] = _mm256_packs_epi32(u[12], u[13]);
+ out[31] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ {
+ const __m256i k32_p27_p05 = pair256_set_epi32(cospi_27_64, cospi_5_64);
+ const __m256i k32_p11_p21 = pair256_set_epi32(cospi_11_64, cospi_21_64);
+ const __m256i k32_p19_p13 = pair256_set_epi32(cospi_19_64, cospi_13_64);
+ const __m256i k32_p03_p29 = pair256_set_epi32(cospi_3_64, cospi_29_64);
+ const __m256i k32_m29_p03 = pair256_set_epi32(-cospi_29_64, cospi_3_64);
+ const __m256i k32_m13_p19 = pair256_set_epi32(-cospi_13_64, cospi_19_64);
+ const __m256i k32_m21_p11 = pair256_set_epi32(-cospi_21_64, cospi_11_64);
+ const __m256i k32_m05_p27 = pair256_set_epi32(-cospi_5_64, cospi_27_64);
+
+ u[ 0] = _mm256_unpacklo_epi32(lstep1[40], lstep1[54]);
+ u[ 1] = _mm256_unpackhi_epi32(lstep1[40], lstep1[54]);
+ u[ 2] = _mm256_unpacklo_epi32(lstep1[41], lstep1[55]);
+ u[ 3] = _mm256_unpackhi_epi32(lstep1[41], lstep1[55]);
+ u[ 4] = _mm256_unpacklo_epi32(lstep1[42], lstep1[52]);
+ u[ 5] = _mm256_unpackhi_epi32(lstep1[42], lstep1[52]);
+ u[ 6] = _mm256_unpacklo_epi32(lstep1[43], lstep1[53]);
+ u[ 7] = _mm256_unpackhi_epi32(lstep1[43], lstep1[53]);
+ u[ 8] = _mm256_unpacklo_epi32(lstep1[44], lstep1[50]);
+ u[ 9] = _mm256_unpackhi_epi32(lstep1[44], lstep1[50]);
+ u[10] = _mm256_unpacklo_epi32(lstep1[45], lstep1[51]);
+ u[11] = _mm256_unpackhi_epi32(lstep1[45], lstep1[51]);
+ u[12] = _mm256_unpacklo_epi32(lstep1[46], lstep1[48]);
+ u[13] = _mm256_unpackhi_epi32(lstep1[46], lstep1[48]);
+ u[14] = _mm256_unpacklo_epi32(lstep1[47], lstep1[49]);
+ u[15] = _mm256_unpackhi_epi32(lstep1[47], lstep1[49]);
+
+ v[ 0] = k_madd_epi32_avx2(u[ 0], k32_p27_p05);
+ v[ 1] = k_madd_epi32_avx2(u[ 1], k32_p27_p05);
+ v[ 2] = k_madd_epi32_avx2(u[ 2], k32_p27_p05);
+ v[ 3] = k_madd_epi32_avx2(u[ 3], k32_p27_p05);
+ v[ 4] = k_madd_epi32_avx2(u[ 4], k32_p11_p21);
+ v[ 5] = k_madd_epi32_avx2(u[ 5], k32_p11_p21);
+ v[ 6] = k_madd_epi32_avx2(u[ 6], k32_p11_p21);
+ v[ 7] = k_madd_epi32_avx2(u[ 7], k32_p11_p21);
+ v[ 8] = k_madd_epi32_avx2(u[ 8], k32_p19_p13);
+ v[ 9] = k_madd_epi32_avx2(u[ 9], k32_p19_p13);
+ v[10] = k_madd_epi32_avx2(u[10], k32_p19_p13);
+ v[11] = k_madd_epi32_avx2(u[11], k32_p19_p13);
+ v[12] = k_madd_epi32_avx2(u[12], k32_p03_p29);
+ v[13] = k_madd_epi32_avx2(u[13], k32_p03_p29);
+ v[14] = k_madd_epi32_avx2(u[14], k32_p03_p29);
+ v[15] = k_madd_epi32_avx2(u[15], k32_p03_p29);
+ v[16] = k_madd_epi32_avx2(u[12], k32_m29_p03);
+ v[17] = k_madd_epi32_avx2(u[13], k32_m29_p03);
+ v[18] = k_madd_epi32_avx2(u[14], k32_m29_p03);
+ v[19] = k_madd_epi32_avx2(u[15], k32_m29_p03);
+ v[20] = k_madd_epi32_avx2(u[ 8], k32_m13_p19);
+ v[21] = k_madd_epi32_avx2(u[ 9], k32_m13_p19);
+ v[22] = k_madd_epi32_avx2(u[10], k32_m13_p19);
+ v[23] = k_madd_epi32_avx2(u[11], k32_m13_p19);
+ v[24] = k_madd_epi32_avx2(u[ 4], k32_m21_p11);
+ v[25] = k_madd_epi32_avx2(u[ 5], k32_m21_p11);
+ v[26] = k_madd_epi32_avx2(u[ 6], k32_m21_p11);
+ v[27] = k_madd_epi32_avx2(u[ 7], k32_m21_p11);
+ v[28] = k_madd_epi32_avx2(u[ 0], k32_m05_p27);
+ v[29] = k_madd_epi32_avx2(u[ 1], k32_m05_p27);
+ v[30] = k_madd_epi32_avx2(u[ 2], k32_m05_p27);
+ v[31] = k_madd_epi32_avx2(u[ 3], k32_m05_p27);
+
+ u[ 0] = k_packs_epi64_avx2(v[ 0], v[ 1]);
+ u[ 1] = k_packs_epi64_avx2(v[ 2], v[ 3]);
+ u[ 2] = k_packs_epi64_avx2(v[ 4], v[ 5]);
+ u[ 3] = k_packs_epi64_avx2(v[ 6], v[ 7]);
+ u[ 4] = k_packs_epi64_avx2(v[ 8], v[ 9]);
+ u[ 5] = k_packs_epi64_avx2(v[10], v[11]);
+ u[ 6] = k_packs_epi64_avx2(v[12], v[13]);
+ u[ 7] = k_packs_epi64_avx2(v[14], v[15]);
+ u[ 8] = k_packs_epi64_avx2(v[16], v[17]);
+ u[ 9] = k_packs_epi64_avx2(v[18], v[19]);
+ u[10] = k_packs_epi64_avx2(v[20], v[21]);
+ u[11] = k_packs_epi64_avx2(v[22], v[23]);
+ u[12] = k_packs_epi64_avx2(v[24], v[25]);
+ u[13] = k_packs_epi64_avx2(v[26], v[27]);
+ u[14] = k_packs_epi64_avx2(v[28], v[29]);
+ u[15] = k_packs_epi64_avx2(v[30], v[31]);
+
+ v[ 0] = _mm256_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
+ v[ 1] = _mm256_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
+ v[ 2] = _mm256_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
+ v[ 3] = _mm256_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
+ v[ 4] = _mm256_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
+ v[ 5] = _mm256_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
+ v[ 6] = _mm256_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
+ v[ 7] = _mm256_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
+ v[ 8] = _mm256_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
+ v[ 9] = _mm256_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm256_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm256_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm256_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm256_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm256_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm256_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[ 0] = _mm256_srai_epi32(v[ 0], DCT_CONST_BITS);
+ u[ 1] = _mm256_srai_epi32(v[ 1], DCT_CONST_BITS);
+ u[ 2] = _mm256_srai_epi32(v[ 2], DCT_CONST_BITS);
+ u[ 3] = _mm256_srai_epi32(v[ 3], DCT_CONST_BITS);
+ u[ 4] = _mm256_srai_epi32(v[ 4], DCT_CONST_BITS);
+ u[ 5] = _mm256_srai_epi32(v[ 5], DCT_CONST_BITS);
+ u[ 6] = _mm256_srai_epi32(v[ 6], DCT_CONST_BITS);
+ u[ 7] = _mm256_srai_epi32(v[ 7], DCT_CONST_BITS);
+ u[ 8] = _mm256_srai_epi32(v[ 8], DCT_CONST_BITS);
+ u[ 9] = _mm256_srai_epi32(v[ 9], DCT_CONST_BITS);
+ u[10] = _mm256_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm256_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm256_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm256_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm256_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm256_srai_epi32(v[15], DCT_CONST_BITS);
+
+ v[ 0] = _mm256_cmpgt_epi32(kZero,u[ 0]);
+ v[ 1] = _mm256_cmpgt_epi32(kZero,u[ 1]);
+ v[ 2] = _mm256_cmpgt_epi32(kZero,u[ 2]);
+ v[ 3] = _mm256_cmpgt_epi32(kZero,u[ 3]);
+ v[ 4] = _mm256_cmpgt_epi32(kZero,u[ 4]);
+ v[ 5] = _mm256_cmpgt_epi32(kZero,u[ 5]);
+ v[ 6] = _mm256_cmpgt_epi32(kZero,u[ 6]);
+ v[ 7] = _mm256_cmpgt_epi32(kZero,u[ 7]);
+ v[ 8] = _mm256_cmpgt_epi32(kZero,u[ 8]);
+ v[ 9] = _mm256_cmpgt_epi32(kZero,u[ 9]);
+ v[10] = _mm256_cmpgt_epi32(kZero,u[10]);
+ v[11] = _mm256_cmpgt_epi32(kZero,u[11]);
+ v[12] = _mm256_cmpgt_epi32(kZero,u[12]);
+ v[13] = _mm256_cmpgt_epi32(kZero,u[13]);
+ v[14] = _mm256_cmpgt_epi32(kZero,u[14]);
+ v[15] = _mm256_cmpgt_epi32(kZero,u[15]);
+
+ u[ 0] = _mm256_sub_epi32(u[ 0], v[ 0]);
+ u[ 1] = _mm256_sub_epi32(u[ 1], v[ 1]);
+ u[ 2] = _mm256_sub_epi32(u[ 2], v[ 2]);
+ u[ 3] = _mm256_sub_epi32(u[ 3], v[ 3]);
+ u[ 4] = _mm256_sub_epi32(u[ 4], v[ 4]);
+ u[ 5] = _mm256_sub_epi32(u[ 5], v[ 5]);
+ u[ 6] = _mm256_sub_epi32(u[ 6], v[ 6]);
+ u[ 7] = _mm256_sub_epi32(u[ 7], v[ 7]);
+ u[ 8] = _mm256_sub_epi32(u[ 8], v[ 8]);
+ u[ 9] = _mm256_sub_epi32(u[ 9], v[ 9]);
+ u[10] = _mm256_sub_epi32(u[10], v[10]);
+ u[11] = _mm256_sub_epi32(u[11], v[11]);
+ u[12] = _mm256_sub_epi32(u[12], v[12]);
+ u[13] = _mm256_sub_epi32(u[13], v[13]);
+ u[14] = _mm256_sub_epi32(u[14], v[14]);
+ u[15] = _mm256_sub_epi32(u[15], v[15]);
+
+ v[0] = _mm256_add_epi32(u[0], K32One);
+ v[1] = _mm256_add_epi32(u[1], K32One);
+ v[2] = _mm256_add_epi32(u[2], K32One);
+ v[3] = _mm256_add_epi32(u[3], K32One);
+ v[4] = _mm256_add_epi32(u[4], K32One);
+ v[5] = _mm256_add_epi32(u[5], K32One);
+ v[6] = _mm256_add_epi32(u[6], K32One);
+ v[7] = _mm256_add_epi32(u[7], K32One);
+ v[8] = _mm256_add_epi32(u[8], K32One);
+ v[9] = _mm256_add_epi32(u[9], K32One);
+ v[10] = _mm256_add_epi32(u[10], K32One);
+ v[11] = _mm256_add_epi32(u[11], K32One);
+ v[12] = _mm256_add_epi32(u[12], K32One);
+ v[13] = _mm256_add_epi32(u[13], K32One);
+ v[14] = _mm256_add_epi32(u[14], K32One);
+ v[15] = _mm256_add_epi32(u[15], K32One);
+
+ u[0] = _mm256_srai_epi32(v[0], 2);
+ u[1] = _mm256_srai_epi32(v[1], 2);
+ u[2] = _mm256_srai_epi32(v[2], 2);
+ u[3] = _mm256_srai_epi32(v[3], 2);
+ u[4] = _mm256_srai_epi32(v[4], 2);
+ u[5] = _mm256_srai_epi32(v[5], 2);
+ u[6] = _mm256_srai_epi32(v[6], 2);
+ u[7] = _mm256_srai_epi32(v[7], 2);
+ u[8] = _mm256_srai_epi32(v[8], 2);
+ u[9] = _mm256_srai_epi32(v[9], 2);
+ u[10] = _mm256_srai_epi32(v[10], 2);
+ u[11] = _mm256_srai_epi32(v[11], 2);
+ u[12] = _mm256_srai_epi32(v[12], 2);
+ u[13] = _mm256_srai_epi32(v[13], 2);
+ u[14] = _mm256_srai_epi32(v[14], 2);
+ u[15] = _mm256_srai_epi32(v[15], 2);
+
+ out[ 5] = _mm256_packs_epi32(u[0], u[1]);
+ out[21] = _mm256_packs_epi32(u[2], u[3]);
+ out[13] = _mm256_packs_epi32(u[4], u[5]);
+ out[29] = _mm256_packs_epi32(u[6], u[7]);
+ out[ 3] = _mm256_packs_epi32(u[8], u[9]);
+ out[19] = _mm256_packs_epi32(u[10], u[11]);
+ out[11] = _mm256_packs_epi32(u[12], u[13]);
+ out[27] = _mm256_packs_epi32(u[14], u[15]);
+ }
+ }
+#endif
+ // Transpose the results; do it as four 8x8 transposes.
+ {
+ int transpose_block;
+ int16_t *output_currStep, *output_nextStep;
+ if (0 == pass) {
+ output_currStep = &intermediate[column_start * 32];
+ output_nextStep = &intermediate[(column_start + 8) * 32];
+ } else {
+ output_currStep = &output_org[column_start * 32];
+ output_nextStep = &output_org[(column_start + 8) * 32];
+ }
+ for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
+ __m256i *this_out = &out[8 * transpose_block];
+ // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
+ // 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
+ // 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
+ // 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
+ // 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
+ // 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
+ // 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
+ // 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
+ const __m256i tr0_0 = _mm256_unpacklo_epi16(this_out[0], this_out[1]);
+ const __m256i tr0_1 = _mm256_unpacklo_epi16(this_out[2], this_out[3]);
+ const __m256i tr0_2 = _mm256_unpackhi_epi16(this_out[0], this_out[1]);
+ const __m256i tr0_3 = _mm256_unpackhi_epi16(this_out[2], this_out[3]);
+ const __m256i tr0_4 = _mm256_unpacklo_epi16(this_out[4], this_out[5]);
+ const __m256i tr0_5 = _mm256_unpacklo_epi16(this_out[6], this_out[7]);
+ const __m256i tr0_6 = _mm256_unpackhi_epi16(this_out[4], this_out[5]);
+ const __m256i tr0_7 = _mm256_unpackhi_epi16(this_out[6], this_out[7]);
+ // 00 20 01 21 02 22 03 23 08 28 09 29 10 30 11 31
+ // 40 60 41 61 42 62 43 63 48 68 49 69 50 70 51 71
+ // 04 24 05 25 06 26 07 27 12 32 13 33 14 34 15 35
+ // 44 64 45 65 46 66 47 67 52 72 53 73 54 74 55 75
+ // 80 100 81 101 82 102 83 103 88 108 89 109 90 110 91 111
+ // 120 140 121 141 122 142 123 143 128 148 129 149 130 150 131 151
+ // 84 104 85 105 86 106 87 107 92 112 93 113 94 114 95 115
+ // 124 144 125 145 126 146 127 147 132 152 133 153 134 154 135 155
+
+ const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
+ const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
+ const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
+ const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
+ const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
+ const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
+ const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
+ const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 20 40 60 01 21 41 61 08 28 48 68 09 29 49 69
+ // 04 24 44 64 05 25 45 65 12 32 52 72 13 33 53 73
+ // 02 22 42 62 03 23 43 63 10 30 50 70 11 31 51 71
+ // 06 26 46 66 07 27 47 67 14 34 54 74 15 35 55 75
+ // 80 100 120 140 81 101 121 141 88 108 128 148 89 109 129 149
+ // 84 104 124 144 85 105 125 145 92 112 132 152 93 113 133 153
+ // 82 102 122 142 83 103 123 143 90 110 130 150 91 111 131 151
+ // 86 106 126 146 87 107 127 147 94 114 134 154 95 115 135 155
+ __m256i tr2_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ __m256i tr2_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ __m256i tr2_2 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ __m256i tr2_3 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ __m256i tr2_4 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ __m256i tr2_5 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ __m256i tr2_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ __m256i tr2_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 20 40 60 80 100 120 140 08 28 48 68 88 108 128 148
+ // 01 21 41 61 81 101 121 141 09 29 49 69 89 109 129 149
+ // 02 22 42 62 82 102 122 142 10 30 50 70 90 110 130 150
+ // 03 23 43 63 83 103 123 143 11 31 51 71 91 111 131 151
+ // 04 24 44 64 84 104 124 144 12 32 52 72 92 112 132 152
+ // 05 25 45 65 85 105 125 145 13 33 53 73 93 113 133 153
+ // 06 26 46 66 86 106 126 146 14 34 54 74 94 114 134 154
+ // 07 27 47 67 87 107 127 147 15 35 55 75 95 115 135 155
+ if (0 == pass) {
+ // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
+ // TODO(cd): see quality impact of only doing
+ // output[j] = (output[j] + 1) >> 2;
+ // which would remove the code between here ...
+ __m256i tr2_0_0 = _mm256_cmpgt_epi16(tr2_0, kZero);
+ __m256i tr2_1_0 = _mm256_cmpgt_epi16(tr2_1, kZero);
+ __m256i tr2_2_0 = _mm256_cmpgt_epi16(tr2_2, kZero);
+ __m256i tr2_3_0 = _mm256_cmpgt_epi16(tr2_3, kZero);
+ __m256i tr2_4_0 = _mm256_cmpgt_epi16(tr2_4, kZero);
+ __m256i tr2_5_0 = _mm256_cmpgt_epi16(tr2_5, kZero);
+ __m256i tr2_6_0 = _mm256_cmpgt_epi16(tr2_6, kZero);
+ __m256i tr2_7_0 = _mm256_cmpgt_epi16(tr2_7, kZero);
+ tr2_0 = _mm256_sub_epi16(tr2_0, tr2_0_0);
+ tr2_1 = _mm256_sub_epi16(tr2_1, tr2_1_0);
+ tr2_2 = _mm256_sub_epi16(tr2_2, tr2_2_0);
+ tr2_3 = _mm256_sub_epi16(tr2_3, tr2_3_0);
+ tr2_4 = _mm256_sub_epi16(tr2_4, tr2_4_0);
+ tr2_5 = _mm256_sub_epi16(tr2_5, tr2_5_0);
+ tr2_6 = _mm256_sub_epi16(tr2_6, tr2_6_0);
+ tr2_7 = _mm256_sub_epi16(tr2_7, tr2_7_0);
+ // ... and here.
+ // PS: also change code in vp9/encoder/vp9_dct.c
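+ // Worked example of the rounding above (illustrative values only, not
+ // taken from the source): x = 6  -> (6 + 1 + 1) >> 2 = 2, whereas the
+ // simplified form gives (6 + 1) >> 2 = 1; x = -6 -> (-6 + 1 + 0) >> 2 = -2
+ // in both forms, so only positive inputs are affected by the extra +1.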
+ tr2_0 = _mm256_add_epi16(tr2_0, kOne);
+ tr2_1 = _mm256_add_epi16(tr2_1, kOne);
+ tr2_2 = _mm256_add_epi16(tr2_2, kOne);
+ tr2_3 = _mm256_add_epi16(tr2_3, kOne);
+ tr2_4 = _mm256_add_epi16(tr2_4, kOne);
+ tr2_5 = _mm256_add_epi16(tr2_5, kOne);
+ tr2_6 = _mm256_add_epi16(tr2_6, kOne);
+ tr2_7 = _mm256_add_epi16(tr2_7, kOne);
+ tr2_0 = _mm256_srai_epi16(tr2_0, 2);
+ tr2_1 = _mm256_srai_epi16(tr2_1, 2);
+ tr2_2 = _mm256_srai_epi16(tr2_2, 2);
+ tr2_3 = _mm256_srai_epi16(tr2_3, 2);
+ tr2_4 = _mm256_srai_epi16(tr2_4, 2);
+ tr2_5 = _mm256_srai_epi16(tr2_5, 2);
+ tr2_6 = _mm256_srai_epi16(tr2_6, 2);
+ tr2_7 = _mm256_srai_epi16(tr2_7, 2);
+ }
+ // Note: even though all these stores are aligned, using the aligned
+ // intrinsic makes the code slightly slower.
+ _mm_storeu_si128((__m128i *)(output_currStep + 0 * 32), _mm256_castsi256_si128(tr2_0));
+ _mm_storeu_si128((__m128i *)(output_currStep + 1 * 32), _mm256_castsi256_si128(tr2_1));
+ _mm_storeu_si128((__m128i *)(output_currStep + 2 * 32), _mm256_castsi256_si128(tr2_2));
+ _mm_storeu_si128((__m128i *)(output_currStep + 3 * 32), _mm256_castsi256_si128(tr2_3));
+ _mm_storeu_si128((__m128i *)(output_currStep + 4 * 32), _mm256_castsi256_si128(tr2_4));
+ _mm_storeu_si128((__m128i *)(output_currStep + 5 * 32), _mm256_castsi256_si128(tr2_5));
+ _mm_storeu_si128((__m128i *)(output_currStep + 6 * 32), _mm256_castsi256_si128(tr2_6));
+ _mm_storeu_si128((__m128i *)(output_currStep + 7 * 32), _mm256_castsi256_si128(tr2_7));
+
+ _mm_storeu_si128((__m128i *)(output_nextStep + 0 * 32), _mm256_extractf128_si256(tr2_0,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 1 * 32), _mm256_extractf128_si256(tr2_1,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 2 * 32), _mm256_extractf128_si256(tr2_2,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 3 * 32), _mm256_extractf128_si256(tr2_3,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 4 * 32), _mm256_extractf128_si256(tr2_4,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 5 * 32), _mm256_extractf128_si256(tr2_5,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 6 * 32), _mm256_extractf128_si256(tr2_6,1));
+ _mm_storeu_si128((__m128i *)(output_nextStep + 7 * 32), _mm256_extractf128_si256(tr2_7,1));
+ // Process next 8x8
+ output_currStep += 8;
+ output_nextStep += 8;
+ }
+ }
+ }
+ }
+} // NOLINT
diff --git a/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c b/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c
new file mode 100644
index 0000000..d81b72b
--- /dev/null
+++ b/source/libvpx/vp9/encoder/x86/vp9_dct_avx2.c
@@ -0,0 +1,2579 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h> // AVX2
+#include "vp9/common/vp9_idct.h" // for cospi constants
+#include "vpx_ports/mem.h"
+
+void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+ // as the first pass results are transposed, we transpose the columns (that
+ // is, the transposed rows) and transpose the results again (so that they go
+ // back to normal/row positions).
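+ // In scalar terms the flow is roughly (illustrative sketch only;
+ // fdct4_cols() and transpose4x4() are hypothetical helpers, not part of
+ // this file):
+ //   for (pass = 0; pass < 2; ++pass) {
+ //     fdct4_cols(block);     // 1-D transform down each column
+ //     transpose4x4(block);   // rows become columns for the next pass
+ //   }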
+ int pass;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
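+ // For reference (assumed layout): pair_set_epi16(a, b) repeats the 16-bit
+ // pair across the register as {a, b, a, b, a, b, a, b}, which matches the
+ // interleaved inputs that _mm_madd_epi16 multiplies and sums pairwise.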
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ const __m128i kOne = _mm_set1_epi16(1);
+ __m128i in0, in1, in2, in3;
+ // Load inputs.
+ {
+ in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ // x = x << 4
+ in0 = _mm_slli_epi16(in0, 4);
+ in1 = _mm_slli_epi16(in1, 4);
+ in2 = _mm_slli_epi16(in2, 4);
+ in3 = _mm_slli_epi16(in3, 4);
+ // if (i == 0 && input[0]) input[0] += 1;
+ {
+ // The mask will only contain whether the first value is zero; all
+ // other comparisons will fail, as anything shifted left by 4 (the << 4
+ // above) can never be equal to one. To increment in the non-zero case, we
+ // add the mask and one for the first element:
+ // - if zero, mask = -1, v = v - 1 + 1 = v
+ // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+ __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
+ in0 = _mm_add_epi16(in0, mask);
+ in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
+ }
+ }
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // Transform 1/2: Add/subtract
+ const __m128i r0 = _mm_add_epi16(in0, in3);
+ const __m128i r1 = _mm_add_epi16(in1, in2);
+ const __m128i r2 = _mm_sub_epi16(in1, in2);
+ const __m128i r3 = _mm_sub_epi16(in0, in3);
+ // Transform 1/2: Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ // Combine and transpose
+ const __m128i res0 = _mm_packs_epi32(w0, w2);
+ const __m128i res1 = _mm_packs_epi32(w4, w6);
+ // 00 01 02 03 20 21 22 23
+ // 10 11 12 13 30 31 32 33
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1
+ // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3
+ if (0 == pass) {
+ // Extract values in the high part for second pass as transform code
+ // only uses the first four values.
+ in1 = _mm_unpackhi_epi64(in0, in0);
+ in3 = _mm_unpackhi_epi64(in2, in2);
+ } else {
+ // Post-condition the output and store it as (v + 1) >> 2, taking
+ // advantage of the fact that rows 1/3 are stored just after rows 0/2.
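+ // Worked example (illustrative values): v = 5 -> (5 + 1) >> 2 = 1 and
+ // v = -5 -> (-5 + 1) >> 2 = -1, i.e. approximately v / 4 with rounding.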
+ __m128i out01 = _mm_add_epi16(in0, kOne);
+ __m128i out23 = _mm_add_epi16(in2, kOne);
+ out01 = _mm_srai_epi16(out01, 2);
+ out23 = _mm_srai_epi16(out23, 2);
+ _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
+ _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
+ }
+ }
+}
+
+static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in,
+ int stride) {
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+ __m128i mask;
+
+ in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+
+ in[0] = _mm_slli_epi16(in[0], 4);
+ in[1] = _mm_slli_epi16(in[1], 4);
+ in[2] = _mm_slli_epi16(in[2], 4);
+ in[3] = _mm_slli_epi16(in[3], 4);
+
+ mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
+ in[0] = _mm_add_epi16(in[0], mask);
+ in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
+}
+
+static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) {
+ const __m128i kOne = _mm_set1_epi16(1);
+ __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
+ __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
+ __m128i out01 = _mm_add_epi16(in01, kOne);
+ __m128i out23 = _mm_add_epi16(in23, kOne);
+ out01 = _mm_srai_epi16(out01, 2);
+ out23 = _mm_srai_epi16(out23, 2);
+ _mm_store_si128((__m128i *)(output + 0 * 8), out01);
+ _mm_store_si128((__m128i *)(output + 1 * 8), out23);
+}
+
+static INLINE void transpose_4x4_avx2(__m128i *res) {
+ // Combine and transpose
+ // 00 01 02 03 20 21 22 23
+ // 10 11 12 13 30 31 32 33
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
+ const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
+
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+
+ // 00 10 20 30 01 11 21 31
+ // 02 12 22 32 03 13 23 33
+ // only use the first 4 16-bit integers
+ res[1] = _mm_unpackhi_epi64(res[0], res[0]);
+ res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+}
+
+void fdct4_1d_avx2(__m128i *in) {
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u[4], v[4];
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[3], in[2]);
+
+ v[0] = _mm_add_epi16(u[0], u[1]);
+ v[1] = _mm_sub_epi16(u[0], u[1]);
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
+ u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
+ u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[1] = _mm_packs_epi32(u[2], u[3]);
+ transpose_4x4_avx2(in);
+}
+
+void fadst4_1d_avx2(__m128i *in) {
+ const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
+ const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
+ const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
+ const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
+ const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
+ const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u[8], v[8];
+ __m128i in7 = _mm_add_epi16(in[0], in[1]);
+
+ u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+ u[1] = _mm_unpacklo_epi16(in[2], in[3]);
+ u[2] = _mm_unpacklo_epi16(in7, kZero);
+ u[3] = _mm_unpacklo_epi16(in[2], kZero);
+ u[4] = _mm_unpacklo_epi16(in[3], kZero);
+
+ v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
+ v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
+ v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
+ v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
+ v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
+ v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
+
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[1] = _mm_sub_epi32(v[2], v[6]);
+ u[2] = _mm_add_epi32(v[3], v[4]);
+ u[3] = _mm_sub_epi32(u[2], u[0]);
+ u[4] = _mm_slli_epi32(v[5], 2);
+ u[5] = _mm_sub_epi32(u[4], v[5]);
+ u[6] = _mm_add_epi32(u[3], u[5]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[2]);
+ in[1] = _mm_packs_epi32(u[1], u[3]);
+ transpose_4x4_avx2(in);
+}
+
+void vp9_short_fht4x4_avx2(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
+ __m128i in[4];
+ load_buffer_4x4_avx2(input, in, stride);
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ fdct4_1d_avx2(in);
+ fdct4_1d_avx2(in);
+ break;
+ case 1: // ADST_DCT
+ fadst4_1d_avx2(in);
+ fdct4_1d_avx2(in);
+ break;
+ case 2: // DCT_ADST
+ fdct4_1d_avx2(in);
+ fadst4_1d_avx2(in);
+ break;
+ case 3: // ADST_ADST
+ fadst4_1d_avx2(in);
+ fadst4_1d_avx2(in);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ write_buffer_4x4_avx2(output, in);
+}
+
+void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
+ int pass;
+ // Constants
+ // When we use them, in one case, they are all the same. In all others
+ // it's a pair of them that we need to repeat four times. This is done
+ // by constructing the 32 bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = _mm_add_epi16(in0, in7);
+ const __m128i q1 = _mm_add_epi16(in1, in6);
+ const __m128i q2 = _mm_add_epi16(in2, in5);
+ const __m128i q3 = _mm_add_epi16(in3, in4);
+ const __m128i q4 = _mm_sub_epi16(in3, in4);
+ const __m128i q5 = _mm_sub_epi16(in2, in5);
+ const __m128i q6 = _mm_sub_epi16(in1, in6);
+ const __m128i q7 = _mm_sub_epi16(in0, in7);
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+ // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us into 32 bits
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+ // division of two 16-bit signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
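+ // Worked example (illustrative): for n = -3, n >> 15 = -1 (the sign mask),
+ // so (-3 - (-1)) >> 1 = -2 >> 1 = -1, which matches C integer division
+ // (-3 / 2 == -1) rather than the -2 a plain arithmetic shift would give.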
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ _mm_store_si128((__m128i *)(output + 0 * 8), in0);
+ _mm_store_si128((__m128i *)(output + 1 * 8), in1);
+ _mm_store_si128((__m128i *)(output + 2 * 8), in2);
+ _mm_store_si128((__m128i *)(output + 3 * 8), in3);
+ _mm_store_si128((__m128i *)(output + 4 * 8), in4);
+ _mm_store_si128((__m128i *)(output + 5 * 8), in5);
+ _mm_store_si128((__m128i *)(output + 6 * 8), in6);
+ _mm_store_si128((__m128i *)(output + 7 * 8), in7);
+ }
+}
+
+// load 8x8 array
+static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in,
+ int stride) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+
+ in[0] = _mm_slli_epi16(in[0], 2);
+ in[1] = _mm_slli_epi16(in[1], 2);
+ in[2] = _mm_slli_epi16(in[2], 2);
+ in[3] = _mm_slli_epi16(in[3], 2);
+ in[4] = _mm_slli_epi16(in[4], 2);
+ in[5] = _mm_slli_epi16(in[5], 2);
+ in[6] = _mm_slli_epi16(in[6], 2);
+ in[7] = _mm_slli_epi16(in[7], 2);
+}
+
+// right shift and rounding
+static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) {
+ const __m128i kOne = _mm_set1_epi16(1);
+ const int bit_m02 = bit - 2;
+ __m128i sign0 = _mm_srai_epi16(res[0], 15);
+ __m128i sign1 = _mm_srai_epi16(res[1], 15);
+ __m128i sign2 = _mm_srai_epi16(res[2], 15);
+ __m128i sign3 = _mm_srai_epi16(res[3], 15);
+ __m128i sign4 = _mm_srai_epi16(res[4], 15);
+ __m128i sign5 = _mm_srai_epi16(res[5], 15);
+ __m128i sign6 = _mm_srai_epi16(res[6], 15);
+ __m128i sign7 = _mm_srai_epi16(res[7], 15);
+
+ if (bit_m02 >= 0) {
+ __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
+ res[0] = _mm_add_epi16(res[0], k_const_rounding);
+ res[1] = _mm_add_epi16(res[1], k_const_rounding);
+ res[2] = _mm_add_epi16(res[2], k_const_rounding);
+ res[3] = _mm_add_epi16(res[3], k_const_rounding);
+ res[4] = _mm_add_epi16(res[4], k_const_rounding);
+ res[5] = _mm_add_epi16(res[5], k_const_rounding);
+ res[6] = _mm_add_epi16(res[6], k_const_rounding);
+ res[7] = _mm_add_epi16(res[7], k_const_rounding);
+ }
+
+ res[0] = _mm_sub_epi16(res[0], sign0);
+ res[1] = _mm_sub_epi16(res[1], sign1);
+ res[2] = _mm_sub_epi16(res[2], sign2);
+ res[3] = _mm_sub_epi16(res[3], sign3);
+ res[4] = _mm_sub_epi16(res[4], sign4);
+ res[5] = _mm_sub_epi16(res[5], sign5);
+ res[6] = _mm_sub_epi16(res[6], sign6);
+ res[7] = _mm_sub_epi16(res[7], sign7);
+
+ res[0] = _mm_srai_epi16(res[0], bit);
+ res[1] = _mm_srai_epi16(res[1], bit);
+ res[2] = _mm_srai_epi16(res[2], bit);
+ res[3] = _mm_srai_epi16(res[3], bit);
+ res[4] = _mm_srai_epi16(res[4], bit);
+ res[5] = _mm_srai_epi16(res[5], bit);
+ res[6] = _mm_srai_epi16(res[6], bit);
+ res[7] = _mm_srai_epi16(res[7], bit);
+}
+
+// write 8x8 array
+static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res, int stride) {
+ _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
+ _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
+ _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
+ _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
+ _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
+ _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
+ _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
+}
+
+// 8x8 transpose; can be done in place by passing the same array for in and res
+static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+ // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+ // 04 14 24 34 05 15 25 35
+ // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
+ res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
+ res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
+ res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
+ res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
+ res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
+ res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
+ res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+}
+
+void fdct8_1d_avx2(__m128i *in) {
+ // constants
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+
+ // stage 1
+ s0 = _mm_add_epi16(in[0], in[7]);
+ s1 = _mm_add_epi16(in[1], in[6]);
+ s2 = _mm_add_epi16(in[2], in[5]);
+ s3 = _mm_add_epi16(in[3], in[4]);
+ s4 = _mm_sub_epi16(in[3], in[4]);
+ s5 = _mm_sub_epi16(in[2], in[5]);
+ s6 = _mm_sub_epi16(in[1], in[6]);
+ s7 = _mm_sub_epi16(in[0], in[7]);
+
+ u0 = _mm_add_epi16(s0, s3);
+ u1 = _mm_add_epi16(s1, s2);
+ u2 = _mm_sub_epi16(s1, s2);
+ u3 = _mm_sub_epi16(s0, s3);
+ // interleave and perform butterfly multiplication/addition
+ v0 = _mm_unpacklo_epi16(u0, u1);
+ v1 = _mm_unpackhi_epi16(u0, u1);
+ v2 = _mm_unpacklo_epi16(u2, u3);
+ v3 = _mm_unpackhi_epi16(u2, u3);
+
+ u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
+ u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
+ u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
+ u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
+ u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
+ u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
+ u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
+ u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u0, u1);
+  in[2] = _mm_packs_epi32(u4, u5);
+  in[4] = _mm_packs_epi32(u2, u3);
+ in[6] = _mm_packs_epi32(u6, u7);
+
+ // stage 2
+ // interleave and perform butterfly multiplication/addition
+ u0 = _mm_unpacklo_epi16(s6, s5);
+ u1 = _mm_unpackhi_epi16(s6, s5);
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+
+ u0 = _mm_packs_epi32(v0, v1);
+ u1 = _mm_packs_epi32(v2, v3);
+
+ // stage 3
+ s0 = _mm_add_epi16(s4, u0);
+ s1 = _mm_sub_epi16(s4, u0);
+ s2 = _mm_sub_epi16(s7, u1);
+ s3 = _mm_add_epi16(s7, u1);
+
+ // stage 4
+ u0 = _mm_unpacklo_epi16(s0, s3);
+ u1 = _mm_unpackhi_epi16(s0, s3);
+ u2 = _mm_unpacklo_epi16(s1, s2);
+ u3 = _mm_unpackhi_epi16(s1, s2);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
+ v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
+ v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
+ v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
+ v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
+ v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
+ v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
+ v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
+
+ // shift and rounding
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v0, v1);
+ in[3] = _mm_packs_epi32(v4, v5);
+ in[5] = _mm_packs_epi32(v2, v3);
+ in[7] = _mm_packs_epi32(v6, v7);
+
+ // transpose
+ array_transpose_8x8_avx2(in, in);
+}
+
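+// A scalar sketch of the butterfly pattern used throughout fdct8_1d_avx2 and
+// the other transforms in this file (for clarity only): once two rows a and b
+// have been interleaved with _mm_unpacklo/hi_epi16, _mm_madd_epi16 against a
+// pair_set_epi16(c0, c1) constant produces, per 32-bit lane,
+//   t = a[i] * c0 + b[i] * c1;
+// and the following add/shift pair is the usual dct_const_round_shift:
+//   out[i] = (t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
+// before _mm_packs_epi32 saturates the result back to 16 bits.
+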
+void fadst8_1d_avx2(__m128i *in) {
+ // Constants
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+
+ // properly aligned for butterfly input
+ in0 = in[7];
+ in1 = in[0];
+ in2 = in[5];
+ in3 = in[2];
+ in4 = in[3];
+ in5 = in[4];
+ in6 = in[1];
+ in7 = in[6];
+
+ // column transformation
+ // stage 1
+ // interleave and multiply/add into 32-bit integer
+ s0 = _mm_unpacklo_epi16(in0, in1);
+ s1 = _mm_unpackhi_epi16(in0, in1);
+ s2 = _mm_unpacklo_epi16(in2, in3);
+ s3 = _mm_unpackhi_epi16(in2, in3);
+ s4 = _mm_unpacklo_epi16(in4, in5);
+ s5 = _mm_unpackhi_epi16(in4, in5);
+ s6 = _mm_unpacklo_epi16(in6, in7);
+ s7 = _mm_unpackhi_epi16(in6, in7);
+
+ u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
+ u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
+ u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
+ u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
+ u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
+ u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
+ u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
+ u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
+ u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
+ u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
+ u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
+ u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
+ u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
+ u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
+ u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
+ u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
+
+ // addition
+ w0 = _mm_add_epi32(u0, u8);
+ w1 = _mm_add_epi32(u1, u9);
+ w2 = _mm_add_epi32(u2, u10);
+ w3 = _mm_add_epi32(u3, u11);
+ w4 = _mm_add_epi32(u4, u12);
+ w5 = _mm_add_epi32(u5, u13);
+ w6 = _mm_add_epi32(u6, u14);
+ w7 = _mm_add_epi32(u7, u15);
+ w8 = _mm_sub_epi32(u0, u8);
+ w9 = _mm_sub_epi32(u1, u9);
+ w10 = _mm_sub_epi32(u2, u10);
+ w11 = _mm_sub_epi32(u3, u11);
+ w12 = _mm_sub_epi32(u4, u12);
+ w13 = _mm_sub_epi32(u5, u13);
+ w14 = _mm_sub_epi32(u6, u14);
+ w15 = _mm_sub_epi32(u7, u15);
+
+ // shift and rounding
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+ v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
+ v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
+ v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
+ v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
+ v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
+ v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
+ v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
+ v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
+ u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
+ u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
+ u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
+ u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
+ u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
+ u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
+ u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
+
+ // back to 16-bit and pack 8 integers into __m128i
+ in[0] = _mm_packs_epi32(u0, u1);
+ in[1] = _mm_packs_epi32(u2, u3);
+ in[2] = _mm_packs_epi32(u4, u5);
+ in[3] = _mm_packs_epi32(u6, u7);
+ in[4] = _mm_packs_epi32(u8, u9);
+ in[5] = _mm_packs_epi32(u10, u11);
+ in[6] = _mm_packs_epi32(u12, u13);
+ in[7] = _mm_packs_epi32(u14, u15);
+
+ // stage 2
+ s0 = _mm_add_epi16(in[0], in[2]);
+ s1 = _mm_add_epi16(in[1], in[3]);
+ s2 = _mm_sub_epi16(in[0], in[2]);
+ s3 = _mm_sub_epi16(in[1], in[3]);
+ u0 = _mm_unpacklo_epi16(in[4], in[5]);
+ u1 = _mm_unpackhi_epi16(in[4], in[5]);
+ u2 = _mm_unpacklo_epi16(in[6], in[7]);
+ u3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
+ v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
+ v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
+ v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
+ v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
+ v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
+ v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
+ v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
+
+ w0 = _mm_add_epi32(v0, v4);
+ w1 = _mm_add_epi32(v1, v5);
+ w2 = _mm_add_epi32(v2, v6);
+ w3 = _mm_add_epi32(v3, v7);
+ w4 = _mm_sub_epi32(v0, v4);
+ w5 = _mm_sub_epi32(v1, v5);
+ w6 = _mm_sub_epi32(v2, v6);
+ w7 = _mm_sub_epi32(v3, v7);
+
+ v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
+ v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
+ v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
+ v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
+ v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
+ v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
+ v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
+ v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
+
+ u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  // back to 16-bit integers
+ s4 = _mm_packs_epi32(u0, u1);
+ s5 = _mm_packs_epi32(u2, u3);
+ s6 = _mm_packs_epi32(u4, u5);
+ s7 = _mm_packs_epi32(u6, u7);
+
+ // stage 3
+ u0 = _mm_unpacklo_epi16(s2, s3);
+ u1 = _mm_unpackhi_epi16(s2, s3);
+ u2 = _mm_unpacklo_epi16(s6, s7);
+ u3 = _mm_unpackhi_epi16(s6, s7);
+
+ v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
+ v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
+ v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
+ v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
+ v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
+ v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
+ v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
+ v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
+
+ u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+ u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+ u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+ u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+ u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+ u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+ u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+ u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+ v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+ v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+ v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+ v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+ v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+ v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+ v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+ v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+ s2 = _mm_packs_epi32(v0, v1);
+ s3 = _mm_packs_epi32(v2, v3);
+ s6 = _mm_packs_epi32(v4, v5);
+ s7 = _mm_packs_epi32(v6, v7);
+
+ // FIXME(jingning): do subtract using bit inversion?
+ in[0] = s0;
+ in[1] = _mm_sub_epi16(k__const_0, s4);
+ in[2] = s6;
+ in[3] = _mm_sub_epi16(k__const_0, s2);
+ in[4] = s3;
+ in[5] = _mm_sub_epi16(k__const_0, s7);
+ in[6] = s5;
+ in[7] = _mm_sub_epi16(k__const_0, s1);
+
+ // transpose
+ array_transpose_8x8_avx2(in, in);
+}
+
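+// Regarding the FIXME above: the sign flips at the end of fadst8_1d_avx2 use
+// _mm_sub_epi16(0, x), which for two's-complement lanes equals (~x + 1). A
+// bit-inversion variant (a sketch only, not applied here) would read
+//   neg = _mm_add_epi16(_mm_xor_si128(x, _mm_set1_epi16(-1)), _mm_set1_epi16(1));
+// trading the single subtraction for an xor/add pair.
+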
+void vp9_short_fht8x8_avx2(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
+ __m128i in[8];
+ load_buffer_8x8_avx2(input, in, stride);
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ fdct8_1d_avx2(in);
+ fdct8_1d_avx2(in);
+ break;
+ case 1: // ADST_DCT
+ fadst8_1d_avx2(in);
+ fdct8_1d_avx2(in);
+ break;
+ case 2: // DCT_ADST
+ fdct8_1d_avx2(in);
+ fadst8_1d_avx2(in);
+ break;
+ case 3: // ADST_ADST
+ fadst8_1d_avx2(in);
+ fadst8_1d_avx2(in);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ right_shift_8x8_avx2(in, 1);
+ write_buffer_8x8_avx2(output, in, 8);
+}
+
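+// How the 8x8 hybrid transform above composes (sketch): each 1-D helper ends
+// with array_transpose_8x8_avx2, so the first call works down the columns and
+// leaves the block transposed, and the second call works down what were the
+// rows and restores the original orientation. tx_type only selects DCT or ADST
+// for each direction; right_shift_8x8_avx2(in, 1) then scales the coefficients
+// down by one bit before write_buffer_8x8_avx2 stores them.
+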
+void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows. To achieve that,
+  // as the first pass results are transposed, we transpose the columns (that
+ // is the transposed rows) and transpose the results (so that it goes back
+ // in normal/row positions).
+ int pass;
+ // We need an intermediate buffer between passes.
+ DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
+ const int16_t *in = input;
+ int16_t *out = intermediate;
+ // Constants
+  // In one case all the lanes hold the same value. In all other cases we need
+  // a pair of values repeated four times, which is done by constructing the
+  // 32-bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kOne = _mm_set1_epi16(1);
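+  // Scalar sketch of the per-pass conditioning used below (for orientation
+  // only): pass 0 reads the source rows and pre-scales them with
+  //   x = x << 2;
+  // while pass 1 reads the transposed intermediate and re-normalizes with
+  //   x = (x + 1) >> 2;
+  // Each pass applies the 16-point 1-D DCT to eight columns at a time and
+  // stores its results transposed, so the second pass leaves the output in
+  // row order.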
+ // Do the two transform/transpose passes
+ for (pass = 0; pass < 2; ++pass) {
+ // We process eight columns (transposed rows in second pass) at a time.
+ int column_start;
+ for (column_start = 0; column_start < 16; column_start += 8) {
+ __m128i in00, in01, in02, in03, in04, in05, in06, in07;
+ __m128i in08, in09, in10, in11, in12, in13, in14, in15;
+ __m128i input0, input1, input2, input3, input4, input5, input6, input7;
+ __m128i step1_0, step1_1, step1_2, step1_3;
+ __m128i step1_4, step1_5, step1_6, step1_7;
+ __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
+ __m128i step3_0, step3_1, step3_2, step3_3;
+ __m128i step3_4, step3_5, step3_6, step3_7;
+ __m128i res00, res01, res02, res03, res04, res05, res06, res07;
+ __m128i res08, res09, res10, res11, res12, res13, res14, res15;
+ // Load and pre-condition input.
+ if (0 == pass) {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
+ // x = x << 2
+ in00 = _mm_slli_epi16(in00, 2);
+ in01 = _mm_slli_epi16(in01, 2);
+ in02 = _mm_slli_epi16(in02, 2);
+ in03 = _mm_slli_epi16(in03, 2);
+ in04 = _mm_slli_epi16(in04, 2);
+ in05 = _mm_slli_epi16(in05, 2);
+ in06 = _mm_slli_epi16(in06, 2);
+ in07 = _mm_slli_epi16(in07, 2);
+ in08 = _mm_slli_epi16(in08, 2);
+ in09 = _mm_slli_epi16(in09, 2);
+ in10 = _mm_slli_epi16(in10, 2);
+ in11 = _mm_slli_epi16(in11, 2);
+ in12 = _mm_slli_epi16(in12, 2);
+ in13 = _mm_slli_epi16(in13, 2);
+ in14 = _mm_slli_epi16(in14, 2);
+ in15 = _mm_slli_epi16(in15, 2);
+ } else {
+ in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
+ in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
+ in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
+ in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
+ in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
+ in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
+ in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
+ in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
+ in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
+ in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
+ in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
+ in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
+ in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
+ in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
+ in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
+ in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
+ // x = (x + 1) >> 2
+ in00 = _mm_add_epi16(in00, kOne);
+ in01 = _mm_add_epi16(in01, kOne);
+ in02 = _mm_add_epi16(in02, kOne);
+ in03 = _mm_add_epi16(in03, kOne);
+ in04 = _mm_add_epi16(in04, kOne);
+ in05 = _mm_add_epi16(in05, kOne);
+ in06 = _mm_add_epi16(in06, kOne);
+ in07 = _mm_add_epi16(in07, kOne);
+ in08 = _mm_add_epi16(in08, kOne);
+ in09 = _mm_add_epi16(in09, kOne);
+ in10 = _mm_add_epi16(in10, kOne);
+ in11 = _mm_add_epi16(in11, kOne);
+ in12 = _mm_add_epi16(in12, kOne);
+ in13 = _mm_add_epi16(in13, kOne);
+ in14 = _mm_add_epi16(in14, kOne);
+ in15 = _mm_add_epi16(in15, kOne);
+ in00 = _mm_srai_epi16(in00, 2);
+ in01 = _mm_srai_epi16(in01, 2);
+ in02 = _mm_srai_epi16(in02, 2);
+ in03 = _mm_srai_epi16(in03, 2);
+ in04 = _mm_srai_epi16(in04, 2);
+ in05 = _mm_srai_epi16(in05, 2);
+ in06 = _mm_srai_epi16(in06, 2);
+ in07 = _mm_srai_epi16(in07, 2);
+ in08 = _mm_srai_epi16(in08, 2);
+ in09 = _mm_srai_epi16(in09, 2);
+ in10 = _mm_srai_epi16(in10, 2);
+ in11 = _mm_srai_epi16(in11, 2);
+ in12 = _mm_srai_epi16(in12, 2);
+ in13 = _mm_srai_epi16(in13, 2);
+ in14 = _mm_srai_epi16(in14, 2);
+ in15 = _mm_srai_epi16(in15, 2);
+ }
+ in += 8;
+ // Calculate input for the first 8 results.
+ {
+ input0 = _mm_add_epi16(in00, in15);
+ input1 = _mm_add_epi16(in01, in14);
+ input2 = _mm_add_epi16(in02, in13);
+ input3 = _mm_add_epi16(in03, in12);
+ input4 = _mm_add_epi16(in04, in11);
+ input5 = _mm_add_epi16(in05, in10);
+ input6 = _mm_add_epi16(in06, in09);
+ input7 = _mm_add_epi16(in07, in08);
+ }
+ // Calculate input for the next 8 results.
+ {
+ step1_0 = _mm_sub_epi16(in07, in08);
+ step1_1 = _mm_sub_epi16(in06, in09);
+ step1_2 = _mm_sub_epi16(in05, in10);
+ step1_3 = _mm_sub_epi16(in04, in11);
+ step1_4 = _mm_sub_epi16(in03, in12);
+ step1_5 = _mm_sub_epi16(in02, in13);
+ step1_6 = _mm_sub_epi16(in01, in14);
+ step1_7 = _mm_sub_epi16(in00, in15);
+ }
+ // Work on the first eight values; fdct8_1d(input, even_results);
+ {
+        // Add/subtract
+ const __m128i q0 = _mm_add_epi16(input0, input7);
+ const __m128i q1 = _mm_add_epi16(input1, input6);
+ const __m128i q2 = _mm_add_epi16(input2, input5);
+ const __m128i q3 = _mm_add_epi16(input3, input4);
+ const __m128i q4 = _mm_sub_epi16(input3, input4);
+ const __m128i q5 = _mm_sub_epi16(input2, input5);
+ const __m128i q6 = _mm_sub_epi16(input1, input6);
+ const __m128i q7 = _mm_sub_epi16(input0, input7);
+ // Work on first four results
+ {
+          // Add/subtract
+ const __m128i r0 = _mm_add_epi16(q0, q3);
+ const __m128i r1 = _mm_add_epi16(q1, q2);
+ const __m128i r2 = _mm_sub_epi16(q1, q2);
+ const __m128i r3 = _mm_sub_epi16(q0, q3);
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res00 = _mm_packs_epi32(w0, w1);
+ res08 = _mm_packs_epi32(w2, w3);
+ res04 = _mm_packs_epi32(w4, w5);
+ res12 = _mm_packs_epi32(w6, w7);
+ }
+ // Work on next four results
+ {
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+          // Add/subtract
+ const __m128i x0 = _mm_add_epi16(q4, r0);
+ const __m128i x1 = _mm_sub_epi16(q4, r0);
+ const __m128i x2 = _mm_sub_epi16(q7, r1);
+ const __m128i x3 = _mm_add_epi16(q7, r1);
+ // Interleave to do the multiply by constants which gets us
+ // into 32 bits.
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res02 = _mm_packs_epi32(w0, w1);
+ res14 = _mm_packs_epi32(w2, w3);
+ res10 = _mm_packs_epi32(w4, w5);
+ res06 = _mm_packs_epi32(w6, w7);
+ }
+ }
+ // Work on the next eight values; step1 -> odd_results
+ {
+ // step 2
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_2 = _mm_packs_epi32(w0, w1);
+ step2_3 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_5 = _mm_packs_epi32(w0, w1);
+ step2_4 = _mm_packs_epi32(w2, w3);
+ }
+ // step 3
+ {
+ step3_0 = _mm_add_epi16(step1_0, step2_3);
+ step3_1 = _mm_add_epi16(step1_1, step2_2);
+ step3_2 = _mm_sub_epi16(step1_1, step2_2);
+ step3_3 = _mm_sub_epi16(step1_0, step2_3);
+ step3_4 = _mm_sub_epi16(step1_7, step2_4);
+ step3_5 = _mm_sub_epi16(step1_6, step2_5);
+ step3_6 = _mm_add_epi16(step1_6, step2_5);
+ step3_7 = _mm_add_epi16(step1_7, step2_4);
+ }
+ // step 4
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_1 = _mm_packs_epi32(w0, w1);
+ step2_2 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
+ const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
+ const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
+ const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ step2_6 = _mm_packs_epi32(w0, w1);
+ step2_5 = _mm_packs_epi32(w2, w3);
+ }
+ // step 5
+ {
+ step1_0 = _mm_add_epi16(step3_0, step2_1);
+ step1_1 = _mm_sub_epi16(step3_0, step2_1);
+ step1_2 = _mm_sub_epi16(step3_3, step2_2);
+ step1_3 = _mm_add_epi16(step3_3, step2_2);
+ step1_4 = _mm_add_epi16(step3_4, step2_5);
+ step1_5 = _mm_sub_epi16(step3_4, step2_5);
+ step1_6 = _mm_sub_epi16(step3_7, step2_6);
+ step1_7 = _mm_add_epi16(step3_7, step2_6);
+ }
+ // step 6
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res01 = _mm_packs_epi32(w0, w1);
+ res09 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res05 = _mm_packs_epi32(w0, w1);
+ res13 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res11 = _mm_packs_epi32(w0, w1);
+ res03 = _mm_packs_epi32(w2, w3);
+ }
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
+ const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
+ const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
+ const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
+ const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
+ const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // Combine
+ res15 = _mm_packs_epi32(w0, w1);
+ res07 = _mm_packs_epi32(w2, w3);
+ }
+ }
+ // Transpose the results, do it as two 8x8 transposes.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+        // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+        // 04 14 24 34 05 15 25 35
+        // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
+ _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
+ _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
+ _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
+ _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
+ _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
+ _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
+ _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
+ }
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+        // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+        // 04 14 24 34 05 15 25 35
+        // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ // Store results
+ _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
+ _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
+ _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
+ _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
+ _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
+ _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
+ _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
+ _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
+ }
+ out += 8*16;
+ }
+ // Setup in/out for next pass.
+ in = intermediate;
+ out = output;
+ }
+}
+
+static INLINE void load_buffer_16x16_avx2(const int16_t* input, __m128i *in0,
+ __m128i *in1, int stride) {
+ // load first 8 columns
+ load_buffer_8x8_avx2(input, in0, stride);
+ load_buffer_8x8_avx2(input + 8 * stride, in0 + 8, stride);
+
+ input += 8;
+ // load second 8 columns
+ load_buffer_8x8_avx2(input, in1, stride);
+ load_buffer_8x8_avx2(input + 8 * stride, in1 + 8, stride);
+}
+
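+// Sketch of the quadrant layout these 16x16 helpers assume: in0/res0 hold the
+// left 16x8 half (rows 0-7 in slots 0-7, rows 8-15 in slots 8-15) and in1/res1
+// hold the right half, one 8-lane __m128i per row, so every 16x16 operation
+// below is expressed as four calls to the corresponding 8x8 helper.
+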
+static INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0,
+ __m128i *in1, int stride) {
+ // write first 8 columns
+ write_buffer_8x8_avx2(output, in0, stride);
+ write_buffer_8x8_avx2(output + 8 * stride, in0 + 8, stride);
+ // write second 8 columns
+ output += 8;
+ write_buffer_8x8_avx2(output, in1, stride);
+ write_buffer_8x8_avx2(output + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) {
+ __m128i tbuf[8];
+ array_transpose_8x8_avx2(res0, res0);
+ array_transpose_8x8_avx2(res1, tbuf);
+ array_transpose_8x8_avx2(res0 + 8, res1);
+ array_transpose_8x8_avx2(res1 + 8, res1 + 8);
+
+ res0[8] = tbuf[0];
+ res0[9] = tbuf[1];
+ res0[10] = tbuf[2];
+ res0[11] = tbuf[3];
+ res0[12] = tbuf[4];
+ res0[13] = tbuf[5];
+ res0[14] = tbuf[6];
+ res0[15] = tbuf[7];
+}
+
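+// array_transpose_16x16_avx2 above follows the block-matrix identity (sketch,
+// with A, B, C, D denoting the four 8x8 quadrants):
+//   | A B |^T   | A^T C^T |
+//   | C D |   = | B^T D^T |
+// so the off-diagonal quadrants swap halves: tbuf temporarily holds B^T so
+// that C^T can be written into res1 before B^T is copied back into res0[8..15].
+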
+static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) {
+ // perform rounding operations
+ right_shift_8x8_avx2(res0, 2);
+ right_shift_8x8_avx2(res0 + 8, 2);
+ right_shift_8x8_avx2(res1, 2);
+ right_shift_8x8_avx2(res1 + 8, 2);
+}
+
+void fdct16_1d_8col_avx2(__m128i *in) {
+  // perform a 16-point 1-D DCT on each of 8 columns
+ __m128i i[8], s[8], p[8], t[8], u[16], v[16];
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
+ const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
+ const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
+ const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
+ const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
+ const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ // stage 1
+ i[0] = _mm_add_epi16(in[0], in[15]);
+ i[1] = _mm_add_epi16(in[1], in[14]);
+ i[2] = _mm_add_epi16(in[2], in[13]);
+ i[3] = _mm_add_epi16(in[3], in[12]);
+ i[4] = _mm_add_epi16(in[4], in[11]);
+ i[5] = _mm_add_epi16(in[5], in[10]);
+ i[6] = _mm_add_epi16(in[6], in[9]);
+ i[7] = _mm_add_epi16(in[7], in[8]);
+
+ s[0] = _mm_sub_epi16(in[7], in[8]);
+ s[1] = _mm_sub_epi16(in[6], in[9]);
+ s[2] = _mm_sub_epi16(in[5], in[10]);
+ s[3] = _mm_sub_epi16(in[4], in[11]);
+ s[4] = _mm_sub_epi16(in[3], in[12]);
+ s[5] = _mm_sub_epi16(in[2], in[13]);
+ s[6] = _mm_sub_epi16(in[1], in[14]);
+ s[7] = _mm_sub_epi16(in[0], in[15]);
+
+ p[0] = _mm_add_epi16(i[0], i[7]);
+ p[1] = _mm_add_epi16(i[1], i[6]);
+ p[2] = _mm_add_epi16(i[2], i[5]);
+ p[3] = _mm_add_epi16(i[3], i[4]);
+ p[4] = _mm_sub_epi16(i[3], i[4]);
+ p[5] = _mm_sub_epi16(i[2], i[5]);
+ p[6] = _mm_sub_epi16(i[1], i[6]);
+ p[7] = _mm_sub_epi16(i[0], i[7]);
+
+ u[0] = _mm_add_epi16(p[0], p[3]);
+ u[1] = _mm_add_epi16(p[1], p[2]);
+ u[2] = _mm_sub_epi16(p[1], p[2]);
+ u[3] = _mm_sub_epi16(p[0], p[3]);
+
+ v[0] = _mm_unpacklo_epi16(u[0], u[1]);
+ v[1] = _mm_unpackhi_epi16(u[0], u[1]);
+ v[2] = _mm_unpacklo_epi16(u[2], u[3]);
+ v[3] = _mm_unpackhi_epi16(u[2], u[3]);
+
+ u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
+ u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
+ u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
+ u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
+ u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
+ u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
+ u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
+ u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+
+ in[0] = _mm_packs_epi32(u[0], u[1]);
+ in[4] = _mm_packs_epi32(u[4], u[5]);
+ in[8] = _mm_packs_epi32(u[2], u[3]);
+ in[12] = _mm_packs_epi32(u[6], u[7]);
+
+ u[0] = _mm_unpacklo_epi16(p[5], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[5], p[6]);
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+
+ u[0] = _mm_packs_epi32(v[0], v[1]);
+ u[1] = _mm_packs_epi32(v[2], v[3]);
+
+ t[0] = _mm_add_epi16(p[4], u[0]);
+ t[1] = _mm_sub_epi16(p[4], u[0]);
+ t[2] = _mm_sub_epi16(p[7], u[1]);
+ t[3] = _mm_add_epi16(p[7], u[1]);
+
+ u[0] = _mm_unpacklo_epi16(t[0], t[3]);
+ u[1] = _mm_unpackhi_epi16(t[0], t[3]);
+ u[2] = _mm_unpacklo_epi16(t[1], t[2]);
+ u[3] = _mm_unpackhi_epi16(t[1], t[2]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ in[2] = _mm_packs_epi32(v[0], v[1]);
+ in[6] = _mm_packs_epi32(v[4], v[5]);
+ in[10] = _mm_packs_epi32(v[2], v[3]);
+ in[14] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[2] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[3] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[2] = _mm_packs_epi32(v[0], v[1]);
+ t[3] = _mm_packs_epi32(v[2], v[3]);
+ t[4] = _mm_packs_epi32(v[4], v[5]);
+ t[5] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 3
+ p[0] = _mm_add_epi16(s[0], t[3]);
+ p[1] = _mm_add_epi16(s[1], t[2]);
+ p[2] = _mm_sub_epi16(s[1], t[2]);
+ p[3] = _mm_sub_epi16(s[0], t[3]);
+ p[4] = _mm_sub_epi16(s[7], t[4]);
+ p[5] = _mm_sub_epi16(s[6], t[5]);
+ p[6] = _mm_add_epi16(s[6], t[5]);
+ p[7] = _mm_add_epi16(s[7], t[4]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(p[1], p[6]);
+ u[1] = _mm_unpackhi_epi16(p[1], p[6]);
+ u[2] = _mm_unpacklo_epi16(p[2], p[5]);
+ u[3] = _mm_unpackhi_epi16(p[2], p[5]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+ v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
+ v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ t[1] = _mm_packs_epi32(v[0], v[1]);
+ t[2] = _mm_packs_epi32(v[2], v[3]);
+ t[5] = _mm_packs_epi32(v[4], v[5]);
+ t[6] = _mm_packs_epi32(v[6], v[7]);
+
+ // stage 5
+ s[0] = _mm_add_epi16(p[0], t[1]);
+ s[1] = _mm_sub_epi16(p[0], t[1]);
+ s[2] = _mm_sub_epi16(p[3], t[2]);
+ s[3] = _mm_add_epi16(p[3], t[2]);
+ s[4] = _mm_add_epi16(p[4], t[5]);
+ s[5] = _mm_sub_epi16(p[4], t[5]);
+ s[6] = _mm_sub_epi16(p[7], t[6]);
+ s[7] = _mm_add_epi16(p[7], t[6]);
+
+ // stage 6
+ u[0] = _mm_unpacklo_epi16(s[0], s[7]);
+ u[1] = _mm_unpackhi_epi16(s[0], s[7]);
+ u[2] = _mm_unpacklo_epi16(s[1], s[6]);
+ u[3] = _mm_unpackhi_epi16(s[1], s[6]);
+ u[4] = _mm_unpacklo_epi16(s[2], s[5]);
+ u[5] = _mm_unpackhi_epi16(s[2], s[5]);
+ u[6] = _mm_unpacklo_epi16(s[3], s[4]);
+ u[7] = _mm_unpackhi_epi16(s[3], s[4]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
+ v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
+ v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
+ v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
+ v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
+ v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
+ v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
+ v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
+ v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
+ v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
+ v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
+ v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
+ v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[1] = _mm_packs_epi32(v[0], v[1]);
+ in[9] = _mm_packs_epi32(v[2], v[3]);
+ in[5] = _mm_packs_epi32(v[4], v[5]);
+ in[13] = _mm_packs_epi32(v[6], v[7]);
+ in[3] = _mm_packs_epi32(v[8], v[9]);
+ in[11] = _mm_packs_epi32(v[10], v[11]);
+ in[7] = _mm_packs_epi32(v[12], v[13]);
+ in[15] = _mm_packs_epi32(v[14], v[15]);
+}
+
+void fadst16_1d_8col_avx2(__m128i *in) {
+  // perform a 16-point 1-D ADST on each of 8 columns
+ __m128i s[16], x[16], u[32], v[32];
+ const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i kZero = _mm_set1_epi16(0);
+
+ u[0] = _mm_unpacklo_epi16(in[15], in[0]);
+ u[1] = _mm_unpackhi_epi16(in[15], in[0]);
+ u[2] = _mm_unpacklo_epi16(in[13], in[2]);
+ u[3] = _mm_unpackhi_epi16(in[13], in[2]);
+ u[4] = _mm_unpacklo_epi16(in[11], in[4]);
+ u[5] = _mm_unpackhi_epi16(in[11], in[4]);
+ u[6] = _mm_unpacklo_epi16(in[9], in[6]);
+ u[7] = _mm_unpackhi_epi16(in[9], in[6]);
+ u[8] = _mm_unpacklo_epi16(in[7], in[8]);
+ u[9] = _mm_unpackhi_epi16(in[7], in[8]);
+ u[10] = _mm_unpacklo_epi16(in[5], in[10]);
+ u[11] = _mm_unpackhi_epi16(in[5], in[10]);
+ u[12] = _mm_unpacklo_epi16(in[3], in[12]);
+ u[13] = _mm_unpackhi_epi16(in[3], in[12]);
+ u[14] = _mm_unpacklo_epi16(in[1], in[14]);
+ u[15] = _mm_unpackhi_epi16(in[1], in[14]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
+ v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
+ v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
+ v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
+ v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
+ v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
+ v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
+ v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
+ v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
+ v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
+ v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
+ v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
+ v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
+ v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
+ v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
+ v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
+ v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
+
+ u[0] = _mm_add_epi32(v[0], v[16]);
+ u[1] = _mm_add_epi32(v[1], v[17]);
+ u[2] = _mm_add_epi32(v[2], v[18]);
+ u[3] = _mm_add_epi32(v[3], v[19]);
+ u[4] = _mm_add_epi32(v[4], v[20]);
+ u[5] = _mm_add_epi32(v[5], v[21]);
+ u[6] = _mm_add_epi32(v[6], v[22]);
+ u[7] = _mm_add_epi32(v[7], v[23]);
+ u[8] = _mm_add_epi32(v[8], v[24]);
+ u[9] = _mm_add_epi32(v[9], v[25]);
+ u[10] = _mm_add_epi32(v[10], v[26]);
+ u[11] = _mm_add_epi32(v[11], v[27]);
+ u[12] = _mm_add_epi32(v[12], v[28]);
+ u[13] = _mm_add_epi32(v[13], v[29]);
+ u[14] = _mm_add_epi32(v[14], v[30]);
+ u[15] = _mm_add_epi32(v[15], v[31]);
+ u[16] = _mm_sub_epi32(v[0], v[16]);
+ u[17] = _mm_sub_epi32(v[1], v[17]);
+ u[18] = _mm_sub_epi32(v[2], v[18]);
+ u[19] = _mm_sub_epi32(v[3], v[19]);
+ u[20] = _mm_sub_epi32(v[4], v[20]);
+ u[21] = _mm_sub_epi32(v[5], v[21]);
+ u[22] = _mm_sub_epi32(v[6], v[22]);
+ u[23] = _mm_sub_epi32(v[7], v[23]);
+ u[24] = _mm_sub_epi32(v[8], v[24]);
+ u[25] = _mm_sub_epi32(v[9], v[25]);
+ u[26] = _mm_sub_epi32(v[10], v[26]);
+ u[27] = _mm_sub_epi32(v[11], v[27]);
+ u[28] = _mm_sub_epi32(v[12], v[28]);
+ u[29] = _mm_sub_epi32(v[13], v[29]);
+ u[30] = _mm_sub_epi32(v[14], v[30]);
+ u[31] = _mm_sub_epi32(v[15], v[31]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+ v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
+ v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
+ v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
+ v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
+ v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
+ v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
+ v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
+ v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
+ v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
+ v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
+ v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
+ v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
+ v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
+ v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
+ v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
+ v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+ u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+ u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+ u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+ u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+ u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+ u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+ u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+ u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+ u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
+ u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
+ u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
+ u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
+ u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
+ u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
+ u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
+ u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
+
+ s[0] = _mm_packs_epi32(u[0], u[1]);
+ s[1] = _mm_packs_epi32(u[2], u[3]);
+ s[2] = _mm_packs_epi32(u[4], u[5]);
+ s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[4] = _mm_packs_epi32(u[8], u[9]);
+ s[5] = _mm_packs_epi32(u[10], u[11]);
+ s[6] = _mm_packs_epi32(u[12], u[13]);
+ s[7] = _mm_packs_epi32(u[14], u[15]);
+ s[8] = _mm_packs_epi32(u[16], u[17]);
+ s[9] = _mm_packs_epi32(u[18], u[19]);
+ s[10] = _mm_packs_epi32(u[20], u[21]);
+ s[11] = _mm_packs_epi32(u[22], u[23]);
+ s[12] = _mm_packs_epi32(u[24], u[25]);
+ s[13] = _mm_packs_epi32(u[26], u[27]);
+ s[14] = _mm_packs_epi32(u[28], u[29]);
+ s[15] = _mm_packs_epi32(u[30], u[31]);
+
+ // stage 2
+ u[0] = _mm_unpacklo_epi16(s[8], s[9]);
+ u[1] = _mm_unpackhi_epi16(s[8], s[9]);
+ u[2] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[3] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[4] = _mm_unpacklo_epi16(s[12], s[13]);
+ u[5] = _mm_unpackhi_epi16(s[12], s[13]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
+
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
+ u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+ u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+ u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+ u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+ u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+ u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+ u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+ u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+ x[0] = _mm_add_epi16(s[0], s[4]);
+ x[1] = _mm_add_epi16(s[1], s[5]);
+ x[2] = _mm_add_epi16(s[2], s[6]);
+ x[3] = _mm_add_epi16(s[3], s[7]);
+ x[4] = _mm_sub_epi16(s[0], s[4]);
+ x[5] = _mm_sub_epi16(s[1], s[5]);
+ x[6] = _mm_sub_epi16(s[2], s[6]);
+ x[7] = _mm_sub_epi16(s[3], s[7]);
+ x[8] = _mm_packs_epi32(u[0], u[1]);
+ x[9] = _mm_packs_epi32(u[2], u[3]);
+ x[10] = _mm_packs_epi32(u[4], u[5]);
+ x[11] = _mm_packs_epi32(u[6], u[7]);
+ x[12] = _mm_packs_epi32(u[8], u[9]);
+ x[13] = _mm_packs_epi32(u[10], u[11]);
+ x[14] = _mm_packs_epi32(u[12], u[13]);
+ x[15] = _mm_packs_epi32(u[14], u[15]);
+
+ // stage 3
+ u[0] = _mm_unpacklo_epi16(x[4], x[5]);
+ u[1] = _mm_unpackhi_epi16(x[4], x[5]);
+ u[2] = _mm_unpacklo_epi16(x[6], x[7]);
+ u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+ u[4] = _mm_unpacklo_epi16(x[12], x[13]);
+ u[5] = _mm_unpackhi_epi16(x[12], x[13]);
+ u[6] = _mm_unpacklo_epi16(x[14], x[15]);
+ u[7] = _mm_unpackhi_epi16(x[14], x[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
+
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ s[0] = _mm_add_epi16(x[0], x[2]);
+ s[1] = _mm_add_epi16(x[1], x[3]);
+ s[2] = _mm_sub_epi16(x[0], x[2]);
+ s[3] = _mm_sub_epi16(x[1], x[3]);
+ s[4] = _mm_packs_epi32(v[0], v[1]);
+ s[5] = _mm_packs_epi32(v[2], v[3]);
+ s[6] = _mm_packs_epi32(v[4], v[5]);
+ s[7] = _mm_packs_epi32(v[6], v[7]);
+ s[8] = _mm_add_epi16(x[8], x[10]);
+ s[9] = _mm_add_epi16(x[9], x[11]);
+ s[10] = _mm_sub_epi16(x[8], x[10]);
+ s[11] = _mm_sub_epi16(x[9], x[11]);
+ s[12] = _mm_packs_epi32(v[8], v[9]);
+ s[13] = _mm_packs_epi32(v[10], v[11]);
+ s[14] = _mm_packs_epi32(v[12], v[13]);
+ s[15] = _mm_packs_epi32(v[14], v[15]);
+
+ // stage 4
+ u[0] = _mm_unpacklo_epi16(s[2], s[3]);
+ u[1] = _mm_unpackhi_epi16(s[2], s[3]);
+ u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+ u[3] = _mm_unpackhi_epi16(s[6], s[7]);
+ u[4] = _mm_unpacklo_epi16(s[10], s[11]);
+ u[5] = _mm_unpackhi_epi16(s[10], s[11]);
+ u[6] = _mm_unpacklo_epi16(s[14], s[15]);
+ u[7] = _mm_unpackhi_epi16(s[14], s[15]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
+ v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
+ v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
+ v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
+ v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
+ v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
+ v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
+ v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+ u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+ u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+ u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+ u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+ u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+ u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+ u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+ u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+ v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+ v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
+ v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
+ v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
+ v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
+ v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
+ v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
+ v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
+ v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+
+ in[0] = s[0];
+ in[1] = _mm_sub_epi16(kZero, s[8]);
+ in[2] = s[12];
+ in[3] = _mm_sub_epi16(kZero, s[4]);
+ in[4] = _mm_packs_epi32(v[4], v[5]);
+ in[5] = _mm_packs_epi32(v[12], v[13]);
+ in[6] = _mm_packs_epi32(v[8], v[9]);
+ in[7] = _mm_packs_epi32(v[0], v[1]);
+ in[8] = _mm_packs_epi32(v[2], v[3]);
+ in[9] = _mm_packs_epi32(v[10], v[11]);
+ in[10] = _mm_packs_epi32(v[14], v[15]);
+ in[11] = _mm_packs_epi32(v[6], v[7]);
+ in[12] = s[5];
+ in[13] = _mm_sub_epi16(kZero, s[13]);
+ in[14] = s[9];
+ in[15] = _mm_sub_epi16(kZero, s[1]);
+}
+
+void fdct16_1d_avx2(__m128i *in0, __m128i *in1) {
+ fdct16_1d_8col_avx2(in0);
+ fdct16_1d_8col_avx2(in1);
+ array_transpose_16x16_avx2(in0, in1);
+}
+
+void fadst16_1d_avx2(__m128i *in0, __m128i *in1) {
+ fadst16_1d_8col_avx2(in0);
+ fadst16_1d_8col_avx2(in1);
+ array_transpose_16x16_avx2(in0, in1);
+}
+
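Each __m128i carries eight int16_t coefficients, so a 16x16 block travels as two arrays of sixteen registers; the wrappers above run the 8-column helper on each half and then transpose, so the next pass works along the other dimension. A rough layout sketch, assuming load_buffer_16x16_avx2 splits each row into left and right halves:

    __m128i in0[16];  /* row r, columns 0-7  (assumed layout) */
    __m128i in1[16];  /* row r, columns 8-15 (assumed layout) */
    /* pass 1: transform both halves, then array_transpose_16x16_avx2()
     * swaps rows and columns so pass 2 transforms the other direction. */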
+void vp9_short_fht16x16_avx2(const int16_t *input, int16_t *output,
+ int stride, int tx_type) {
+ __m128i in0[16], in1[16];
+ load_buffer_16x16_avx2(input, in0, in1, stride);
+ switch (tx_type) {
+ case 0: // DCT_DCT
+ fdct16_1d_avx2(in0, in1);
+ right_shift_16x16_avx2(in0, in1);
+ fdct16_1d_avx2(in0, in1);
+ break;
+ case 1: // ADST_DCT
+ fadst16_1d_avx2(in0, in1);
+ right_shift_16x16_avx2(in0, in1);
+ fdct16_1d_avx2(in0, in1);
+ break;
+ case 2: // DCT_ADST
+ fdct16_1d_avx2(in0, in1);
+ right_shift_16x16_avx2(in0, in1);
+ fadst16_1d_avx2(in0, in1);
+ break;
+ case 3: // ADST_ADST
+ fadst16_1d_avx2(in0, in1);
+ right_shift_16x16_avx2(in0, in1);
+ fadst16_1d_avx2(in0, in1);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ write_buffer_16x16_avx2(output, in0, in1, 16);
+}
+
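The tx_type switch simply picks DCT or ADST for each of the two 1-D passes. A table-driven sketch of the same dispatch (the table name and layout are illustrative, not part of the real code):

    typedef void (*transform_1d_fn)(__m128i *, __m128i *);
    /* { first pass, second pass } per tx_type */
    static const transform_1d_fn fht16_passes[4][2] = {
      { fdct16_1d_avx2,  fdct16_1d_avx2  },   /* 0: DCT_DCT   */
      { fadst16_1d_avx2, fdct16_1d_avx2  },   /* 1: ADST_DCT  */
      { fdct16_1d_avx2,  fadst16_1d_avx2 },   /* 2: DCT_ADST  */
      { fadst16_1d_avx2, fadst16_1d_avx2 },   /* 3: ADST_ADST */
    };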
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2
+#define FDCT32x32_HIGH_PRECISION 0
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c"
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
+
+#define FDCT32x32_2D_AVX2 vp9_fdct32x32_avx2
+#define FDCT32x32_HIGH_PRECISION 1
+#include "vp9/encoder/x86/vp9_dct32x32_avx2.c" // NOLINT
+#undef FDCT32x32_2D_AVX2
+#undef FDCT32x32_HIGH_PRECISION
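The pair of includes above stamps out two 32x32 forward transforms from one template source: the same file body is compiled twice, once as the round-down (_rd) entry point with low precision and once as the regular entry point with high precision. Generic form of the trick, with illustrative names only:

    /* shared_template.c — one function whose name and rounding behaviour are
     * selected by the includer: */
    void FN_NAME(const int16_t *input, int16_t *output, int stride) {
      /* ... body can test HIGH_PRECISION with #if to pick the rounding ... */
    }

    /* includer.c — stamp out two variants from the same template: */
    #define FN_NAME fdct_variant_rd
    #define HIGH_PRECISION 0
    #include "shared_template.c"
    #undef FN_NAME
    #undef HIGH_PRECISION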
diff --git a/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c b/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
index dc11501..65431bd 100644
--- a/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/source/libvpx/vp9/encoder/x86/vp9_dct_sse2.c
@@ -26,24 +26,25 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// by constructing the 32 bit constant corresponding to that pair.
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
const __m128i kOne = _mm_set1_epi16(1);
- __m128i in0, in1, in2, in3;
+ __m128i in0, in1;
// Load inputs.
{
in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+ in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
+ (input + 1 * stride)));
+ in1 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+ in1 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)
+ (input + 3 * stride)), in1);
+
// x = x << 4
in0 = _mm_slli_epi16(in0, 4);
in1 = _mm_slli_epi16(in1, 4);
- in2 = _mm_slli_epi16(in2, 4);
- in3 = _mm_slli_epi16(in3, 4);
// if (i == 0 && input[0]) input[0] += 1;
{
       // The mask will only contain whether the first value is zero, all
@@ -60,18 +61,18 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
     // Transform 1/2: Add/subtract
- const __m128i r0 = _mm_add_epi16(in0, in3);
- const __m128i r1 = _mm_add_epi16(in1, in2);
- const __m128i r2 = _mm_sub_epi16(in1, in2);
- const __m128i r3 = _mm_sub_epi16(in0, in3);
+ const __m128i r0 = _mm_add_epi16(in0, in1);
+ const __m128i r1 = _mm_sub_epi16(in0, in1);
+ const __m128i r2 = _mm_unpacklo_epi64(r0, r1);
+ const __m128i r3 = _mm_unpackhi_epi64(r0, r1);
// Transform 1/2: Interleave to do the multiply by constants which gets us
// into 32 bits.
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t0 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t2 = _mm_unpackhi_epi16(r2, r3);
const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p08_p24);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_p24_m08);
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
@@ -90,24 +91,21 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// 00 10 01 11 02 12 03 13
// 20 30 21 31 22 32 23 33
in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ in1 = _mm_shuffle_epi32(in1, 0x4E);
// 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1
- // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3
- if (0 == pass) {
- // Extract values in the high part for second pass as transform code
- // only uses the first four values.
- in1 = _mm_unpackhi_epi64(in0, in0);
- in3 = _mm_unpackhi_epi64(in2, in2);
- } else {
- // Post-condition output and store it (v + 1) >> 2, taking advantage
- // of the fact 1/3 are stored just after 0/2.
- __m128i out01 = _mm_add_epi16(in0, kOne);
- __m128i out23 = _mm_add_epi16(in2, kOne);
- out01 = _mm_srai_epi16(out01, 2);
- out23 = _mm_srai_epi16(out23, 2);
- _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
- _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
- }
+ // 02 12 22 32 03 13 23 33 in1 contains 2 followed by 3
+ }
+ in1 = _mm_shuffle_epi32(in1, 0x4E);
+  // Post-condition the output as (v + 1) >> 2 and store it, taking advantage
+  // of the fact that rows 1/3 are stored just after rows 0/2.
+ {
+ __m128i out01 = _mm_add_epi16(in0, kOne);
+ __m128i out23 = _mm_add_epi16(in1, kOne);
+ out01 = _mm_srai_epi16(out01, 2);
+ out23 = _mm_srai_epi16(out23, 2);
+ _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
+ _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
}
}
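The rewritten vp9_fdct4x4_sse2 keeps all four input rows in just two registers (rows 0/1 in in0, rows 3/2 in in1), so the first butterfly stage of each pass becomes a single packed add and subtract, and the separate in2/in3 registers and the pass-0 extraction step disappear. The scalar stage it vectorizes, as a sketch:

    /* 4-point forward DCT, stage 1 (computed for all columns at once above). */
    static void fdct4_stage1_sketch(const int16_t in[4], int16_t step[4]) {
      step[0] = in[0] + in[3];
      step[1] = in[1] + in[2];
      step[2] = in[1] - in[2];
      step[3] = in[0] - in[3];
    }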
@@ -206,12 +204,12 @@ void fadst4_1d_sse2(__m128i *in) {
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8];
__m128i in7 = _mm_add_epi16(in[0], in[1]);
- in7 = _mm_sub_epi16(in7, in[3]);
u[0] = _mm_unpacklo_epi16(in[0], in[1]);
u[1] = _mm_unpacklo_epi16(in[2], in[3]);
u[2] = _mm_unpacklo_epi16(in7, kZero);
u[3] = _mm_unpacklo_epi16(in[2], kZero);
+ u[4] = _mm_unpacklo_epi16(in[3], kZero);
v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
@@ -219,9 +217,10 @@ void fadst4_1d_sse2(__m128i *in) {
v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = v[2];
+ u[1] = _mm_sub_epi32(v[2], v[6]);
u[2] = _mm_add_epi32(v[3], v[4]);
u[3] = _mm_sub_epi32(u[2], u[0]);
u[4] = _mm_slli_epi32(v[5], 2);
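The fadst4_1d_sse2 change stops folding in[3] into the 16-bit sum (previously in7 = in[0] + in[1] - in[3]); instead sinpi_3_64 * in[3] is computed separately as v[6] and subtracted after the multiply, so the correction happens on 32-bit products. This appears intended to keep the intermediate within int16 range; in scalar terms, with in0..in3 standing in for one column:

    int32_t x3_old = sinpi_3_64 * (int16_t)(in0 + in1 - in3);      /* 16-bit sum may wrap */
    int32_t x3_new = sinpi_3_64 * (int16_t)(in0 + in1)
                     - sinpi_3_64 * in3;                           /* difference taken in 32 bits */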
diff --git a/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm b/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
index 533456b..1a9e4e8 100644
--- a/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/source/libvpx/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -118,6 +118,14 @@ SECTION .text
RET
%endmacro
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
%macro SUBPEL_VARIANCE 1-2 0 ; W
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
@@ -129,41 +137,85 @@ SECTION .text
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
; difference on Win64
-%ifdef PIC
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse
-%define sec_str sec_strideq
-%else
-cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
- dst, dst_stride, height, sse
-%endif
-%define h heightd
-%define bilin_filter sseq
-%else
-%if %2 == 1 ; avg
-cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
- 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, \
- height, sse
-%if ARCH_X86_64
-%define h heightd
-%define sec_str sec_strideq
-%else
-%define h dword heightm
-%define sec_str sec_stridemp
-%endif
+
+%ifdef PIC ; 64bit PIC
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %endif
+ %define h heightd
+ %define bilin_filter sseq
%else
-cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
- dst, dst_stride, height, sse
-%define h heightd
-%endif
-%define bilin_filter bilin_filter_m
+ %if ARCH_X86=1 && CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse, g_bilin_filter, g_pw_8
+ %define h dword heightm
+ %define sec_str sec_stridemp
+
+ ;Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse, \
+ g_bilin_filter, g_pw_8
+ %define h heightd
+
+ ;Store bilin_filter and pw_8 location in stack
+ GET_GOT eax
+ add esp, 4 ; restore esp
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %endif
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+ %if ARCH_X86_64
+ %define h heightd
+ %define sec_str sec_strideq
+ %else
+ %define h dword heightm
+ %define sec_str sec_stridemp
+ %endif
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, height, sse
+ %define h heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
%endif
+
ASSERT %1 <= 16 ; m6 overflows if w > 16
pxor m6, m6 ; sum
pxor m7, m7 ; sse
@@ -329,11 +381,22 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_b m9
%define filter_rnd m10
%else ; x86-32 or mmx
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
.x_zero_y_other_loop:
%if %1 == 16
movu m0, [srcq]
@@ -615,12 +678,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_a m8
%define filter_y_b m9
%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add y_offsetq, bilin_filter
%define filter_y_a [y_offsetq]
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
%if %1 == 16
movu m0, [srcq]
movu m3, [srcq+1]
@@ -752,12 +826,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
.x_other_y_zero_loop:
%if %1 == 16
movu m0, [srcq]
@@ -873,12 +958,23 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_x_a m8
%define filter_x_b m9
%define filter_rnd m10
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
%define filter_x_a [x_offsetq]
%define filter_x_b [x_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
%if %1 == 16
movu m0, [srcq]
movu m1, [srcq+1]
@@ -1057,6 +1153,21 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_a m10
%define filter_y_b m11
%define filter_rnd m12
+%else ; x86-32
+%if ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is no unused register, so reuse the src_stride register.
+; src_stride then has to be reloaded from the stack whenever it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
%else
add x_offsetq, bilin_filter
add y_offsetq, bilin_filter
@@ -1066,6 +1177,8 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%define filter_y_b [y_offsetq+16]
%define filter_rnd [pw_8]
%endif
+%endif
+
; x_offset == bilin interpolation && y_offset == bilin interpolation
%if %1 == 16
movu m0, [srcq]
@@ -1093,7 +1206,9 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%endif
psraw m0, 4
psraw m2, 4
- add srcq, src_strideq
+
+ INC_SRC_BY_SRC_STRIDE
+
packuswb m0, m2
.x_other_y_other_loop:
%if cpuflag(ssse3)
@@ -1163,7 +1278,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
- add srcq, src_strideq
+ INC_SRC_BY_SRC_STRIDE
add dstq, dst_strideq
%else ; %1 < 16
movh m0, [srcq]
@@ -1184,12 +1299,17 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
%if cpuflag(ssse3)
packuswb m0, m0
%endif
- add srcq, src_strideq
+
+ INC_SRC_BY_SRC_STRIDE
+
.x_other_y_other_loop:
movh m2, [srcq]
movh m1, [srcq+1]
- movh m4, [srcq+src_strideq]
- movh m3, [srcq+src_strideq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movh m4, [srcq]
+ movh m3, [srcq+1]
+
%if cpuflag(ssse3)
punpcklbw m2, m1
punpcklbw m4, m3
@@ -1253,7 +1373,7 @@ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
SUM_SSE m0, m1, m2, m3, m6, m7
mova m0, m4
- lea srcq, [srcq+src_strideq*2]
+ INC_SRC_BY_SRC_STRIDE
lea dstq, [dstq+dst_strideq*2]
%endif
%if %2 == 1 ; avg
diff --git a/source/libvpx/vp9/vp9_common.mk b/source/libvpx/vp9/vp9_common.mk
index 11fa2e0..01c55a4 100644
--- a/source/libvpx/vp9/vp9_common.mk
+++ b/source/libvpx/vp9/vp9_common.mk
@@ -17,11 +17,9 @@ VP9_COMMON_SRCS-yes += common/vp9_alloccommon.c
VP9_COMMON_SRCS-yes += common/vp9_convolve.c
VP9_COMMON_SRCS-yes += common/vp9_convolve.h
VP9_COMMON_SRCS-yes += common/vp9_debugmodes.c
-VP9_COMMON_SRCS-yes += common/vp9_default_coef_probs.h
VP9_COMMON_SRCS-yes += common/vp9_entropy.c
VP9_COMMON_SRCS-yes += common/vp9_entropymode.c
VP9_COMMON_SRCS-yes += common/vp9_entropymv.c
-VP9_COMMON_SRCS-yes += common/vp9_extend.c
VP9_COMMON_SRCS-yes += common/vp9_filter.c
VP9_COMMON_SRCS-yes += common/vp9_filter.h
VP9_COMMON_SRCS-yes += common/vp9_findnearmv.c
@@ -34,7 +32,6 @@ VP9_COMMON_SRCS-yes += common/vp9_entropy.h
VP9_COMMON_SRCS-yes += common/vp9_entropymode.h
VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
VP9_COMMON_SRCS-yes += common/vp9_enums.h
-VP9_COMMON_SRCS-yes += common/vp9_extend.h
VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h
VP9_COMMON_SRCS-yes += common/vp9_idct.h
VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
@@ -47,7 +44,6 @@ VP9_COMMON_SRCS-yes += common/vp9_reconinter.h
VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
VP9_COMMON_SRCS-yes += common/vp9_rtcd.c
VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh
-VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h
VP9_COMMON_SRCS-yes += common/vp9_scale.h
VP9_COMMON_SRCS-yes += common/vp9_scale.c
VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
@@ -65,7 +61,6 @@ VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
VP9_COMMON_SRCS-yes += common/vp9_reconinter.c
VP9_COMMON_SRCS-yes += common/vp9_reconintra.c
VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c
-VP9_COMMON_SRCS-yes += common/vp9_treecoder.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.c
VP9_COMMON_SRCS-yes += common/vp9_common_data.h
VP9_COMMON_SRCS-yes += common/vp9_scan.c
@@ -74,6 +69,7 @@ VP9_COMMON_SRCS-yes += common/vp9_scan.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_postproc_x86.h
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
@@ -102,19 +98,31 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_vert_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred4_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred8_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred16_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_cols_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans32_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_filters_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_filters_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_macros_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_loopfilter_masks_dspr2.h
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mbloop_loopfilter_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
+VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_dc_only_idct_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct4x4_add_neon$(ASM)
@@ -122,6 +130,7 @@ VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_1_add_neon$(AS
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM)
diff --git a/source/libvpx/vp9/vp9_cx_iface.c b/source/libvpx/vp9/vp9_cx_iface.c
index 4d39670..6bfca8d 100644
--- a/source/libvpx/vp9/vp9_cx_iface.c
+++ b/source/libvpx/vp9/vp9_cx_iface.c
@@ -38,6 +38,7 @@ struct vp9_extracfg {
unsigned int rc_max_intra_bitrate_pct;
unsigned int lossless;
unsigned int frame_parallel_decoding_mode;
+ unsigned int aq_mode;
};
struct extraconfig_map {
@@ -66,6 +67,7 @@ static const struct extraconfig_map extracfg_map[] = {
0, /* rc_max_intra_bitrate_pct */
0, /* lossless */
0, /* frame_parallel_decoding_mode */
+ 0, /* aq_mode */
}
}
};
@@ -75,14 +77,14 @@ struct vpx_codec_alg_priv {
vpx_codec_enc_cfg_t cfg;
struct vp9_extracfg vp8_cfg;
VP9_CONFIG oxcf;
- VP9_PTR cpi;
+ VP9_PTR cpi;
unsigned char *cx_data;
- unsigned int cx_data_sz;
+ size_t cx_data_sz;
unsigned char *pending_cx_data;
- unsigned int pending_cx_data_sz;
+ size_t pending_cx_data_sz;
int pending_frame_count;
- uint32_t pending_frame_sizes[8];
- uint32_t pending_frame_magnitude;
+ size_t pending_frame_sizes[8];
+ size_t pending_frame_magnitude;
vpx_image_t preview_img;
vp8_postproc_cfg_t preview_ppcfg;
vpx_codec_pkt_list_decl(64) pkt_list;
@@ -98,7 +100,7 @@ static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
case VP8_ALTR_FRAME:
return VP9_ALT_FLAG;
}
- assert(!"Invalid Reference Frame");
+ assert(0 && "Invalid Reference Frame");
return VP9_LAST_FLAG;
}
@@ -157,6 +159,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK_HI(cfg, rc_max_quantizer, 0);
RANGE_CHECK_HI(cfg, rc_min_quantizer, 0);
}
+ RANGE_CHECK(vp8_cfg, aq_mode, 0, AQ_MODES_COUNT - 1);
RANGE_CHECK_HI(cfg, g_threads, 64);
RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
@@ -195,6 +198,10 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
RANGE_CHECK(vp8_cfg, arnr_type, 1, 3);
RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
+ // TODO(yaowu): remove this when ssim tuning is implemented for vp9
+ if (vp8_cfg->tuning == VP8_TUNE_SSIM)
+ ERROR("Option --tune=ssim is not currently supported in VP9.");
+
if (cfg->g_pass == VPX_RC_LAST_PASS) {
size_t packet_sz = sizeof(FIRSTPASS_STATS);
int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
@@ -335,6 +342,8 @@ static vpx_codec_err_t set_vp9e_config(VP9_CONFIG *oxcf,
oxcf->error_resilient_mode = cfg.g_error_resilient;
oxcf->frame_parallel_decoding_mode = vp8_cfg.frame_parallel_decoding_mode;
+ oxcf->aq_mode = vp8_cfg.aq_mode;
+
oxcf->ss_number_layers = cfg.ss_number_layers;
/*
printf("Current VP9 Settings: \n");
@@ -442,11 +451,10 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
MAP(VP8E_SET_ARNR_TYPE, xcfg.arnr_type);
MAP(VP8E_SET_TUNING, xcfg.tuning);
MAP(VP8E_SET_CQ_LEVEL, xcfg.cq_level);
- MAP(VP9E_SET_MAX_Q, ctx->cfg.rc_max_quantizer);
- MAP(VP9E_SET_MIN_Q, ctx->cfg.rc_min_quantizer);
MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, xcfg.rc_max_intra_bitrate_pct);
MAP(VP9E_SET_LOSSLESS, xcfg.lossless);
MAP(VP9E_SET_FRAME_PARALLEL_DECODING, xcfg.frame_parallel_decoding_mode);
+ MAP(VP9E_SET_AQ_MODE, xcfg.aq_mode);
}
res = validate_config(ctx, &ctx->cfg, &xcfg);
@@ -700,7 +708,7 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx,
unsigned int lib_flags;
YV12_BUFFER_CONFIG sd;
int64_t dst_time_stamp, dst_end_time_stamp;
- unsigned long size, cx_data_sz;
+ size_t size, cx_data_sz;
unsigned char *cx_data;
/* Set up internal flags */
@@ -1009,66 +1017,40 @@ static vpx_codec_err_t vp9e_set_scalemode(vpx_codec_alg_priv_t *ctx,
}
}
-static vpx_codec_err_t vp9e_set_width(vpx_codec_alg_priv_t *ctx, int ctr_id,
- va_list args) {
- unsigned int *data = va_arg(args, unsigned int *);
- if (data) {
- int res;
- res = vp9_set_size_literal(ctx->cpi, *data, 0);
- if (!res) {
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
- }
- } else {
- return VPX_CODEC_INVALID_PARAM;
- }
+static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id,
+ va_list args) {
+ int data = va_arg(args, int);
+ vp9_set_svc(ctx->cpi, data);
+ return VPX_CODEC_OK;
}
-static vpx_codec_err_t vp9e_set_height(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- unsigned int *data = va_arg(args, unsigned int *);
+static vpx_codec_err_t vp9e_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
+ int ctr_id, va_list args) {
+ vpx_svc_parameters_t *data = va_arg(args, vpx_svc_parameters_t *);
+ VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
+ vpx_svc_parameters_t params;
- if (data) {
- int res;
- res = vp9_set_size_literal(ctx->cpi, 0, *data);
-
- if (!res) {
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
- }
- } else {
+ if (data == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
-}
-static vpx_codec_err_t vp9e_set_layer(vpx_codec_alg_priv_t *ctx,
- int ctr_id,
- va_list args) {
- unsigned int *data = va_arg(args, unsigned int *);
+ params = *(vpx_svc_parameters_t *)data;
- if (data) {
- int res;
- res = 0;
-
- res = vp9_switch_layer(ctx->cpi, *data);
+ cpi->current_layer = params.layer;
+ cpi->lst_fb_idx = params.lst_fb_idx;
+ cpi->gld_fb_idx = params.gld_fb_idx;
+ cpi->alt_fb_idx = params.alt_fb_idx;
- if (!res) {
- return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
- }
- } else {
+ if (vp9_set_size_literal(ctx->cpi, params.width, params.height) != 0) {
return VPX_CODEC_INVALID_PARAM;
}
-}
-static vpx_codec_err_t vp9e_set_svc(vpx_codec_alg_priv_t *ctx, int ctr_id,
- va_list args) {
- int data = va_arg(args, int);
- vp9_set_svc(ctx->cpi, data);
+ ctx->cfg.rc_max_quantizer = params.max_quantizer;
+ ctx->cfg.rc_min_quantizer = params.min_quantizer;
+
+ set_vp9e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg);
+ vp9_change_config(ctx->cpi, &ctx->oxcf);
+
return VPX_CODEC_OK;
}
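vp9e_set_svc_parameters replaces the removed per-field width/height/layer controls: one call sets the active spatial layer, its reference-buffer indices, its resolution, and its quantizer range, then rebuilds the encoder config. A hedged usage sketch that fills only the fields the handler above reads (encoder stands for an initialized vpx_codec_ctx_t, values are illustrative, and the real struct may carry more members):

    vpx_svc_parameters_t params = {0};
    params.layer = 0;                 /* spatial layer to configure */
    params.width = 320;
    params.height = 180;
    params.lst_fb_idx = 0;
    params.gld_fb_idx = 1;
    params.alt_fb_idx = 2;
    params.min_quantizer = 20;
    params.max_quantizer = 45;
    vpx_codec_control(&encoder, VP9E_SET_SVC_PARAMETERS, &params);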
@@ -1096,16 +1078,13 @@ static vpx_codec_ctrl_fn_map_t vp9e_ctf_maps[] = {
{VP8E_SET_ARNR_TYPE, set_param},
{VP8E_SET_TUNING, set_param},
{VP8E_SET_CQ_LEVEL, set_param},
- {VP9E_SET_MAX_Q, set_param},
- {VP9E_SET_MIN_Q, set_param},
{VP8E_SET_MAX_INTRA_BITRATE_PCT, set_param},
{VP9E_SET_LOSSLESS, set_param},
{VP9E_SET_FRAME_PARALLEL_DECODING, set_param},
+ {VP9E_SET_AQ_MODE, set_param},
{VP9_GET_REFERENCE, get_reference},
- {VP9E_SET_WIDTH, vp9e_set_width},
- {VP9E_SET_HEIGHT, vp9e_set_height},
- {VP9E_SET_LAYER, vp9e_set_layer},
{VP9E_SET_SVC, vp9e_set_svc},
+ {VP9E_SET_SVC_PARAMETERS, vp9e_set_svc_parameters},
{ -1, NULL},
};
diff --git a/source/libvpx/vp9/vp9_dx_iface.c b/source/libvpx/vp9/vp9_dx_iface.c
index 5dacab4..c123c46 100644
--- a/source/libvpx/vp9/vp9_dx_iface.c
+++ b/source/libvpx/vp9/vp9_dx_iface.c
@@ -59,6 +59,13 @@ struct vpx_codec_alg_priv {
int img_setup;
int img_avail;
int invert_tile_order;
+ int fb_lru;
+
+ /* External buffer info to save for VP9 common. */
+ vpx_codec_frame_buffer_t *fb_list; // External frame buffers
+ int fb_count; // Total number of frame buffers
+ vpx_realloc_frame_buffer_cb_fn_t realloc_fb_cb;
+ void *user_priv; // Private data associated with the external frame buffers.
};
static unsigned long priv_sz(const vpx_codec_dec_cfg_t *si,
@@ -307,10 +314,32 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
ctx->postproc_cfg.noise_level = 0;
}
- if (!optr)
+ if (!optr) {
res = VPX_CODEC_ERROR;
- else
+ } else {
+ VP9D_COMP *const pbi = (VP9D_COMP*)optr;
+ VP9_COMMON *const cm = &pbi->common;
+ if (ctx->fb_list != NULL && ctx->realloc_fb_cb != NULL &&
+ ctx->fb_count > 0) {
+ cm->fb_list = ctx->fb_list;
+ cm->fb_count = ctx->fb_count;
+ cm->realloc_fb_cb = ctx->realloc_fb_cb;
+ cm->user_priv = ctx->user_priv;
+ } else {
+ cm->fb_count = FRAME_BUFFERS;
+ }
+ cm->fb_lru = ctx->fb_lru;
+ CHECK_MEM_ERROR(cm, cm->yv12_fb,
+ vpx_calloc(cm->fb_count, sizeof(*cm->yv12_fb)));
+ CHECK_MEM_ERROR(cm, cm->fb_idx_ref_cnt,
+ vpx_calloc(cm->fb_count, sizeof(*cm->fb_idx_ref_cnt)));
+ if (cm->fb_lru) {
+ CHECK_MEM_ERROR(cm, cm->fb_idx_ref_lru,
+ vpx_calloc(cm->fb_count,
+ sizeof(*cm->fb_idx_ref_lru)));
+ }
ctx->pbi = optr;
+ }
}
ctx->decoder_init = 1;
@@ -347,7 +376,7 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
}
if (vp9_receive_compressed_data(ctx->pbi, data_sz, data, deadline)) {
- VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+ VP9D_COMP *pbi = (VP9D_COMP*)ctx->pbi;
res = update_error_state(ctx, &pbi->common.error);
}
@@ -475,6 +504,27 @@ static vpx_image_t *vp9_get_frame(vpx_codec_alg_priv_t *ctx,
return img;
}
+static vpx_codec_err_t vp9_set_frame_buffers(
+ vpx_codec_alg_priv_t *ctx,
+ vpx_codec_frame_buffer_t *fb_list, int fb_count,
+ vpx_realloc_frame_buffer_cb_fn_t cb, void *user_priv) {
+ if (fb_count < REF_FRAMES) {
+ /* The application must pass in at least REF_FRAMES frame buffers. */
+ return VPX_CODEC_INVALID_PARAM;
+ } else if (!ctx->pbi) {
+    /* The decoder has not been initialized yet, so accept the external
+     * frame buffers and save them for later use.
+     */
+ ctx->fb_list = fb_list;
+ ctx->fb_count = fb_count;
+ ctx->realloc_fb_cb = cb;
+ ctx->user_priv = user_priv;
+ return VPX_CODEC_OK;
+ }
+
+ return VPX_CODEC_ERROR;
+}
+
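vp9_set_frame_buffers only accepts an external buffer list before the decoder instance exists, and the list must hold at least REF_FRAMES entries; the pointers are stashed on the context and handed to VP9_COMMON in decode_one above. A hedged application-side sketch — the public wrapper name vpx_codec_set_frame_buffers() is inferred from the set_fb interface slot added below, not shown in this diff, and realloc_cb/app_private stand for the application's callback and private pointer:

    /* Assumed public wrapper matching the set_fb slot; verify against vpx_decoder.h. */
    vpx_codec_frame_buffer_t fb_list[REF_FRAMES];   /* must be >= REF_FRAMES entries */
    /* ... point each fb_list[i] at application-owned memory before this call ... */
    if (vpx_codec_set_frame_buffers(&decoder, fb_list, REF_FRAMES,
                                    realloc_cb, app_private) != VPX_CODEC_OK) {
      /* too few buffers, or the decoder has already been initialized */
    }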
static vpx_codec_err_t vp9_xma_get_mmap(const vpx_codec_ctx_t *ctx,
vpx_codec_mmap_t *mmap,
vpx_codec_iter_t *iter) {
@@ -639,7 +689,7 @@ static vpx_codec_err_t get_last_ref_updates(vpx_codec_alg_priv_t *ctx,
int ctrl_id,
va_list args) {
int *update_info = va_arg(args, int *);
- VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+ VP9D_COMP *pbi = (VP9D_COMP*)ctx->pbi;
if (update_info) {
*update_info = pbi->refresh_frame_flags;
@@ -657,7 +707,7 @@ static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
int *corrupted = va_arg(args, int *);
if (corrupted) {
- VP9D_COMP *pbi = (VP9D_COMP *)ctx->pbi;
+ VP9D_COMP *pbi = (VP9D_COMP*)ctx->pbi;
if (pbi)
*corrupted = pbi->common.frame_to_show->corrupted;
else
@@ -668,6 +718,25 @@ static vpx_codec_err_t get_frame_corrupted(vpx_codec_alg_priv_t *ctx,
}
}
+static vpx_codec_err_t get_display_size(vpx_codec_alg_priv_t *ctx,
+ int ctrl_id,
+ va_list args) {
+ int *const display_size = va_arg(args, int *);
+
+ if (display_size) {
+ const VP9D_COMP *const pbi = (VP9D_COMP*)ctx->pbi;
+ if (pbi) {
+ display_size[0] = pbi->common.display_width;
+ display_size[1] = pbi->common.display_height;
+ } else {
+ return VPX_CODEC_ERROR;
+ }
+ return VPX_CODEC_OK;
+ } else {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+}
+
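The new VP9D_GET_DISPLAY_SIZE control returns the stream's intended display dimensions, which can differ from the coded frame size. Minimal usage sketch (decoder is an initialized vpx_codec_ctx_t that has decoded at least one frame):

    int dims[2] = { 0, 0 };
    if (vpx_codec_control(&decoder, VP9D_GET_DISPLAY_SIZE, dims) == VPX_CODEC_OK)
      printf("display size: %dx%d\n", dims[0], dims[1]);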
static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
int ctr_id,
va_list args) {
@@ -675,6 +744,21 @@ static vpx_codec_err_t set_invert_tile_order(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t set_frame_buffer_lru_cache(vpx_codec_alg_priv_t *ctx,
+ int ctr_id,
+ va_list args) {
+ VP9D_COMP *const pbi = (VP9D_COMP*)ctx->pbi;
+
+ // Save for later to pass into vp9 common.
+ ctx->fb_lru = va_arg(args, int);
+
+ if (pbi) {
+ VP9_COMMON *const cm = &pbi->common;
+ cm->fb_lru = ctx->fb_lru;
+ }
+ return VPX_CODEC_OK;
+}
+
static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
{VP8_SET_REFERENCE, set_reference},
{VP8_COPY_REFERENCE, copy_reference},
@@ -686,7 +770,9 @@ static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
{VP8D_GET_LAST_REF_UPDATES, get_last_ref_updates},
{VP8D_GET_FRAME_CORRUPTED, get_frame_corrupted},
{VP9_GET_REFERENCE, get_reference},
+ {VP9D_GET_DISPLAY_SIZE, get_display_size},
{VP9_INVERT_TILE_DECODE_ORDER, set_invert_tile_order},
+ {VP9D_SET_FRAME_BUFFER_LRU_CACHE, set_frame_buffer_lru_cache},
{ -1, NULL},
};
@@ -697,7 +783,8 @@ static vpx_codec_ctrl_fn_map_t ctf_maps[] = {
CODEC_INTERFACE(vpx_codec_vp9_dx) = {
"WebM Project VP9 Decoder" VERSION_STRING,
VPX_CODEC_INTERNAL_ABI_VERSION,
- VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC,
+ VPX_CODEC_CAP_DECODER | VP9_CAP_POSTPROC |
+ VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER,
/* vpx_codec_caps_t caps; */
vp9_init, /* vpx_codec_init_fn_t init; */
vp9_destroy, /* vpx_codec_destroy_fn_t destroy; */
@@ -709,6 +796,7 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
vp9_get_si, /* vpx_codec_get_si_fn_t get_si; */
vp9_decode, /* vpx_codec_decode_fn_t decode; */
vp9_get_frame, /* vpx_codec_frame_get_fn_t frame_get; */
+ vp9_set_frame_buffers, /* vpx_codec_set_frame_buffers_fn_t set_fb; */
},
{ // NOLINT
/* encoder functions */
diff --git a/source/libvpx/vp9/vp9cx.mk b/source/libvpx/vp9/vp9cx.mk
index 0993c6c..ce83a67 100644
--- a/source/libvpx/vp9/vp9cx.mk
+++ b/source/libvpx/vp9/vp9cx.mk
@@ -23,32 +23,31 @@ VP9_CX_SRCS-yes += encoder/vp9_dct.c
VP9_CX_SRCS-yes += encoder/vp9_dct.h
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
-VP9_CX_SRCS-yes += encoder/vp9_encodeintra.c
VP9_CX_SRCS-yes += encoder/vp9_encodemb.c
VP9_CX_SRCS-yes += encoder/vp9_encodemv.c
+VP9_CX_SRCS-yes += encoder/vp9_extend.c
VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
VP9_CX_SRCS-yes += encoder/vp9_block.h
VP9_CX_SRCS-yes += encoder/vp9_boolhuff.h
VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.h
VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
-VP9_CX_SRCS-yes += encoder/vp9_encodeintra.h
VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
+VP9_CX_SRCS-yes += encoder/vp9_extend.h
VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
VP9_CX_SRCS-yes += encoder/vp9_mcomp.h
-VP9_CX_SRCS-yes += encoder/vp9_modecosts.h
VP9_CX_SRCS-yes += encoder/vp9_onyx_int.h
VP9_CX_SRCS-yes += encoder/vp9_psnr.h
VP9_CX_SRCS-yes += encoder/vp9_quantize.h
VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
VP9_CX_SRCS-yes += encoder/vp9_rdopt.h
+VP9_CX_SRCS-yes += encoder/vp9_sadmxn.h
VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
VP9_CX_SRCS-yes += encoder/vp9_variance.h
VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
-VP9_CX_SRCS-yes += encoder/vp9_modecosts.c
VP9_CX_SRCS-yes += encoder/vp9_onyx_if.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
VP9_CX_SRCS-yes += encoder/vp9_picklpf.h
@@ -105,4 +104,7 @@ VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct32x32_sse2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2.c
+
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/source/libvpx/vp9/vp9dx.mk b/source/libvpx/vp9/vp9dx.mk
index 3a27cdd..f431721 100644
--- a/source/libvpx/vp9/vp9dx.mk
+++ b/source/libvpx/vp9/vp9dx.mk
@@ -19,8 +19,8 @@ VP9_DX_SRCS-yes += vp9_dx_iface.c
VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.c
VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
-VP9_DX_SRCS-yes += decoder/vp9_decodframe.c
-VP9_DX_SRCS-yes += decoder/vp9_decodframe.h
+VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
+VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.h
VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h
@@ -30,7 +30,6 @@ VP9_DX_SRCS-yes += decoder/vp9_onyxd.h
VP9_DX_SRCS-yes += decoder/vp9_onyxd_int.h
VP9_DX_SRCS-yes += decoder/vp9_thread.c
VP9_DX_SRCS-yes += decoder/vp9_thread.h
-VP9_DX_SRCS-yes += decoder/vp9_treereader.h
VP9_DX_SRCS-yes += decoder/vp9_onyxd_if.c
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c
VP9_DX_SRCS-yes += decoder/vp9_dsubexp.h
diff --git a/source/libvpx/vp9_spatial_scalable_encoder.c b/source/libvpx/vp9_spatial_scalable_encoder.c
index 8bb582f..a727f50 100644
--- a/source/libvpx/vp9_spatial_scalable_encoder.c
+++ b/source/libvpx/vp9_spatial_scalable_encoder.c
@@ -13,62 +13,77 @@
* VP9 encoding scheme based on spatial scalability for video applications
* that benefit from a scalable bitstream.
*/
-#include <stdio.h>
-#include <stdlib.h>
+
#include <stdarg.h>
-#include <time.h>
+#include <stdlib.h>
#include <string.h>
-#include <unistd.h>
-#include <libgen.h>
-#define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx/vpx_encoder.h"
+#include <time.h>
+#include "./args.h"
+#include "./ivfenc.h"
+#include "./tools_common.h"
+#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
-#define interface (vpx_codec_vp9_cx())
-#define fourcc 0x30395056
-#define IVF_FILE_HDR_SZ (32)
-#define IVF_FRAME_HDR_SZ (12)
-#define NUM_BUFFERS 8
-
-char *input_filename;
-char *output_filename;
-unsigned int number_frames_to_code = 60 * 60;
-unsigned int number_frames_to_skip = 0;
-unsigned int number_spatial_layers = 5;
-unsigned int key_period = 100;
-
-typedef enum ENCODING_MODE {
- INTER_LAYER_PREDICTION_I,
- INTER_LAYER_PREDICTION_IP,
- USE_GOLDEN_FRAME
-} ENCODING_MODE;
-
-static void mem_put_le16(char *mem, unsigned int val) {
- mem[0] = val;
- mem[1] = val >> 8;
-}
-
-static void mem_put_le32(char *mem, unsigned int val) {
- mem[0] = val;
- mem[1] = val >> 8;
- mem[2] = val >> 16;
- mem[3] = val >> 24;
-}
-
-static void usage(char *program_name) {
- printf(
- "Usage: %s [-f frames] [-s skip_frames] [-w width] [-h height] \n\t"
- "[-n rate_num] [-d rate_den] [-b bitrate] [-l layers] "
- "<input_filename> <output_filename>\n",
- basename(program_name));
- exit(EXIT_FAILURE);
-}
-
-static void die(const char *fmt, ...) {
- va_list ap;
+#include "vpx/vpx_encoder.h"
- va_start(ap, fmt);
- vprintf(fmt, ap);
- if (fmt[strlen(fmt) - 1] != '\n') printf("\n");
+static const struct arg_enum_list encoding_mode_enum[] = {
+ {"i", INTER_LAYER_PREDICTION_I},
+ {"alt-ip", ALT_INTER_LAYER_PREDICTION_IP},
+ {"ip", INTER_LAYER_PREDICTION_IP},
+ {"gf", USE_GOLDEN_FRAME},
+ {NULL, 0}
+};
+
+static const arg_def_t encoding_mode_arg = ARG_DEF_ENUM(
+ "m", "encoding-mode", 1, "Encoding mode algorithm", encoding_mode_enum);
+static const arg_def_t skip_frames_arg =
+ ARG_DEF("s", "skip-frames", 1, "input frames to skip");
+static const arg_def_t frames_arg =
+ ARG_DEF("f", "frames", 1, "number of frames to encode");
+static const arg_def_t width_arg = ARG_DEF("w", "width", 1, "source width");
+static const arg_def_t height_arg = ARG_DEF("h", "height", 1, "source height");
+static const arg_def_t timebase_arg =
+ ARG_DEF("t", "timebase", 1, "timebase (num/den)");
+static const arg_def_t bitrate_arg = ARG_DEF(
+ "b", "target-bitrate", 1, "encoding bitrate, in kilobits per second");
+static const arg_def_t layers_arg =
+ ARG_DEF("l", "layers", 1, "number of SVC layers");
+static const arg_def_t kf_dist_arg =
+ ARG_DEF("k", "kf-dist", 1, "number of frames between keyframes");
+static const arg_def_t scale_factors_arg =
+ ARG_DEF("r", "scale-factors", 1, "scale factors (lowest to highest layer)");
+static const arg_def_t quantizers_arg =
+ ARG_DEF("q", "quantizers", 1, "quantizers (lowest to highest layer)");
+
+static const arg_def_t *svc_args[] = {
+ &encoding_mode_arg, &frames_arg, &width_arg, &height_arg,
+ &timebase_arg, &bitrate_arg, &skip_frames_arg, &layers_arg,
+ &kf_dist_arg, &scale_factors_arg, &quantizers_arg, NULL
+};
+
+static const SVC_ENCODING_MODE default_encoding_mode =
+ INTER_LAYER_PREDICTION_IP;
+static const uint32_t default_frames_to_skip = 0;
+static const uint32_t default_frames_to_code = 60 * 60;
+static const uint32_t default_width = 1920;
+static const uint32_t default_height = 1080;
+static const uint32_t default_timebase_num = 1;
+static const uint32_t default_timebase_den = 60;
+static const uint32_t default_bitrate = 1000;
+static const uint32_t default_spatial_layers = 5;
+static const uint32_t default_kf_dist = 100;
+
+typedef struct {
+ char *output_filename;
+ uint32_t frames_to_code;
+ uint32_t frames_to_skip;
+ struct VpxInputContext input_ctx;
+} AppInput;
+
+void usage_exit(const char *exec_name) {
+ fprintf(stderr, "Usage: %s <options> input_filename output_filename\n",
+ exec_name);
+ fprintf(stderr, "Options:\n");
+ arg_show_usage(stderr, svc_args);
exit(EXIT_FAILURE);
}
@@ -80,408 +95,182 @@ static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
exit(EXIT_FAILURE);
}
-static int read_frame(FILE *f, vpx_image_t *img) {
- size_t nbytes, to_read;
- int res = 1;
-
- to_read = img->w * img->h * 3 / 2;
- nbytes = fread(img->planes[0], 1, to_read, f);
- if (nbytes != to_read) {
- res = 0;
- if (nbytes > 0)
- printf("Warning: Read partial frame. Check your width & height!\n");
- }
- return res;
-}
-
-static int read_dummy_frame(vpx_image_t *img) {
- size_t to_read;
-
- to_read = img->w * img->h * 3 / 2;
- memset(img->planes[0], 129, to_read);
+static int create_dummy_frame(vpx_image_t *img) {
+ const size_t buf_size = img->w * img->h * 3 / 2;
+ memset(img->planes[0], 129, buf_size);
return 1;
}
-static void write_ivf_file_header(FILE *outfile, const vpx_codec_enc_cfg_t *cfg,
- int frame_cnt) {
- char header[32];
-
- if (cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS) return;
- header[0] = 'D';
- header[1] = 'K';
- header[2] = 'I';
- header[3] = 'F';
- mem_put_le16(header + 4, 0); /* version */
- mem_put_le16(header + 6, 32); /* headersize */
- mem_put_le32(header + 8, fourcc); /* headersize */
- mem_put_le16(header + 12, cfg->g_w); /* width */
- mem_put_le16(header + 14, cfg->g_h); /* height */
- mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
- mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
- mem_put_le32(header + 24, frame_cnt); /* length */
- mem_put_le32(header + 28, 0); /* unused */
-
- (void)fwrite(header, 1, 32, outfile);
-}
-
-static void write_ivf_frame_header(FILE *outfile,
- const vpx_codec_cx_pkt_t *pkt) {
- char header[12];
- vpx_codec_pts_t pts;
-
- if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return;
-
- pts = pkt->data.frame.pts;
- mem_put_le32(header, pkt->data.frame.sz);
- mem_put_le32(header + 4, pts & 0xFFFFFFFF);
- mem_put_le32(header + 8, pts >> 32);
-
- (void)fwrite(header, 1, 12, outfile);
-}
-
-static void check_parameters() {
- if (number_spatial_layers > 5) die("Cannot support more than 5 layers");
-}
-
-static void parse_command_line(int argc, char **argv,
- vpx_codec_enc_cfg_t *cfg) {
- unsigned int width = 1920;
- unsigned int height = 1080;
- unsigned int timebase_num = 1;
- unsigned int timebase_den = 60;
- unsigned int bitrate = 1000;
- int c;
+static void parse_command_line(int argc, const char **argv_,
+ AppInput *app_input, SvcContext *svc_ctx,
+ vpx_codec_enc_cfg_t *enc_cfg) {
+ struct arg arg;
+ char **argv, **argi, **argj;
vpx_codec_err_t res;
- opterr = 0;
- while ((c = getopt(argc, argv, "f:w:h:n:d:b:s:l:p:")) != -1) switch (c) {
- case 'f':
- number_frames_to_code = atoi(optarg);
- break;
- case 'w':
- width = atoi(optarg);
- break;
- case 'h':
- height = atoi(optarg);
- break;
- case 'n':
- timebase_num = atoi(optarg);
- break;
- case 'd':
- timebase_den = atoi(optarg);
- break;
- case 'b':
- bitrate = atoi(optarg);
- break;
- case 's':
- number_frames_to_skip = atoi(optarg);
- break;
- case 'l':
- number_spatial_layers = atoi(optarg);
- break;
- case 'p':
- key_period = atoi(optarg);
- break;
- case '?':
- usage(argv[0]);
- }
-
- // Parse required parameters
- if (argc - optind != 2) {
- usage(argv[0]);
- }
-
- input_filename = argv[optind];
- output_filename = argv[optind + 1];
+ // initialize SvcContext with parameters that will be passed to vpx_svc_init
+ svc_ctx->log_level = SVC_LOG_DEBUG;
+ svc_ctx->spatial_layers = default_spatial_layers;
+ svc_ctx->encoding_mode = default_encoding_mode;
- if (width < 16 || width % 2 || height < 16 || height % 2)
- die("Invalid resolution: %d x %d", width, height);
-
- /* Populate encoder configuration */
- res = vpx_codec_enc_config_default(interface, cfg, 0);
+ // start with default encoder configuration
+ res = vpx_codec_enc_config_default(vpx_codec_vp9_cx(), enc_cfg, 0);
if (res) {
die("Failed to get config: %s\n", vpx_codec_err_to_string(res));
}
- printf(
- "Codec %s\nframes: %d, skip: %d, layers: %d\n"
- "width %d, height: %d, \n"
- "num: %d, den: %d, bitrate: %d, \n"
- "key period: %d \n",
- vpx_codec_iface_name(interface), number_frames_to_code,
- number_frames_to_skip, number_spatial_layers, width, height, timebase_num,
- timebase_den, bitrate, key_period);
-
- // Do minimal check at the application level. Encoder parameters will be
- // checked internally
- check_parameters();
-
- cfg->rc_target_bitrate = bitrate;
- cfg->g_w = width;
- cfg->g_h = height;
- cfg->g_timebase.num = timebase_num;
- cfg->g_timebase.den = timebase_den;
- cfg->ss_number_layers = number_spatial_layers;
-}
-
-static void set_default_configuration(vpx_codec_enc_cfg_t *cfg) {
- /* Real time parameters */
- cfg->rc_dropframe_thresh = 0;
- cfg->rc_end_usage = VPX_CBR;
- cfg->rc_resize_allowed = 0;
- cfg->rc_min_quantizer = 33;
- cfg->rc_max_quantizer = 33;
- cfg->rc_undershoot_pct = 100;
- cfg->rc_overshoot_pct = 15;
- cfg->rc_buf_initial_sz = 500;
- cfg->rc_buf_optimal_sz = 600;
- cfg->rc_buf_sz = 1000;
-
- /* Enable error resilient mode */
- cfg->g_error_resilient = 1;
- cfg->g_lag_in_frames = 0;
-
- /* Disable automatic keyframe placement */
- cfg->kf_mode = VPX_KF_DISABLED;
- cfg->kf_min_dist = cfg->kf_max_dist = 3000;
-}
-
-static void initialize_codec(vpx_codec_ctx_t *codec, vpx_codec_enc_cfg_t *cfg) {
- int max_intra_size_pct;
-
- /* Initialize codec */
- if (vpx_codec_enc_init(codec, interface, cfg, VPX_CODEC_USE_PSNR))
- die_codec(codec, "Failed to initialize encoder");
-
- vpx_codec_control(codec, VP9E_SET_SVC, 1);
- /* Cap CPU & first I-frame size */
- vpx_codec_control(codec, VP8E_SET_CPUUSED, 1);
- vpx_codec_control(codec, VP8E_SET_STATIC_THRESHOLD, 1);
- vpx_codec_control(codec, VP8E_SET_NOISE_SENSITIVITY, 1);
- vpx_codec_control(codec, VP8E_SET_TOKEN_PARTITIONS, 1);
+ // update enc_cfg with app default values
+ enc_cfg->g_w = default_width;
+ enc_cfg->g_h = default_height;
+ enc_cfg->g_timebase.num = default_timebase_num;
+ enc_cfg->g_timebase.den = default_timebase_den;
+ enc_cfg->rc_target_bitrate = default_bitrate;
+ enc_cfg->kf_min_dist = default_kf_dist;
+ enc_cfg->kf_max_dist = default_kf_dist;
+
+ // initialize AppInput with default values
+ app_input->frames_to_code = default_frames_to_code;
+ app_input->frames_to_skip = default_frames_to_skip;
+
+ // process command line options
+ argv = argv_dup(argc - 1, argv_ + 1);
+ for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
+ arg.argv_step = 1;
+
+ if (arg_match(&arg, &encoding_mode_arg, argi)) {
+ svc_ctx->encoding_mode = arg_parse_enum_or_int(&arg);
+ } else if (arg_match(&arg, &frames_arg, argi)) {
+ app_input->frames_to_code = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &width_arg, argi)) {
+ enc_cfg->g_w = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &height_arg, argi)) {
+ enc_cfg->g_h = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &timebase_arg, argi)) {
+ enc_cfg->g_timebase = arg_parse_rational(&arg);
+ } else if (arg_match(&arg, &bitrate_arg, argi)) {
+ enc_cfg->rc_target_bitrate = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &skip_frames_arg, argi)) {
+ app_input->frames_to_skip = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &layers_arg, argi)) {
+ svc_ctx->spatial_layers = arg_parse_uint(&arg);
+ } else if (arg_match(&arg, &kf_dist_arg, argi)) {
+ enc_cfg->kf_min_dist = arg_parse_uint(&arg);
+ enc_cfg->kf_max_dist = enc_cfg->kf_min_dist;
+ } else if (arg_match(&arg, &scale_factors_arg, argi)) {
+ vpx_svc_set_scale_factors(svc_ctx, arg.val);
+ } else if (arg_match(&arg, &quantizers_arg, argi)) {
+ vpx_svc_set_quantizers(svc_ctx, arg.val);
+ } else {
+ ++argj;
+ }
+ }
- max_intra_size_pct =
- (int)(((double)cfg->rc_buf_optimal_sz * 0.5) *
- ((double)cfg->g_timebase.den / cfg->g_timebase.num) / 10.0);
- /* printf ("max_intra_size_pct=%d\n", max_intra_size_pct); */
+ // Check for unrecognized options
+ for (argi = argv; *argi; ++argi)
+ if (argi[0][0] == '-' && strlen(argi[0]) > 1)
+ die("Error: Unrecognized option %s\n", *argi);
- vpx_codec_control(codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct);
-}
+ if (argv[0] == NULL || argv[1] == 0) {
+ usage_exit(argv_[0]);
+ }
+ app_input->input_ctx.filename = argv[0];
+ app_input->output_filename = argv[1];
+ free(argv);
-static int calculate_layer(int frame_cnt, int number_spatial_layers) {
- if (frame_cnt == 0)
- return 0;
- else
- return (frame_cnt + number_spatial_layers - 1) % number_spatial_layers;
-}
+ if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 ||
+ enc_cfg->g_h % 2)
+ die("Invalid resolution: %d x %d\n", enc_cfg->g_w, enc_cfg->g_h);
-static void switch_to_layer(int layer, unsigned int initial_width,
- unsigned int initial_height,
- vpx_codec_ctx_t *codec) {
- // Set layer size
- int scaling_factor_num[MAX_LAYERS] = {2, 1, 4, 2, 1};
- int scaling_factor_den[MAX_LAYERS] = {9, 3, 9, 3, 1};
-
- int quantizer[MAX_LAYERS] = {60, 53, 39, 33, 27};
-
- unsigned int current_width;
- unsigned int current_height;
-
- current_width = initial_width *
- scaling_factor_num[layer + 5 - number_spatial_layers] /
- scaling_factor_den[layer + 5 - number_spatial_layers];
- current_height = initial_height *
- scaling_factor_num[layer + 5 - number_spatial_layers] /
- scaling_factor_den[layer + 5 - number_spatial_layers];
-
- current_width += current_width % 2;
- current_height += current_height % 2;
-
- vpx_codec_control(codec, VP9E_SET_WIDTH, &current_width);
- vpx_codec_control(codec, VP9E_SET_HEIGHT, &current_height);
-
- // Set layer context
- vpx_codec_control(codec, VP9E_SET_LAYER, &layer);
- vpx_codec_control(codec, VP9E_SET_MAX_Q,
- quantizer[layer + 5 - number_spatial_layers]);
- vpx_codec_control(codec, VP9E_SET_MIN_Q,
- quantizer[layer + 5 - number_spatial_layers]);
-}
-
-static int get_flag(int is_I_frame_in_layer, int layer, ENCODING_MODE mode) {
- // First layer
- switch (mode) {
- case INTER_LAYER_PREDICTION_I:
- if (is_I_frame_in_layer && layer == 0) return VPX_EFLAG_FORCE_KF;
- if (layer == 0)
- return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
- VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
- else if (is_I_frame_in_layer)
- return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
- VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_LAST;
- else
- return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
- VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
- break;
-
- case INTER_LAYER_PREDICTION_IP:
- if (is_I_frame_in_layer && layer == 0) return VPX_EFLAG_FORCE_KF;
- if (layer == 0)
- return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
- VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
- else if (is_I_frame_in_layer)
- return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
- VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_LAST;
- else
- return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF;
- break;
-
- case USE_GOLDEN_FRAME:
- if (is_I_frame_in_layer && layer == 0) return VPX_EFLAG_FORCE_KF;
- if (2 * number_spatial_layers - NUM_BUFFERS <= layer) {
- if (layer == 0)
- return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
- VP8_EFLAG_NO_REF_ARF;
- else if (is_I_frame_in_layer)
- return VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_REF_GF |
- VP8_EFLAG_NO_REF_LAST;
- else
- return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
- } else {
- if (layer == 0)
- return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
- VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
- else if (is_I_frame_in_layer)
- return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
- VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_LAST;
- else
- return VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
- VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
- }
- break;
- default:
- return VPX_EFLAG_FORCE_KF;
- }
+ printf(
+ "Codec %s\nframes: %d, skip: %d\n"
+ "mode: %d, layers: %d\n"
+ "width %d, height: %d,\n"
+ "num: %d, den: %d, bitrate: %d,\n"
+ "gop size: %d\n",
+ vpx_codec_iface_name(vpx_codec_vp9_cx()), app_input->frames_to_code,
+ app_input->frames_to_skip, svc_ctx->encoding_mode,
+ svc_ctx->spatial_layers, enc_cfg->g_w, enc_cfg->g_h,
+ enc_cfg->g_timebase.num, enc_cfg->g_timebase.den,
+ enc_cfg->rc_target_bitrate, enc_cfg->kf_max_dist);
}
-int main(int argc, char **argv) {
- FILE *infile, *outfile[MAX_LAYERS];
+int main(int argc, const char **argv) {
+ AppInput app_input = {0};
+ FILE *outfile;
vpx_codec_ctx_t codec;
- vpx_codec_enc_cfg_t cfg;
- int frame_cnt = 0;
+ vpx_codec_enc_cfg_t enc_cfg;
+ SvcContext svc_ctx;
+ uint32_t i;
+ uint32_t frame_cnt = 0;
vpx_image_t raw;
- int frame_avail = 1;
- int got_data = 0;
- int i;
- int frames_in_layer[MAX_LAYERS] = {0};
- clock_t before;
- clock_t after;
+ vpx_codec_err_t res;
int pts = 0; /* PTS starts at 0 */
int frame_duration = 1; /* 1 timebase tick per frame */
+ vpx_codec_cx_pkt_t packet = {0};
+ packet.kind = VPX_CODEC_CX_FRAME_PKT;
- parse_command_line(argc, argv, &cfg);
+ memset(&svc_ctx, 0, sizeof(svc_ctx));
+ svc_ctx.log_print = 1;
+ parse_command_line(argc, argv, &app_input, &svc_ctx, &enc_cfg);
// Allocate image buffer
- if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 32))
- die("Failed to allocate image", cfg.g_w, cfg.g_h);
-
- set_default_configuration(&cfg);
-
- /* Open input file */
- if (!(infile = fopen(input_filename, "rb")))
- die("Failed to open %s for reading", argv[1]);
-
- /* Open output file */
- for (i = 0; i < number_spatial_layers; i++) {
- char file_name[512];
- snprintf(file_name, sizeof(file_name), "%s_%d.ivf", output_filename, i);
- if (!(outfile[i] = fopen(file_name, "wb")))
- die("Failed to open %s for writing", file_name);
- write_ivf_file_header(outfile[i], &cfg, 0);
- }
+ if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32))
+ die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
+
+ if (!(app_input.input_ctx.file = fopen(app_input.input_ctx.filename, "rb")))
+ die("Failed to open %s for reading\n", app_input.input_ctx.filename);
+
+ if (!(outfile = fopen(app_input.output_filename, "wb")))
+ die("Failed to open %s for writing\n", app_input.output_filename);
+
+ // Initialize codec
+ if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) !=
+ VPX_CODEC_OK)
+ die("Failed to initialize encoder\n");
- initialize_codec(&codec, &cfg);
+ ivf_write_file_header(outfile, &enc_cfg, VP9_FOURCC, 0);
// skip initial frames
- for (i = 0; i < number_frames_to_skip; i++) {
- read_frame(infile, &raw);
+ for (i = 0; i < app_input.frames_to_skip; ++i) {
+ read_yuv_frame(&app_input.input_ctx, &raw);
}
- before = clock();
- // Encoding frames
- while ((frame_avail || got_data) &&
- frame_cnt <= number_frames_to_code * number_spatial_layers) {
- int flags = 0;
- vpx_codec_iter_t iter = NULL;
- const vpx_codec_cx_pkt_t *pkt;
-
- int layer = calculate_layer(frame_cnt, number_spatial_layers);
- int is_I_frame_in_layer =
- (((frame_cnt - 1) / number_spatial_layers % key_period) == 0);
- int is_dummy = (frame_cnt == 0);
-
- if (is_dummy) { // Dummy frame
- flags = VPX_EFLAG_FORCE_KF;
- frame_avail = read_dummy_frame(&raw);
-
- } else { // Regular frame
- // Read a new frame only at the base layer
- if (layer == 0) frame_avail = read_frame(infile, &raw);
- switch_to_layer(layer, cfg.g_w, cfg.g_h, &codec);
- flags = get_flag(is_I_frame_in_layer, layer, INTER_LAYER_PREDICTION_I);
- }
+ // Encode frames
+ while (frame_cnt < app_input.frames_to_code) {
+ if (read_yuv_frame(&app_input.input_ctx, &raw)) break;
- // Actual Encoding
- if (vpx_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags,
- VPX_DL_REALTIME))
+ res = vpx_svc_encode(&svc_ctx, &codec, &raw, pts, frame_duration,
+ VPX_DL_REALTIME);
+ printf("%s", vpx_svc_get_message(&svc_ctx));
+ if (res != VPX_CODEC_OK) {
die_codec(&codec, "Failed to encode frame");
-
- got_data = 0;
- // Process data / Get PSNR statistics
- while ((pkt = vpx_codec_get_cx_data(&codec, &iter))) {
- got_data = 1;
- switch (pkt->kind) {
- case VPX_CODEC_CX_FRAME_PKT:
- for (i = layer; i < number_spatial_layers; i++) {
- write_ivf_frame_header(outfile[i], pkt);
- (void)fwrite(pkt->data.frame.buf, 1, pkt->data.frame.sz,
- outfile[i]);
- frames_in_layer[i]++;
- }
- break;
- case VPX_CODEC_PSNR_PKT:
- if (frame_cnt != 0)
- printf(
- "Processed Frame %d, layer %d, PSNR(Total/Y/U/V): "
- "%2.3f %2.3f %2.3f %2.3f \n",
- (frame_cnt - 1) / number_spatial_layers + 1, layer,
- pkt->data.psnr.psnr[0], pkt->data.psnr.psnr[1],
- pkt->data.psnr.psnr[2], pkt->data.psnr.psnr[3]);
- break;
- default:
- break;
- }
}
- frame_cnt++;
- // TODO(ivan): Modify ts later if(!layer)
+ if (vpx_svc_get_frame_size(&svc_ctx) > 0) {
+ packet.data.frame.pts = pts;
+ packet.data.frame.sz = vpx_svc_get_frame_size(&svc_ctx);
+ ivf_write_frame_header(outfile, &packet);
+ (void)fwrite(vpx_svc_get_buffer(&svc_ctx), 1,
+ vpx_svc_get_frame_size(&svc_ctx), outfile);
+ }
+ ++frame_cnt;
pts += frame_duration;
}
- // end while
-
- after = clock();
- printf("Processed %d frames in different resolutions in %ld ms.\n",
- frame_cnt - 1, (int)(after - before) / (CLOCKS_PER_SEC / 1000));
- fclose(infile);
+ printf("Processed %d frames\n", frame_cnt);
+ fclose(app_input.input_ctx.file);
if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
- /* Try to rewrite the output file headers with the actual frame count */
- for (i = 0; i < number_spatial_layers; i++) {
- if (!fseek(outfile[i], 0, SEEK_SET)) {
- write_ivf_file_header(outfile[i], &cfg, frames_in_layer[i]);
- }
- fclose(outfile[i]);
+ // rewrite the output file headers with the actual frame count
+ if (!fseek(outfile, 0, SEEK_SET)) {
+ ivf_write_file_header(outfile, &enc_cfg, VP9_FOURCC, frame_cnt);
}
+ fclose(outfile);
+ vpx_img_free(&raw);
+
+ // display average size, psnr
+ printf("%s", vpx_svc_dump_statistics(&svc_ctx));
+
+ vpx_svc_release(&svc_ctx);
return EXIT_SUCCESS;
}
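
For reference, the calling pattern that the rewritten example above settles on is roughly the following; IVF muxing, argument parsing and error checks are elided, and get_next_frame() is a hypothetical stand-in for read_yuv_frame(). Everything else (SvcContext, vpx_svc_init/encode/get_buffer/get_frame_size/dump_statistics/release) comes from the new SVC interface introduced in this patch.

/* Condensed sketch of the new SVC encode loop (not part of the patch).
 * get_next_frame() is hypothetical; return value checks are elided. */
#include <stdio.h>
#include <string.h>
#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

extern int get_next_frame(vpx_image_t *img);  /* hypothetical I420 source, 0 at end of input */

int encode_svc_stream(FILE *out, int frames_to_code) {
  SvcContext svc_ctx;
  vpx_codec_ctx_t codec;
  vpx_codec_enc_cfg_t enc_cfg;
  vpx_image_t raw;
  vpx_codec_pts_t pts = 0;
  int frame_cnt = 0;

  memset(&svc_ctx, 0, sizeof(svc_ctx));
  svc_ctx.spatial_layers = 5;
  svc_ctx.encoding_mode = INTER_LAYER_PREDICTION_IP;

  vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &enc_cfg, 0);
  enc_cfg.g_w = 1920;
  enc_cfg.g_h = 1080;
  vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32);

  if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &enc_cfg) != VPX_CODEC_OK)
    return -1;

  while (frame_cnt < frames_to_code && get_next_frame(&raw)) {
    /* vpx_svc_encode() runs one vpx_codec_encode() per spatial layer and
     * concatenates the layers plus a superframe index internally. */
    if (vpx_svc_encode(&svc_ctx, &codec, &raw, pts, 1, VPX_DL_REALTIME) != VPX_CODEC_OK)
      break;
    fwrite(vpx_svc_get_buffer(&svc_ctx), 1, vpx_svc_get_frame_size(&svc_ctx), out);
    ++frame_cnt;
    ++pts;
  }

  printf("%s", vpx_svc_dump_statistics(&svc_ctx));
  vpx_svc_release(&svc_ctx);
  vpx_codec_destroy(&codec);
  vpx_img_free(&raw);
  return frame_cnt;
}
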
diff --git a/source/libvpx/vpx/exports_dec b/source/libvpx/vpx/exports_dec
index ed121f7..d058c9b 100644
--- a/source/libvpx/vpx/exports_dec
+++ b/source/libvpx/vpx/exports_dec
@@ -7,3 +7,4 @@ text vpx_codec_peek_stream_info
text vpx_codec_register_put_frame_cb
text vpx_codec_register_put_slice_cb
text vpx_codec_set_mem_map
+text vpx_codec_set_frame_buffers
diff --git a/source/libvpx/vpx/exports_enc b/source/libvpx/vpx/exports_enc
index 3d56749..1d9340c 100644
--- a/source/libvpx/vpx/exports_enc
+++ b/source/libvpx/vpx/exports_enc
@@ -6,3 +6,17 @@ text vpx_codec_get_cx_data
text vpx_codec_get_global_headers
text vpx_codec_get_preview_frame
text vpx_codec_set_cx_data_buf
+text vpx_svc_dump_statistics
+text vpx_svc_encode
+text vpx_svc_free
+text vpx_svc_get_buffer
+text vpx_svc_get_encode_frame_count
+text vpx_svc_get_frame_size
+text vpx_svc_get_message
+text vpx_svc_init
+text vpx_svc_is_keyframe
+text vpx_svc_release
+text vpx_svc_set_keyframe
+text vpx_svc_set_options
+text vpx_svc_set_quantizers
+text vpx_svc_set_scale_factors
diff --git a/source/libvpx/vpx/internal/vpx_codec_internal.h b/source/libvpx/vpx/internal/vpx_codec_internal.h
index 05fed97..75b4a18 100644
--- a/source/libvpx/vpx/internal/vpx_codec_internal.h
+++ b/source/libvpx/vpx/internal/vpx_codec_internal.h
@@ -56,7 +56,7 @@
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_CODEC_INTERNAL_ABI_VERSION (4) /**<\hideinitializer*/
+#define VPX_CODEC_INTERNAL_ABI_VERSION (5) /**<\hideinitializer*/
typedef struct vpx_codec_alg_priv vpx_codec_alg_priv_t;
typedef struct vpx_codec_priv_enc_mr_cfg vpx_codec_priv_enc_mr_cfg_t;
@@ -215,6 +215,36 @@ typedef vpx_codec_err_t (*vpx_codec_decode_fn_t)(vpx_codec_alg_priv_t *ctx,
typedef vpx_image_t *(*vpx_codec_get_frame_fn_t)(vpx_codec_alg_priv_t *ctx,
vpx_codec_iter_t *iter);
+/*!\brief Pass in external frame buffers for the decoder to use.
+ *
+ * Registers a given function to be called when the current frame to
+ * decode will be bigger than the external frame buffer size. This
+ * function must be called before the first call to decode or libvpx
+ * will assume the default behavior of allocating frame buffers internally.
+ * Frame buffers with a size of 0 are valid.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] fb_list Pointer to array of frame buffers
+ * \param[in] fb_count Number of elements in frame buffer array
+ * \param[in] cb Pointer to the callback function
+ * \param[in] user_priv User's private data
+ *
+ * \retval #VPX_CODEC_OK
+ * External frame buffers will be used by libvpx.
+ * \retval #VPX_CODEC_INVALID_PARAM
+ * fb_count was less than the value needed by the codec.
+ * \retval #VPX_CODEC_ERROR
+ * Decoder context not initialized, or algorithm not capable of
+ * using external frame buffers.
+ *
+ * \note
+ * When decoding VP9, the application must pass in at least 8 external
+ * frame buffers, as VP9 can have up to 8 reference frames.
+ */
+typedef vpx_codec_err_t (*vpx_codec_set_frame_buffers_fn_t)(
+ vpx_codec_alg_priv_t *ctx,
+ vpx_codec_frame_buffer_t *fb_list, int fb_count,
+ vpx_realloc_frame_buffer_cb_fn_t cb, void *user_priv);
/*\brief eXternal Memory Allocation memory map get iterator
*
@@ -305,6 +335,7 @@ struct vpx_codec_iface {
vpx_codec_get_si_fn_t get_si; /**< \copydoc ::vpx_codec_get_si_fn_t */
vpx_codec_decode_fn_t decode; /**< \copydoc ::vpx_codec_decode_fn_t */
vpx_codec_get_frame_fn_t get_frame; /**< \copydoc ::vpx_codec_get_frame_fn_t */
+ vpx_codec_set_frame_buffers_fn_t set_fb; /**< \copydoc ::vpx_codec_set_frame_buffers_fn_t */
} dec;
struct vpx_codec_enc_iface {
vpx_codec_enc_cfg_map_t *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */
diff --git a/source/libvpx/vpx/src/svc_encodeframe.c b/source/libvpx/vpx/src/svc_encodeframe.c
new file mode 100644
index 0000000..810e881
--- /dev/null
+++ b/source/libvpx/vpx/src/svc_encodeframe.c
@@ -0,0 +1,962 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/**
+ * @file
+ * VP9 SVC encoding support via libvpx
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define VPX_DISABLE_CTRL_TYPECHECKS 1
+#define VPX_CODEC_DISABLE_COMPAT 1
+#include "vpx/svc_context.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+#if defined(__MINGW32__) && !defined(MINGW_HAS_SECURE_API)
+#define strtok_r strtok_s
+// proto from /usr/x86_64-w64-mingw32/include/sec_api/string_s.h
+_CRTIMP char *__cdecl strtok_s(char *str, const char *delim, char **context);
+#endif
+
+#ifdef _MSC_VER
+#define strdup _strdup
+#define strtok_r strtok_s
+#endif
+
+#define SVC_REFERENCE_FRAMES 8
+#define SUPERFRAME_SLOTS (8)
+#define SUPERFRAME_BUFFER_SIZE (SUPERFRAME_SLOTS * sizeof(uint32_t) + 2)
+#define OPTION_BUFFER_SIZE 256
+
+static const char *DEFAULT_QUANTIZER_VALUES = "60,53,39,33,27";
+static const char *DEFAULT_SCALE_FACTORS = "4/16,5/16,7/16,11/16,16/16";
+
+typedef struct SvcInternal {
+ char options[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_options
+ char quantizers[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_quantizers
+ char scale_factors[OPTION_BUFFER_SIZE]; // set by vpx_svc_set_scale_factors
+
+ // values extracted from option, quantizers
+ int scaling_factor_num[VPX_SS_MAX_LAYERS];
+ int scaling_factor_den[VPX_SS_MAX_LAYERS];
+ int quantizer[VPX_SS_MAX_LAYERS];
+
+ // accumulated statistics
+ double psnr_in_layer[VPX_SS_MAX_LAYERS];
+ uint32_t bytes_in_layer[VPX_SS_MAX_LAYERS];
+
+ // codec encoding values
+ int width; // width of highest layer
+ int height; // height of highest layer
+ int kf_dist; // distance between keyframes
+
+ // state variables
+ int encode_frame_count;
+ int frame_within_gop;
+ vpx_enc_frame_flags_t enc_frame_flags;
+ int layers;
+ int layer;
+ int is_keyframe;
+
+ size_t frame_size;
+ size_t buffer_size;
+ void *buffer;
+
+ char message_buffer[2048];
+ vpx_codec_ctx_t *codec_ctx;
+} SvcInternal;
+
+// Superframe is used to generate an index of individual frames (i.e., layers)
+struct Superframe {
+ int count;
+ uint32_t sizes[SUPERFRAME_SLOTS];
+ uint32_t magnitude;
+ uint8_t buffer[SUPERFRAME_BUFFER_SIZE];
+ size_t index_size;
+};
+
+// One encoded frame layer
+struct LayerData {
+ void *buf; // compressed data buffer
+ size_t size; // length of compressed data
+ struct LayerData *next;
+};
+
+// create LayerData from encoder output
+static struct LayerData *ld_create(void *buf, size_t size) {
+ struct LayerData *const layer_data = malloc(sizeof(*layer_data));
+ if (layer_data == NULL) {
+ return NULL;
+ }
+ layer_data->buf = malloc(size);
+ if (layer_data->buf == NULL) {
+ free(layer_data);
+ return NULL;
+ }
+ memcpy(layer_data->buf, buf, size);
+ layer_data->size = size;
+ return layer_data;
+}
+
+// free LayerData
+static void ld_free(struct LayerData *layer_data) {
+ if (layer_data) {
+ if (layer_data->buf) {
+ free(layer_data->buf);
+ layer_data->buf = NULL;
+ }
+ free(layer_data);
+ }
+}
+
+// add layer data to list
+static void ld_list_add(struct LayerData **list, struct LayerData *layer_data) {
+ struct LayerData **p = list;
+
+ while (*p != NULL) p = &(*p)->next;
+ *p = layer_data;
+ layer_data->next = NULL;
+}
+
+// get accumulated size of layer data
+static size_t ld_list_get_buffer_size(struct LayerData *list) {
+ struct LayerData *p;
+ size_t size = 0;
+
+ for (p = list; p != NULL; p = p->next) {
+ size += p->size;
+ }
+ return size;
+}
+
+// copy layer data to buffer
+static void ld_list_copy_to_buffer(struct LayerData *list, uint8_t *buffer) {
+ struct LayerData *p;
+
+ for (p = list; p != NULL; p = p->next) {
+ buffer[0] = 1;
+ memcpy(buffer, p->buf, p->size);
+ buffer += p->size;
+ }
+}
+
+// free layer data list
+static void ld_list_free(struct LayerData *list) {
+ struct LayerData *p = list;
+
+ while (p) {
+ list = list->next;
+ ld_free(p);
+ p = list;
+ }
+}
+
+static void sf_create_index(struct Superframe *sf) {
+ uint8_t marker = 0xc0;
+ int i;
+ uint32_t mag, mask;
+ uint8_t *bufp;
+
+ if (sf->count == 0 || sf->count >= 8) return;
+
+ // Add the number of frames to the marker byte
+ marker |= sf->count - 1;
+
+ // Choose the magnitude
+ for (mag = 0, mask = 0xff; mag < 4; ++mag) {
+ if (sf->magnitude < mask) break;
+ mask <<= 8;
+ mask |= 0xff;
+ }
+ marker |= mag << 3;
+
+ // Write the index
+ sf->index_size = 2 + (mag + 1) * sf->count;
+ bufp = sf->buffer;
+
+ *bufp++ = marker;
+ for (i = 0; i < sf->count; ++i) {
+ int this_sz = sf->sizes[i];
+ uint32_t j;
+
+ for (j = 0; j <= mag; ++j) {
+ *bufp++ = this_sz & 0xff;
+ this_sz >>= 8;
+ }
+ }
+ *bufp++ = marker;
+}
+
+static SvcInternal *get_svc_internal(SvcContext *svc_ctx) {
+ if (svc_ctx == NULL) return NULL;
+ if (svc_ctx->internal == NULL) {
+ SvcInternal *const si = malloc(sizeof(*si));
+ if (si != NULL) {
+ memset(si, 0, sizeof(*si));
+ }
+ svc_ctx->internal = si;
+ }
+ return svc_ctx->internal;
+}
+
+static const SvcInternal *get_const_svc_internal(const SvcContext *svc_ctx) {
+ if (svc_ctx == NULL) return NULL;
+ return svc_ctx->internal;
+}
+
+static void svc_log_reset(SvcContext *svc_ctx) {
+ SvcInternal *const si = (SvcInternal *)svc_ctx->internal;
+ si->message_buffer[0] = '\0';
+}
+
+static int svc_log(SvcContext *svc_ctx, int level, const char *fmt, ...) {
+ char buf[512];
+ int retval = 0;
+ va_list ap;
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+
+ if (level > svc_ctx->log_level) {
+ return retval;
+ }
+
+ va_start(ap, fmt);
+ retval = vsnprintf(buf, sizeof(buf), fmt, ap);
+ va_end(ap);
+
+ if (svc_ctx->log_print) {
+ printf("%s", buf);
+ } else {
+ strncat(si->message_buffer, buf,
+ sizeof(si->message_buffer) - strlen(si->message_buffer) - 1);
+ }
+
+ if (level == SVC_LOG_ERROR) {
+ si->codec_ctx->err_detail = si->message_buffer;
+ }
+ return retval;
+}
+
+static vpx_codec_err_t set_option_encoding_mode(SvcContext *svc_ctx,
+ const char *value_str) {
+ if (strcmp(value_str, "i") == 0) {
+ svc_ctx->encoding_mode = INTER_LAYER_PREDICTION_I;
+ } else if (strcmp(value_str, "alt-ip") == 0) {
+ svc_ctx->encoding_mode = ALT_INTER_LAYER_PREDICTION_IP;
+ } else if (strcmp(value_str, "ip") == 0) {
+ svc_ctx->encoding_mode = INTER_LAYER_PREDICTION_IP;
+ } else if (strcmp(value_str, "gf") == 0) {
+ svc_ctx->encoding_mode = USE_GOLDEN_FRAME;
+ } else {
+ svc_log(svc_ctx, SVC_LOG_ERROR, "invalid encoding mode: %s", value_str);
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t parse_quantizer_values(SvcContext *svc_ctx,
+ const char *quantizer_values) {
+ char *input_string;
+ char *token;
+ const char *delim = ",";
+ char *save_ptr;
+ int found = 0;
+ int i, q;
+ int res = VPX_CODEC_OK;
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+
+ if (quantizer_values == NULL || strlen(quantizer_values) == 0) {
+ input_string = strdup(DEFAULT_QUANTIZER_VALUES);
+ } else {
+ input_string = strdup(quantizer_values);
+ }
+
+ token = strtok_r(input_string, delim, &save_ptr);
+ for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+ if (token != NULL) {
+ q = atoi(token);
+ if (q <= 0 || q > 100) {
+ svc_log(svc_ctx, SVC_LOG_ERROR,
+ "svc-quantizer-values: invalid value %s\n", token);
+ res = VPX_CODEC_INVALID_PARAM;
+ break;
+ }
+ token = strtok_r(NULL, delim, &save_ptr);
+ found = i + 1;
+ } else {
+ q = 0;
+ }
+ si->quantizer[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] = q;
+ }
+ if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
+ svc_log(svc_ctx, SVC_LOG_ERROR,
+ "svc: quantizers: %d values required, but only %d specified\n",
+ svc_ctx->spatial_layers, found);
+ res = VPX_CODEC_INVALID_PARAM;
+ }
+ free(input_string);
+ return res;
+}
+
+static void log_invalid_scale_factor(SvcContext *svc_ctx, const char *value) {
+ svc_log(svc_ctx, SVC_LOG_ERROR, "svc scale-factors: invalid value %s\n",
+ value);
+}
+
+static vpx_codec_err_t parse_scale_factors(SvcContext *svc_ctx,
+ const char *scale_factors) {
+ char *input_string;
+ char *token;
+ const char *delim = ",";
+ char *save_ptr;
+ int found = 0;
+ int i;
+ int64_t num, den;
+ int res = VPX_CODEC_OK;
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+
+ if (scale_factors == NULL || strlen(scale_factors) == 0) {
+ input_string = strdup(DEFAULT_SCALE_FACTORS);
+ } else {
+ input_string = strdup(scale_factors);
+ }
+ token = strtok_r(input_string, delim, &save_ptr);
+ for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+ num = den = 0;
+ if (token != NULL) {
+ num = strtol(token, &token, 10);
+ if (num <= 0) {
+ log_invalid_scale_factor(svc_ctx, token);
+ res = VPX_CODEC_INVALID_PARAM;
+ break;
+ }
+ if (*token++ != '/') {
+ log_invalid_scale_factor(svc_ctx, token);
+ res = VPX_CODEC_INVALID_PARAM;
+ break;
+ }
+ den = strtol(token, &token, 10);
+ if (den <= 0) {
+ log_invalid_scale_factor(svc_ctx, token);
+ res = VPX_CODEC_INVALID_PARAM;
+ break;
+ }
+ token = strtok_r(NULL, delim, &save_ptr);
+ found = i + 1;
+ }
+ si->scaling_factor_num[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] =
+ (int)num;
+ si->scaling_factor_den[i + VPX_SS_MAX_LAYERS - svc_ctx->spatial_layers] =
+ (int)den;
+ }
+ if (res == VPX_CODEC_OK && found != svc_ctx->spatial_layers) {
+ svc_log(svc_ctx, SVC_LOG_ERROR,
+ "svc: scale-factors: %d values required, but only %d specified\n",
+ svc_ctx->spatial_layers, found);
+ res = VPX_CODEC_INVALID_PARAM;
+ }
+ free(input_string);
+ return res;
+}
+
+/**
+ * Parse SVC encoding options
+ * Format: encoding-mode=<svc_mode> layers=<layer_count>
+ * scale-factors=<n1>/<d1>,<n2>/<d2>,...
+ * quantizers=<q1>,<q2>,...
+ * svc_mode = [i|ip|alt-ip|gf]
+ */
+static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) {
+ char *input_string;
+ char *option_name;
+ char *option_value;
+ char *input_ptr;
+ int res = VPX_CODEC_OK;
+
+ if (options == NULL) return VPX_CODEC_OK;
+ input_string = strdup(options);
+
+ // parse option name
+ option_name = strtok_r(input_string, "=", &input_ptr);
+ while (option_name != NULL) {
+ // parse option value
+ option_value = strtok_r(NULL, " ", &input_ptr);
+ if (option_value == NULL) {
+ svc_log(svc_ctx, SVC_LOG_ERROR, "option missing value: %s\n",
+ option_name);
+ res = VPX_CODEC_INVALID_PARAM;
+ break;
+ }
+ if (strcmp("encoding-mode", option_name) == 0) {
+ res = set_option_encoding_mode(svc_ctx, option_value);
+ if (res != VPX_CODEC_OK) break;
+ } else if (strcmp("layers", option_name) == 0) {
+ svc_ctx->spatial_layers = atoi(option_value);
+ } else if (strcmp("scale-factors", option_name) == 0) {
+ res = parse_scale_factors(svc_ctx, option_value);
+ if (res != VPX_CODEC_OK) break;
+ } else if (strcmp("quantizers", option_name) == 0) {
+ res = parse_quantizer_values(svc_ctx, option_value);
+ if (res != VPX_CODEC_OK) break;
+ } else {
+ svc_log(svc_ctx, SVC_LOG_ERROR, "invalid option: %s\n", option_name);
+ res = VPX_CODEC_INVALID_PARAM;
+ break;
+ }
+ option_name = strtok_r(NULL, "=", &input_ptr);
+ }
+ free(input_string);
+ return res;
+}
+
+vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || options == NULL || si == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ strncpy(si->options, options, sizeof(si->options));
+ si->options[sizeof(si->options) - 1] = '\0';
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
+ const char *quantizers) {
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || quantizers == NULL || si == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ strncpy(si->quantizers, quantizers, sizeof(si->quantizers));
+ si->quantizers[sizeof(si->quantizers) - 1] = '\0';
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vpx_svc_set_scale_factors(SvcContext *svc_ctx,
+ const char *scale_factors) {
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || scale_factors == NULL || si == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ strncpy(si->scale_factors, scale_factors, sizeof(si->scale_factors));
+ si->scale_factors[sizeof(si->scale_factors) - 1] = '\0';
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
+ vpx_codec_iface_t *iface,
+ vpx_codec_enc_cfg_t *enc_cfg) {
+ int max_intra_size_pct;
+ vpx_codec_err_t res;
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
+ enc_cfg == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ if (si == NULL) return VPX_CODEC_MEM_ERROR;
+
+ si->codec_ctx = codec_ctx;
+
+ si->width = enc_cfg->g_w;
+ si->height = enc_cfg->g_h;
+
+ if (enc_cfg->kf_max_dist < 2) {
+ svc_log(svc_ctx, SVC_LOG_ERROR, "key frame distance too small: %d\n",
+ enc_cfg->kf_max_dist);
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ si->kf_dist = enc_cfg->kf_max_dist;
+
+ if (svc_ctx->spatial_layers == 0)
+ svc_ctx->spatial_layers = VPX_SS_DEFAULT_LAYERS;
+ if (svc_ctx->spatial_layers < 1 ||
+ svc_ctx->spatial_layers > VPX_SS_MAX_LAYERS) {
+ svc_log(svc_ctx, SVC_LOG_ERROR, "spatial layers: invalid value: %d\n",
+ svc_ctx->spatial_layers);
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ // use SvcInternal value for number of layers to enable forcing single layer
+ // for first frame
+ si->layers = svc_ctx->spatial_layers;
+
+ res = parse_quantizer_values(svc_ctx, si->quantizers);
+ if (res != VPX_CODEC_OK) return res;
+
+ res = parse_scale_factors(svc_ctx, si->scale_factors);
+ if (res != VPX_CODEC_OK) return res;
+
+ // parse aggregate command line options
+ res = parse_options(svc_ctx, si->options);
+ if (res != VPX_CODEC_OK) return res;
+
+ // modify encoder configuration
+ enc_cfg->ss_number_layers = si->layers;
+ enc_cfg->kf_mode = VPX_KF_DISABLED;
+ enc_cfg->g_pass = VPX_RC_ONE_PASS;
+ // Lag in frames not currently supported
+ enc_cfg->g_lag_in_frames = 0;
+
+ // TODO(ivanmaltz): determine if these values need to be set explicitly for
+ // svc, or if the normal default/override mechanism can be used
+ enc_cfg->rc_dropframe_thresh = 0;
+ enc_cfg->rc_end_usage = VPX_CBR;
+ enc_cfg->rc_resize_allowed = 0;
+ enc_cfg->rc_min_quantizer = 33;
+ enc_cfg->rc_max_quantizer = 33;
+ enc_cfg->rc_undershoot_pct = 100;
+ enc_cfg->rc_overshoot_pct = 15;
+ enc_cfg->rc_buf_initial_sz = 500;
+ enc_cfg->rc_buf_optimal_sz = 600;
+ enc_cfg->rc_buf_sz = 1000;
+ enc_cfg->g_error_resilient = 1;
+
+ // Initialize codec
+ res = vpx_codec_enc_init(codec_ctx, iface, enc_cfg, VPX_CODEC_USE_PSNR);
+ if (res != VPX_CODEC_OK) {
+ svc_log(svc_ctx, SVC_LOG_ERROR, "svc_enc_init error\n");
+ return res;
+ }
+
+ vpx_codec_control(codec_ctx, VP9E_SET_SVC, 1);
+ vpx_codec_control(codec_ctx, VP8E_SET_CPUUSED, 1);
+ vpx_codec_control(codec_ctx, VP8E_SET_STATIC_THRESHOLD, 1);
+ vpx_codec_control(codec_ctx, VP8E_SET_NOISE_SENSITIVITY, 1);
+ vpx_codec_control(codec_ctx, VP8E_SET_TOKEN_PARTITIONS, 1);
+
+ max_intra_size_pct =
+ (int)(((double)enc_cfg->rc_buf_optimal_sz * 0.5) *
+ ((double)enc_cfg->g_timebase.den / enc_cfg->g_timebase.num) / 10.0);
+ vpx_codec_control(codec_ctx, VP8E_SET_MAX_INTRA_BITRATE_PCT,
+ max_intra_size_pct);
+ return VPX_CODEC_OK;
+}
+
+// SVC Algorithm flags - these get mapped to VP8_EFLAG_* defined in vp8cx.h
+
+// encoder should reference the last frame
+#define USE_LAST (1 << 0)
+
+// encoder should reference the alt ref frame
+#define USE_ARF (1 << 1)
+
+// encoder should reference the golden frame
+#define USE_GF (1 << 2)
+
+// encoder should copy current frame to the last frame buffer
+#define UPDATE_LAST (1 << 3)
+
+// encoder should copy current frame to the alt ref frame buffer
+#define UPDATE_ARF (1 << 4)
+
+// encoder should copy current frame to the golden frame
+#define UPDATE_GF (1 << 5)
+
+static int map_vp8_flags(int svc_flags) {
+ int flags = 0;
+
+ if (!(svc_flags & USE_LAST)) flags |= VP8_EFLAG_NO_REF_LAST;
+ if (!(svc_flags & USE_ARF)) flags |= VP8_EFLAG_NO_REF_ARF;
+ if (!(svc_flags & USE_GF)) flags |= VP8_EFLAG_NO_REF_GF;
+
+ if (svc_flags & UPDATE_LAST) {
+ // last is updated automatically
+ } else {
+ flags |= VP8_EFLAG_NO_UPD_LAST;
+ }
+ if (svc_flags & UPDATE_ARF) {
+ flags |= VP8_EFLAG_FORCE_ARF;
+ } else {
+ flags |= VP8_EFLAG_NO_UPD_ARF;
+ }
+ if (svc_flags & UPDATE_GF) {
+ flags |= VP8_EFLAG_FORCE_GF;
+ } else {
+ flags |= VP8_EFLAG_NO_UPD_GF;
+ }
+ return flags;
+}
+
+static void calculate_enc_frame_flags(SvcContext *svc_ctx) {
+ vpx_enc_frame_flags_t flags = VPX_EFLAG_FORCE_KF;
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+ const int is_keyframe = (si->frame_within_gop == 0);
+
+ // keyframe layer zero is identical for all modes
+ if (is_keyframe && si->layer == 0) {
+ si->enc_frame_flags = VPX_EFLAG_FORCE_KF;
+ return;
+ }
+
+ switch (svc_ctx->encoding_mode) {
+ case ALT_INTER_LAYER_PREDICTION_IP:
+ if (si->layer == 0) {
+ flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
+ } else if (is_keyframe) {
+ if (si->layer == si->layers - 1) {
+ flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
+ } else {
+ flags = map_vp8_flags(USE_ARF | UPDATE_LAST | UPDATE_GF);
+ }
+ } else {
+ flags = map_vp8_flags(USE_LAST | USE_ARF | UPDATE_LAST);
+ }
+ break;
+ case INTER_LAYER_PREDICTION_I:
+ if (si->layer == 0) {
+ flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
+ } else if (is_keyframe) {
+ flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
+ } else {
+ flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
+ }
+ break;
+ case INTER_LAYER_PREDICTION_IP:
+ if (si->layer == 0) {
+ flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
+ } else if (is_keyframe) {
+ flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
+ } else {
+ flags = map_vp8_flags(USE_LAST | USE_ARF | UPDATE_LAST);
+ }
+ break;
+ case USE_GOLDEN_FRAME:
+ if (2 * si->layers - SVC_REFERENCE_FRAMES <= si->layer) {
+ if (si->layer == 0) {
+ flags = map_vp8_flags(USE_LAST | USE_GF | UPDATE_LAST);
+ } else if (is_keyframe) {
+ flags = map_vp8_flags(USE_ARF | UPDATE_LAST | UPDATE_GF);
+ } else {
+ flags = map_vp8_flags(USE_LAST | USE_ARF | USE_GF | UPDATE_LAST);
+ }
+ } else {
+ if (si->layer == 0) {
+ flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
+ } else if (is_keyframe) {
+ flags = map_vp8_flags(USE_ARF | UPDATE_LAST);
+ } else {
+ flags = map_vp8_flags(USE_LAST | UPDATE_LAST);
+ }
+ }
+ break;
+ default:
+ svc_log(svc_ctx, SVC_LOG_ERROR, "unexpected encoding mode: %d\n",
+ svc_ctx->encoding_mode);
+ break;
+ }
+ si->enc_frame_flags = flags;
+}
+
+vpx_codec_err_t vpx_svc_get_layer_resolution(const SvcContext *svc_ctx,
+ int layer,
+ unsigned int *width,
+ unsigned int *height) {
+ int w, h, index, num, den;
+ const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+
+ if (svc_ctx == NULL || si == NULL || width == NULL || height == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ if (layer < 0 || layer >= si->layers) return VPX_CODEC_INVALID_PARAM;
+
+ index = layer + VPX_SS_MAX_LAYERS - si->layers;
+ num = si->scaling_factor_num[index];
+ den = si->scaling_factor_den[index];
+ if (num == 0 || den == 0) return VPX_CODEC_INVALID_PARAM;
+
+ w = si->width * num / den;
+ h = si->height * num / den;
+
+ // make height and width even to make chrome player happy
+ w += w % 2;
+ h += h % 2;
+
+ *width = w;
+ *height = h;
+
+ return VPX_CODEC_OK;
+}
+
+static void set_svc_parameters(SvcContext *svc_ctx,
+ vpx_codec_ctx_t *codec_ctx) {
+ int layer, layer_index;
+ vpx_svc_parameters_t svc_params;
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+
+ memset(&svc_params, 0, sizeof(svc_params));
+ svc_params.layer = si->layer;
+ svc_params.flags = si->enc_frame_flags;
+
+ layer = si->layer;
+ if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
+ si->frame_within_gop == 0) {
+ // layers 1 & 3 don't exist in this mode, use the higher one
+ if (layer == 0 || layer == 2) {
+ layer += 1;
+ }
+ }
+ if (VPX_CODEC_OK != vpx_svc_get_layer_resolution(svc_ctx, layer,
+ &svc_params.width,
+ &svc_params.height)) {
+ svc_log(svc_ctx, SVC_LOG_ERROR, "vpx_svc_get_layer_resolution failed\n");
+ }
+ layer_index = layer + VPX_SS_MAX_LAYERS - si->layers;
+ svc_params.min_quantizer = si->quantizer[layer_index];
+ svc_params.max_quantizer = si->quantizer[layer_index];
+ svc_params.distance_from_i_frame = si->frame_within_gop;
+
+ // Use buffer i for layer i LST
+ svc_params.lst_fb_idx = si->layer;
+
+ // Use buffer i-1 for layer i Alt (Inter-layer prediction)
+ if (si->layer != 0) {
+ const int use_higher_layer =
+ svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
+ si->frame_within_gop == 0;
+ svc_params.alt_fb_idx = use_higher_layer ? si->layer - 2 : si->layer - 1;
+ }
+
+ if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP) {
+ svc_params.gld_fb_idx = si->layer + 1;
+ } else {
+ if (si->layer < 2 * si->layers - SVC_REFERENCE_FRAMES)
+ svc_params.gld_fb_idx = svc_params.lst_fb_idx;
+ else
+ svc_params.gld_fb_idx = 2 * si->layers - 1 - si->layer;
+ }
+
+ svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, layer: %d, %dx%d, q: %d\n",
+ si->encode_frame_count, si->layer, svc_params.width,
+ svc_params.height, svc_params.min_quantizer);
+
+ if (svc_params.flags == VPX_EFLAG_FORCE_KF) {
+ svc_log(svc_ctx, SVC_LOG_DEBUG, "flags == VPX_EFLAG_FORCE_KF\n");
+ } else {
+ svc_log(
+ svc_ctx, SVC_LOG_DEBUG, "Using: LST/GLD/ALT [%2d|%2d|%2d]\n",
+ svc_params.flags & VP8_EFLAG_NO_REF_LAST ? -1 : svc_params.lst_fb_idx,
+ svc_params.flags & VP8_EFLAG_NO_REF_GF ? -1 : svc_params.gld_fb_idx,
+ svc_params.flags & VP8_EFLAG_NO_REF_ARF ? -1 : svc_params.alt_fb_idx);
+ svc_log(
+ svc_ctx, SVC_LOG_DEBUG, "Updating: LST/GLD/ALT [%2d|%2d|%2d]\n",
+ svc_params.flags & VP8_EFLAG_NO_UPD_LAST ? -1 : svc_params.lst_fb_idx,
+ svc_params.flags & VP8_EFLAG_NO_UPD_GF ? -1 : svc_params.gld_fb_idx,
+ svc_params.flags & VP8_EFLAG_NO_UPD_ARF ? -1 : svc_params.alt_fb_idx);
+ }
+
+ vpx_codec_control(codec_ctx, VP9E_SET_SVC_PARAMETERS, &svc_params);
+}
+
+/**
+ * Encode a frame into multiple layers
+ * Create a superframe containing the individual layers
+ */
+vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
+ struct vpx_image *rawimg, vpx_codec_pts_t pts,
+ int64_t duration, int deadline) {
+ vpx_codec_err_t res;
+ vpx_codec_iter_t iter;
+ const vpx_codec_cx_pkt_t *cx_pkt;
+ struct LayerData *cx_layer_list = NULL;
+ struct LayerData *layer_data;
+ struct Superframe superframe;
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || codec_ctx == NULL || rawimg == NULL || si == NULL) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+
+ memset(&superframe, 0, sizeof(superframe));
+ svc_log_reset(svc_ctx);
+
+ si->layers = svc_ctx->spatial_layers;
+ if (si->frame_within_gop >= si->kf_dist ||
+ si->encode_frame_count == 0) {
+ si->frame_within_gop = 0;
+ }
+ si->is_keyframe = (si->frame_within_gop == 0);
+ si->frame_size = 0;
+
+ svc_log(svc_ctx, SVC_LOG_DEBUG,
+ "vpx_svc_encode layers: %d, frame_count: %d, frame_within_gop: %d\n",
+ si->layers, si->encode_frame_count, si->frame_within_gop);
+
+ // encode each layer
+ for (si->layer = 0; si->layer < si->layers; ++si->layer) {
+ if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
+ si->is_keyframe && (si->layer == 1 || si->layer == 3)) {
+ svc_log(svc_ctx, SVC_LOG_DEBUG, "Skip encoding layer %d\n", si->layer);
+ continue;
+ }
+ calculate_enc_frame_flags(svc_ctx);
+
+ set_svc_parameters(svc_ctx, codec_ctx);
+
+ res = vpx_codec_encode(codec_ctx, rawimg, pts, (uint32_t)duration,
+ si->enc_frame_flags, deadline);
+ if (res != VPX_CODEC_OK) {
+ return res;
+ }
+ // save compressed data
+ iter = NULL;
+ while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
+ switch (cx_pkt->kind) {
+ case VPX_CODEC_CX_FRAME_PKT: {
+ const uint32_t frame_pkt_size = (uint32_t)(cx_pkt->data.frame.sz);
+ si->bytes_in_layer[si->layer] += frame_pkt_size;
+ svc_log(svc_ctx, SVC_LOG_DEBUG,
+ "SVC frame: %d, layer: %d, size: %u\n",
+ si->encode_frame_count, si->layer, frame_pkt_size);
+ layer_data =
+ ld_create(cx_pkt->data.frame.buf, (size_t)frame_pkt_size);
+ if (layer_data == NULL) {
+ svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating LayerData\n");
+            return VPX_CODEC_MEM_ERROR;
+ }
+ ld_list_add(&cx_layer_list, layer_data);
+
+ // save layer size in superframe index
+ superframe.sizes[superframe.count++] = frame_pkt_size;
+ superframe.magnitude |= frame_pkt_size;
+ break;
+ }
+ case VPX_CODEC_PSNR_PKT: {
+ svc_log(svc_ctx, SVC_LOG_DEBUG,
+ "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
+ "%2.3f %2.3f %2.3f %2.3f \n",
+ si->encode_frame_count, si->layer,
+ cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1],
+ cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]);
+ si->psnr_in_layer[si->layer] += cx_pkt->data.psnr.psnr[0];
+ break;
+ }
+ default: {
+ break;
+ }
+ }
+ }
+ }
+ // add superframe index to layer data list
+ sf_create_index(&superframe);
+ layer_data = ld_create(superframe.buffer, superframe.index_size);
+ ld_list_add(&cx_layer_list, layer_data);
+
+ // get accumulated size of layer data
+ si->frame_size = ld_list_get_buffer_size(cx_layer_list);
+ if (si->frame_size == 0) return VPX_CODEC_ERROR;
+
+ // all layers encoded, create single buffer with concatenated layers
+ if (si->frame_size > si->buffer_size) {
+ free(si->buffer);
+ si->buffer = malloc(si->frame_size);
+ if (si->buffer == NULL) {
+ ld_list_free(cx_layer_list);
+ return VPX_CODEC_MEM_ERROR;
+ }
+ si->buffer_size = si->frame_size;
+ }
+ // copy layer data into packet
+ ld_list_copy_to_buffer(cx_layer_list, si->buffer);
+
+ ld_list_free(cx_layer_list);
+
+ svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, kf: %d, size: %d, pts: %d\n",
+ si->encode_frame_count, si->is_keyframe, (int)si->frame_size,
+ (int)pts);
+ ++si->frame_within_gop;
+ ++si->encode_frame_count;
+
+ return VPX_CODEC_OK;
+}
+
+const char *vpx_svc_get_message(const SvcContext *svc_ctx) {
+ const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || si == NULL) return NULL;
+ return si->message_buffer;
+}
+
+void *vpx_svc_get_buffer(const SvcContext *svc_ctx) {
+ const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || si == NULL) return NULL;
+ return si->buffer;
+}
+
+size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx) {
+ const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || si == NULL) return 0;
+ return si->frame_size;
+}
+
+int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx) {
+ const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || si == NULL) return 0;
+ return si->encode_frame_count;
+}
+
+int vpx_svc_is_keyframe(const SvcContext *svc_ctx) {
+ const SvcInternal *const si = get_const_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || si == NULL) return 0;
+ return si->is_keyframe;
+}
+
+void vpx_svc_set_keyframe(SvcContext *svc_ctx) {
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || si == NULL) return;
+ si->frame_within_gop = 0;
+}
+
+// dump accumulated statistics and reset accumulated values
+const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
+ int number_of_frames, number_of_keyframes, encode_frame_count;
+ int i;
+ uint32_t bytes_total = 0;
+ SvcInternal *const si = get_svc_internal(svc_ctx);
+ if (svc_ctx == NULL || si == NULL) return NULL;
+
+ svc_log_reset(svc_ctx);
+
+ encode_frame_count = si->encode_frame_count;
+ if (si->encode_frame_count <= 0) return vpx_svc_get_message(svc_ctx);
+
+ svc_log(svc_ctx, SVC_LOG_INFO, "\n");
+ number_of_keyframes = encode_frame_count / si->kf_dist + 1;
+ for (i = 0; i < si->layers; ++i) {
+ number_of_frames = encode_frame_count;
+
+ if (svc_ctx->encoding_mode == ALT_INTER_LAYER_PREDICTION_IP &&
+ (i == 1 || i == 3)) {
+ number_of_frames -= number_of_keyframes;
+ }
+ svc_log(svc_ctx, SVC_LOG_INFO, "Layer %d PSNR=[%2.3f], Bytes=[%u]\n", i,
+ (double)si->psnr_in_layer[i] / number_of_frames,
+ si->bytes_in_layer[i]);
+ bytes_total += si->bytes_in_layer[i];
+ si->psnr_in_layer[i] = 0;
+ si->bytes_in_layer[i] = 0;
+ }
+
+ // only display statistics once
+ si->encode_frame_count = 0;
+
+ svc_log(svc_ctx, SVC_LOG_INFO, "Total Bytes=[%u]\n", bytes_total);
+ return vpx_svc_get_message(svc_ctx);
+}
+
+void vpx_svc_release(SvcContext *svc_ctx) {
+ SvcInternal *si;
+ if (svc_ctx == NULL) return;
+ // do not use get_svc_internal as it will unnecessarily allocate an
+ // SvcInternal if it was not already allocated
+ si = (SvcInternal *)svc_ctx->internal;
+ if (si != NULL) {
+ free(si->buffer);
+ free(si);
+ svc_ctx->internal = NULL;
+ }
+}
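
The superframe index written by sf_create_index() above is small: one marker byte, (mag + 1) little-endian bytes per layer size, and the marker byte repeated at the end. The standalone program below walks through the same arithmetic; the three layer sizes are made-up illustration values, not from the patch.

/* Standalone walk-through of the superframe index math (not part of the patch). */
#include <stdio.h>
#include <stdint.h>

int main(void) {
  const uint32_t sizes[3] = {1500, 2200, 9000};  /* illustrative per-layer byte counts */
  const uint32_t magnitude = sizes[0] | sizes[1] | sizes[2];
  uint32_t mag, mask;
  uint8_t marker = 0xc0;

  marker |= 3 - 1;                              /* frame count - 1 in the low bits */
  for (mag = 0, mask = 0xff; mag < 4; ++mag) {  /* bytes-per-size minus 1 */
    if (magnitude < mask) break;
    mask <<= 8;
    mask |= 0xff;
  }
  marker |= mag << 3;

  /* 3 sizes, (mag + 1) = 2 bytes each, plus a marker byte at each end */
  printf("marker=0x%02x, index bytes=%u\n", marker, 2 + (mag + 1) * 3);
  /* prints: marker=0xca, index bytes=8 */
  return 0;
}
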
diff --git a/source/libvpx/vpx/src/vpx_decoder.c b/source/libvpx/vpx/src/vpx_decoder.c
index 1f575e0..39fd217 100644
--- a/source/libvpx/vpx/src/vpx_decoder.c
+++ b/source/libvpx/vpx/src/vpx_decoder.c
@@ -172,7 +172,7 @@ vpx_codec_err_t vpx_codec_register_put_slice_cb(vpx_codec_ctx_t *ctx
if (!ctx || !cb)
res = VPX_CODEC_INVALID_PARAM;
else if (!ctx->iface || !ctx->priv
- || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_FRAME))
+ || !(ctx->iface->caps & VPX_CODEC_CAP_PUT_SLICE))
res = VPX_CODEC_ERROR;
else {
ctx->priv->dec.put_slice_cb.u.put_slice = cb;
@@ -226,3 +226,22 @@ vpx_codec_err_t vpx_codec_set_mem_map(vpx_codec_ctx_t *ctx,
return SAVE_STATUS(ctx, res);
}
+
+vpx_codec_err_t vpx_codec_set_frame_buffers(
+ vpx_codec_ctx_t *ctx,
+ vpx_codec_frame_buffer_t *fb_list, int fb_count,
+ vpx_realloc_frame_buffer_cb_fn_t cb, void *user_priv) {
+ vpx_codec_err_t res;
+
+ if (!ctx || !fb_list || fb_count <= 0 || !cb) {
+ res = VPX_CODEC_INVALID_PARAM;
+ } else if (!ctx->iface || !ctx->priv ||
+ !(ctx->iface->caps & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER)) {
+ res = VPX_CODEC_ERROR;
+ } else {
+ res = ctx->iface->dec.set_fb(ctx->priv->alg_priv, fb_list, fb_count,
+ cb, user_priv);
+ }
+
+ return SAVE_STATUS(ctx, res);
+}
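
A rough caller-side sketch for the new decoder entry point follows; it is not part of the patch. The capability check and the call signature are taken from the hunk above and from the vpx_codec_internal.h documentation ("at least 8 buffers for VP9", size-0 buffers valid). The prototype behind vpx_realloc_frame_buffer_cb_fn_t is not shown in this diff, so my_realloc_cb is only declared as an assumption, and the sketch assumes the frame buffer types are visible via vpx/vpx_decoder.h at this revision.

/* Hypothetical usage sketch for vpx_codec_set_frame_buffers() (not part of the patch). */
#include "vpx/vpx_decoder.h"

#define NUM_EXTERNAL_BUFFERS 8  /* VP9 can reference up to 8 frames */

static vpx_codec_frame_buffer_t fb_list[NUM_EXTERNAL_BUFFERS];  /* size-0 buffers are valid */
extern const vpx_realloc_frame_buffer_cb_fn_t my_realloc_cb;    /* assumed; defined elsewhere */

static int use_external_buffers(vpx_codec_ctx_t *decoder) {
  /* Must run before the first vpx_codec_decode() call, and only for codecs
   * advertising the external frame buffer capability. */
  if (!(vpx_codec_get_caps(decoder->iface) & VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER))
    return -1;
  return vpx_codec_set_frame_buffers(decoder, fb_list, NUM_EXTERNAL_BUFFERS,
                                     my_realloc_cb, NULL) == VPX_CODEC_OK ? 0 : -1;
}
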
diff --git a/source/libvpx/vpx/svc_context.h b/source/libvpx/vpx/svc_context.h
new file mode 100644
index 0000000..8204f9c
--- /dev/null
+++ b/source/libvpx/vpx/svc_context.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/**
+ * SvcContext - input parameters and state to encode a multi-layered
+ * spatial SVC frame
+ */
+
+#ifndef VPX_SVC_CONTEXT_H_
+#define VPX_SVC_CONTEXT_H_
+
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum SVC_ENCODING_MODE {
+ INTER_LAYER_PREDICTION_I,
+ ALT_INTER_LAYER_PREDICTION_IP,
+ INTER_LAYER_PREDICTION_IP,
+ USE_GOLDEN_FRAME
+} SVC_ENCODING_MODE;
+
+typedef enum SVC_LOG_LEVEL {
+ SVC_LOG_ERROR,
+ SVC_LOG_INFO,
+ SVC_LOG_DEBUG
+} SVC_LOG_LEVEL;
+
+typedef struct {
+ // public interface to svc_command options
+ int spatial_layers; // number of layers
+ SVC_ENCODING_MODE encoding_mode; // svc encoding strategy
+ SVC_LOG_LEVEL log_level; // amount of information to display
+ int log_print; // when set, printf log messages instead of returning the
+ // message with svc_get_message
+
+ // private storage for vpx_svc_encode
+ void *internal;
+} SvcContext;
+
+/**
+ * Set SVC options
+ * options are supplied as a single string separated by spaces
+ * Format: encoding-mode=<i|ip|alt-ip|gf>
+ * layers=<layer_count>
+ * scaling-factors=<n1>/<d1>,<n2>/<d2>,...
+ * quantizers=<q1>,<q2>,...
+ */
+vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options);
+
+/**
+ * Set SVC quantizer values
+ * values comma separated, ordered from lowest resolution to highest
+ * e.g., "60,53,39,33,27"
+ */
+vpx_codec_err_t vpx_svc_set_quantizers(SvcContext *svc_ctx,
+ const char *quantizer_values);
+
+/**
+ * Set SVC scale factors
+ * values comma separated, ordered from lowest resolution to highest
+ * e.g., "4/16,5/16,7/16,11/16,16/16"
+ */
+vpx_codec_err_t vpx_svc_set_scale_factors(SvcContext *svc_ctx,
+ const char *scale_factors);
+
+/**
+ * initialize SVC encoding
+ */
+vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
+ vpx_codec_iface_t *iface,
+ vpx_codec_enc_cfg_t *cfg);
+/**
+ * encode a frame of video with multiple layers
+ */
+vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
+ struct vpx_image *rawimg, vpx_codec_pts_t pts,
+ int64_t duration, int deadline);
+
+/**
+ * finished with svc encoding, release allocated resources
+ */
+void vpx_svc_release(SvcContext *svc_ctx);
+
+/**
+ * dump accumulated statistics and reset accumulated values
+ */
+const char *vpx_svc_dump_statistics(SvcContext *svc_ctx);
+
+/**
+ * get status message from previous encode
+ */
+const char *vpx_svc_get_message(const SvcContext *svc_ctx);
+
+/**
+ * return size of encoded data to be returned by vpx_svc_get_buffer
+ */
+size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx);
+
+/**
+ * return buffer with encoded data
+ */
+void *vpx_svc_get_buffer(const SvcContext *svc_ctx);
+
+/**
+ * return spatial resolution of the specified layer
+ */
+vpx_codec_err_t vpx_svc_get_layer_resolution(const SvcContext *svc_ctx,
+ int layer,
+ unsigned int *width,
+ unsigned int *height);
+/**
+ * return number of frames that have been encoded
+ */
+int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx);
+
+/**
+ * return 1 if last encoded frame was a keyframe
+ */
+int vpx_svc_is_keyframe(const SvcContext *svc_ctx);
+
+/**
+ * force the next frame to be a keyframe
+ */
+void vpx_svc_set_keyframe(SvcContext *svc_ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* VPX_SVC_CONTEXT_H_ */
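
A minimal usage sketch of the entry points declared above, assuming a VP9 build; the helper name encode_one_svc_frame, the option strings, and the quantizer/scale values are illustrative rather than part of this change:

#include <stdio.h>
#include <string.h>
#include "vpx/svc_context.h"
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

/* Hypothetical helper: encode one already-filled vpx_image_t as a
 * three-layer spatial SVC super-frame and write it to stdout. */
static int encode_one_svc_frame(vpx_image_t *raw, vpx_codec_pts_t pts) {
  SvcContext svc_ctx;
  vpx_codec_ctx_t codec;
  vpx_codec_enc_cfg_t cfg;

  memset(&svc_ctx, 0, sizeof(svc_ctx));
  svc_ctx.log_level = SVC_LOG_INFO;

  if (vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0) != VPX_CODEC_OK)
    return -1;
  cfg.g_w = raw->d_w;
  cfg.g_h = raw->d_h;

  /* Options use the space-separated key=value format documented above;
   * quantizers and scale factors are listed from lowest to highest layer. */
  if (vpx_svc_set_options(&svc_ctx, "layers=3 encoding-mode=ip") != VPX_CODEC_OK ||
      vpx_svc_set_scale_factors(&svc_ctx, "4/16,8/16,16/16") != VPX_CODEC_OK ||
      vpx_svc_set_quantizers(&svc_ctx, "60,45,33") != VPX_CODEC_OK)
    return -1;

  if (vpx_svc_init(&svc_ctx, &codec, vpx_codec_vp9_cx(), &cfg) != VPX_CODEC_OK)
    return -1;

  /* One call produces the super-frame containing all spatial layers. */
  if (vpx_svc_encode(&svc_ctx, &codec, raw, pts, 1, VPX_DL_GOOD_QUALITY) == VPX_CODEC_OK)
    fwrite(vpx_svc_get_buffer(&svc_ctx), 1, vpx_svc_get_frame_size(&svc_ctx), stdout);

  printf("%s", vpx_svc_dump_statistics(&svc_ctx));
  vpx_codec_destroy(&codec);
  vpx_svc_release(&svc_ctx);
  return 0;
}
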
diff --git a/source/libvpx/vpx/vp8cx.h b/source/libvpx/vpx/vp8cx.h
index 9f68c38..c0424f1 100644
--- a/source/libvpx/vpx/vp8cx.h
+++ b/source/libvpx/vpx/vp8cx.h
@@ -193,14 +193,10 @@ enum vp8e_enc_control_id {
VP9E_SET_TILE_COLUMNS,
VP9E_SET_TILE_ROWS,
VP9E_SET_FRAME_PARALLEL_DECODING,
+ VP9E_SET_AQ_MODE,
- VP9E_SET_WIDTH = 99,
- VP9E_SET_HEIGHT,
- VP9E_SET_LAYER,
VP9E_SET_SVC,
-
- VP9E_SET_MAX_Q,
- VP9E_SET_MIN_Q
+ VP9E_SET_SVC_PARAMETERS
};
/*!\brief vpx 1-D scaling mode
@@ -283,6 +279,23 @@ typedef enum {
VP8_TUNE_SSIM
} vp8e_tuning;
+/*!\brief vp9 svc parameters
+ *
+ * This defines parameters for svc encoding.
+ *
+ */
+typedef struct vpx_svc_parameters {
+ unsigned int width; /**< width of current spatial layer */
+ unsigned int height; /**< height of current spatial layer */
+ int layer; /**< current layer number - 0 = base */
+ int flags; /**< encode frame flags */
+ int max_quantizer; /**< max quantizer for current layer */
+ int min_quantizer; /**< min quantizer for current layer */
+ int distance_from_i_frame; /**< frame number within current gop */
+ int lst_fb_idx; /**< last frame frame buffer index */
+ int gld_fb_idx; /**< golden frame frame buffer index */
+ int alt_fb_idx; /**< alt reference frame frame buffer index */
+} vpx_svc_parameters_t;
/*!\brief VP8 encoder control function parameter type
*
@@ -303,11 +316,8 @@ VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *)
VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *)
VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *)
-VPX_CTRL_USE_TYPE(VP9E_SET_LAYER, int *)
VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int)
-
-VPX_CTRL_USE_TYPE(VP9E_SET_WIDTH, unsigned int *)
-VPX_CTRL_USE_TYPE(VP9E_SET_HEIGHT, unsigned int *)
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, vpx_svc_parameters_t *)
VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int)
VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int)
@@ -334,8 +344,8 @@ VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)
-VPX_CTRL_USE_TYPE(VP9E_SET_MAX_Q, unsigned int)
-VPX_CTRL_USE_TYPE(VP9E_SET_MIN_Q, unsigned int)
+VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int)
+
/*! @} - end defgroup vp8_encoder */
#ifdef __cplusplus
} // extern "C"
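
For reference, a sketch of driving the renamed SVC controls; configure_base_layer and the numeric values are illustrative assumptions, and the encoder context is presumed to be initialized elsewhere:

#include <string.h>
#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

/* Illustrative helper: enable SVC on an initialized encoder and describe the
 * base spatial layer before encoding it. */
static void configure_base_layer(vpx_codec_ctx_t *codec) {
  vpx_svc_parameters_t params;
  memset(&params, 0, sizeof(params));
  params.width = 320;            /* dimensions of this spatial layer */
  params.height = 180;
  params.layer = 0;              /* 0 = base layer */
  params.min_quantizer = 33;
  params.max_quantizer = 60;

  vpx_codec_control(codec, VP9E_SET_SVC, 1);
  vpx_codec_control(codec, VP9E_SET_SVC_PARAMETERS, &params);
}
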
diff --git a/source/libvpx/vpx/vp8dx.h b/source/libvpx/vpx/vp8dx.h
index d3093c4..218f0b8 100644
--- a/source/libvpx/vpx/vp8dx.h
+++ b/source/libvpx/vpx/vp8dx.h
@@ -73,9 +73,19 @@ enum vp8_dec_control_id {
*/
VP8D_SET_DECRYPTOR,
+ /** control function to get the display dimensions for the current frame. */
+ VP9D_GET_DISPLAY_SIZE,
+
/** For testing. */
VP9_INVERT_TILE_DECODE_ORDER,
+  /** control function to make the vp9 decoder use the least recently used
+   * frame buffer when a new buffer is requested. Takes an int: zero (the
+   * default) disables the LRU cache, and any non-zero value enables it. */
+ VP9D_SET_FRAME_BUFFER_LRU_CACHE,
+
VP8_DECODER_CTRL_ID_MAX
};
@@ -105,7 +115,9 @@ VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_UPDATES, int *)
VPX_CTRL_USE_TYPE(VP8D_GET_FRAME_CORRUPTED, int *)
VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *)
VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vp8_decrypt_init *)
+VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *)
VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
+VPX_CTRL_USE_TYPE(VP9D_SET_FRAME_BUFFER_LRU_CACHE, int)
/*! @} - end defgroup vp8_decoder */
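
A short sketch of the two new decoder controls; query_display_size is a hypothetical helper and the decoder context is assumed to be an initialized VP9 decoder:

#include <stdio.h>
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

/* Illustrative helper: opt in to LRU frame buffer selection and report the
 * display size of the most recently decoded frame. */
static void query_display_size(vpx_codec_ctx_t *decoder) {
  int display_size[2] = { 0, 0 };

  /* Zero (the default) disables the LRU cache; any non-zero value enables it. */
  vpx_codec_control(decoder, VP9D_SET_FRAME_BUFFER_LRU_CACHE, 1);

  if (vpx_codec_control(decoder, VP9D_GET_DISPLAY_SIZE, display_size) == VPX_CODEC_OK)
    printf("display size: %dx%d\n", display_size[0], display_size[1]);
}
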
diff --git a/source/libvpx/vpx/vpx_codec.mk b/source/libvpx/vpx/vpx_codec.mk
index 3d5510f..df3ff6e 100644
--- a/source/libvpx/vpx/vpx_codec.mk
+++ b/source/libvpx/vpx/vpx_codec.mk
@@ -15,6 +15,8 @@ API_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h
API_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8.h
API_DOC_SRCS-$(CONFIG_VP8_ENCODER) += vp8cx.h
+API_SRCS-$(CONFIG_VP9_ENCODER) += src/svc_encodeframe.c
+API_SRCS-$(CONFIG_VP9_ENCODER) += svc_context.h
API_SRCS-$(CONFIG_VP8_DECODER) += vp8.h
API_SRCS-$(CONFIG_VP8_DECODER) += vp8dx.h
@@ -25,6 +27,7 @@ API_DOC_SRCS-yes += vpx_codec.h
API_DOC_SRCS-yes += vpx_decoder.h
API_DOC_SRCS-yes += vpx_encoder.h
API_DOC_SRCS-yes += vpx_image.h
+API_DOC_SRCS-yes += vpx_external_frame_buffer.h
API_SRCS-yes += src/vpx_decoder.c
API_SRCS-yes += vpx_decoder.h
@@ -36,4 +39,5 @@ API_SRCS-yes += src/vpx_image.c
API_SRCS-yes += vpx_codec.h
API_SRCS-yes += vpx_codec.mk
API_SRCS-yes += vpx_image.h
+API_SRCS-yes += vpx_external_frame_buffer.h
API_SRCS-$(BUILD_LIBVPX) += vpx_integer.h
diff --git a/source/libvpx/vpx/vpx_decoder.h b/source/libvpx/vpx/vpx_decoder.h
index 2dcd024..08f7f43 100644
--- a/source/libvpx/vpx/vpx_decoder.h
+++ b/source/libvpx/vpx/vpx_decoder.h
@@ -30,6 +30,7 @@ extern "C" {
#endif
#include "vpx_codec.h"
+#include "vpx_external_frame_buffer.h"
/*!\brief Current ABI version number
*
@@ -39,7 +40,7 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_DECODER_ABI_VERSION (2 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_DECODER_ABI_VERSION (3 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Decoder capabilities bitfield
*
@@ -66,6 +67,8 @@ extern "C" {
*/
#define VPX_CODEC_CAP_FRAME_THREADING 0x200000 /**< Can support frame-based
multi-threading */
+#define VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER 0x400000 /**< Can support external
+ frame buffers */
#define VPX_CODEC_USE_POSTPROC 0x10000 /**< Postprocess decoded frame */
#define VPX_CODEC_USE_ERROR_CONCEALMENT 0x20000 /**< Conceal errors in decoded
@@ -326,6 +329,49 @@ extern "C" {
/*!@} - end defgroup cap_put_slice*/
+ /*!\defgroup cap_external_frame_buffer External Frame Buffer Functions
+ *
+ * The following section is required to be implemented for all decoders
+ * that advertise the VPX_CODEC_CAP_EXTERNAL_FRAME_BUFFER capability.
+ * Calling this function for codecs that don't advertise this capability
+ * will result in an error code being returned, usually VPX_CODEC_ERROR.
+ *
+ * \note
+ * Currently this only works with VP9.
+ * @{
+ */
+
+ /*!\brief Pass in external frame buffers for the decoder to use.
+ *
+ * Registers a given function to be called when the current frame to
+ * decode will be bigger than the external frame buffer size. This
+ * function must be called before the first call to decode or libvpx
+ * will assume the default behavior of allocating frame buffers internally.
+ * Frame buffers with a size of 0 are valid.
+ *
+ * \param[in] ctx Pointer to this instance's context
+ * \param[in] fb_list Pointer to array of frame buffers
+ * \param[in] fb_count Number of elements in frame buffer array
+ * \param[in] cb Pointer to the callback function
+ * \param[in] user_priv User's private data
+ *
+ * \retval #VPX_CODEC_OK
+ * External frame buffers passed into the decoder.
+ * \retval #VPX_CODEC_ERROR
+ * Decoder context not initialized, or algorithm not capable of
+ * using external frame buffers.
+ *
+ * \note
+ * When decoding VP9, the application must pass in at least 8 external
+ * frame buffers, as VP9 can have up to 8 reference frames.
+ */
+ vpx_codec_err_t vpx_codec_set_frame_buffers(
+ vpx_codec_ctx_t *ctx,
+ vpx_codec_frame_buffer_t *fb_list, int fb_count,
+ vpx_realloc_frame_buffer_cb_fn_t cb, void *user_priv);
+
+ /*!@} - end defgroup cap_external_frame_buffer */
+
/*!@} - end defgroup decoder*/
#ifdef __cplusplus
}
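
A minimal sketch of wiring external frame buffers into a capable decoder, following the documentation above; grow_frame_buffer and use_external_buffers are hypothetical names, and a real application might hand out pooled or zero-copy memory instead of plain realloc:

#include <stdlib.h>
#include "vpx/vpx_decoder.h"

/* Grows one external frame buffer to at least new_size bytes. */
static int grow_frame_buffer(void *user_priv, size_t new_size,
                             vpx_codec_frame_buffer_t *fb) {
  uint8_t *data = (uint8_t *)realloc(fb->data, new_size);
  (void)user_priv;
  if (!data) return -1;          /* failure: return a value < 0 */
  fb->data = data;
  fb->size = new_size;           /* report the size actually allocated */
  return 0;
}

/* Must be called before the first decode. VP9 can hold up to 8 reference
 * frames, so at least 8 buffers are passed. Zero-sized buffers are valid;
 * libvpx invokes the callback to size them on demand. */
static vpx_codec_err_t use_external_buffers(vpx_codec_ctx_t *decoder) {
  static vpx_codec_frame_buffer_t buffers[8];
  return vpx_codec_set_frame_buffers(decoder, buffers, 8,
                                     grow_frame_buffer, NULL);
}
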
diff --git a/source/libvpx/vpx/vpx_external_frame_buffer.h b/source/libvpx/vpx/vpx_external_frame_buffer.h
new file mode 100644
index 0000000..adf1330
--- /dev/null
+++ b/source/libvpx/vpx/vpx_external_frame_buffer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_EXTERNAL_FRAME_BUFFER_H_
+#define VPX_VPX_EXTERNAL_FRAME_BUFFER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vpx/vpx_integer.h"
+
+/*!\brief External frame buffer
+ *
+ * This structure is used to hold external frame buffers passed into the
+ * decoder by the application.
+ */
+typedef struct vpx_codec_frame_buffer {
+ uint8_t *data; /**< Pointer to the data buffer */
+ size_t size; /**< Size of data in bytes */
+ void *frame_priv; /**< Frame's private data */
+} vpx_codec_frame_buffer_t;
+
+/*!\brief realloc frame buffer callback prototype
+ *
+ * This callback is invoked by the decoder to notify the application that one
+ * of the external frame buffers must increase in size in order for the
+ * decode call to complete. The callback must allocate at least new_size
+ * bytes and assign the allocation to fb->data, then set fb->size to the
+ * allocated size. The application does not need to align the allocated data.
+ * The callback is usually triggered by a frame size change. On success the
+ * callback must return 0. On failure the callback must return a value less
+ * than 0.
+ *
+ * \param[in] user_priv User's private data
+ * \param[in] new_size Size in bytes needed by the buffer.
+ * \param[in/out] fb Pointer to frame buffer to increase size.
+ */
+typedef int (*vpx_realloc_frame_buffer_cb_fn_t)(
+ void *user_priv, size_t new_size, vpx_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_EXTERNAL_FRAME_BUFFER_H_
diff --git a/source/libvpx/vpx_ports/vpx_once.h b/source/libvpx/vpx_ports/vpx_once.h
index 16a735c..6052c4d 100644
--- a/source/libvpx/vpx_ports/vpx_once.h
+++ b/source/libvpx/vpx_ports/vpx_once.h
@@ -7,6 +7,10 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+
+#ifndef VPX_ONCE_H
+#define VPX_ONCE_H
+
#include "vpx_config.h"
#if CONFIG_MULTITHREAD && defined(_WIN32)
@@ -95,3 +99,5 @@ static void once(void (*func)(void))
}
}
#endif
+
+#endif
diff --git a/source/libvpx/vpx_ports/x86.h b/source/libvpx/vpx_ports/x86.h
index 2990583..fdbed25 100644
--- a/source/libvpx/vpx_ports/x86.h
+++ b/source/libvpx/vpx_ports/x86.h
@@ -35,51 +35,63 @@ typedef enum {
#if defined(__GNUC__) && __GNUC__ || defined(__ANDROID__)
#if ARCH_X86_64
-#define cpuid(func,ax,bx,cx,dx)\
+#define cpuid(func, func2, ax, bx, cx, dx)\
__asm__ __volatile__ (\
"cpuid \n\t" \
: "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) \
- : "a" (func));
+ : "a" (func), "c" (func2));
#else
-#define cpuid(func,ax,bx,cx,dx)\
+#define cpuid(func, func2, ax, bx, cx, dx)\
__asm__ __volatile__ (\
"mov %%ebx, %%edi \n\t" \
"cpuid \n\t" \
"xchg %%edi, %%ebx \n\t" \
: "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
- : "a" (func));
+ : "a" (func), "c" (func2));
#endif
#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
#if ARCH_X86_64
-#define cpuid(func,ax,bx,cx,dx)\
+#define cpuid(func, func2, ax, bx, cx, dx)\
asm volatile (\
"xchg %rsi, %rbx \n\t" \
"cpuid \n\t" \
"movl %ebx, %edi \n\t" \
"xchg %rsi, %rbx \n\t" \
: "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
- : "a" (func));
+ : "a" (func), "c" (func2));
#else
-#define cpuid(func,ax,bx,cx,dx)\
+#define cpuid(func, func2, ax, bx, cx, dx)\
asm volatile (\
"pushl %ebx \n\t" \
"cpuid \n\t" \
"movl %ebx, %edi \n\t" \
"popl %ebx \n\t" \
: "=a" (ax), "=D" (bx), "=c" (cx), "=d" (dx) \
- : "a" (func));
+ : "a" (func), "c" (func2));
#endif
#else /* end __SUNPRO__ */
#if ARCH_X86_64
+#if defined(_MSC_VER) && _MSC_VER > 1500
+void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue);
+#pragma intrinsic(__cpuidex)
+#define cpuid(func, func2, a, b, c, d) do {\
+ int regs[4];\
+ __cpuidex(regs, func, func2); \
+ a = regs[0]; b = regs[1]; c = regs[2]; d = regs[3];\
+ } while(0)
+#else
void __cpuid(int CPUInfo[4], int info_type);
#pragma intrinsic(__cpuid)
-#define cpuid(func,a,b,c,d) do{\
+#define cpuid(func, func2, a, b, c, d) do {\
int regs[4];\
- __cpuid(regs,func); a=regs[0]; b=regs[1]; c=regs[2]; d=regs[3];\
- } while(0)
+ __cpuid(regs, func); \
+ a = regs[0]; b = regs[1]; c = regs[2]; d = regs[3];\
+ } while (0)
+#endif
#else
-#define cpuid(func,a,b,c,d)\
+#define cpuid(func, func2, a, b, c, d)\
__asm mov eax, func\
+ __asm mov ecx, func2\
__asm cpuid\
__asm mov a, eax\
__asm mov b, ebx\
@@ -120,13 +132,13 @@ x86_simd_caps(void) {
mask = strtol(env, NULL, 0);
/* Ensure that the CPUID instruction supports extended features */
- cpuid(0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ cpuid(0, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
if (reg_eax < 1)
return 0;
/* Get the standard feature flags */
- cpuid(1, reg_eax, reg_ebx, reg_ecx, reg_edx);
+ cpuid(1, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
if (reg_edx & BIT(23)) flags |= HAS_MMX;
@@ -142,6 +154,11 @@ x86_simd_caps(void) {
if (reg_ecx & BIT(28)) flags |= HAS_AVX;
+ /* Get the leaf 7 feature flags. Needed to check for AVX2 support */
+ reg_eax = 7;
+ reg_ecx = 0;
+ cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx);
+
if (reg_ebx & BIT(5)) flags |= HAS_AVX2;
return flags & mask;
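
The extra cpuid() argument exists because AVX2 is reported on CPUID leaf 7, sub-leaf 0 (EBX bit 5), which requires ECX to be set as well as EAX. A standalone sketch of the same check using GCC's <cpuid.h> helpers, which is an assumption outside this tree:

#include <stdio.h>
#include <cpuid.h>

int main(void) {
  unsigned int eax, ebx, ecx, edx;

  /* Confirm leaf 7 exists, then query sub-leaf 0 and test EBX bit 5 (AVX2). */
  if (__get_cpuid(0, &eax, &ebx, &ecx, &edx) && eax >= 7) {
    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    printf("AVX2: %s\n", (ebx & (1u << 5)) ? "yes" : "no");
  }
  return 0;
}
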
diff --git a/source/libvpx/vpx_ports/x86_cpuid.c b/source/libvpx/vpx_ports/x86_cpuid.c
index fe86cfc..02d382c 100644
--- a/source/libvpx/vpx_ports/x86_cpuid.c
+++ b/source/libvpx/vpx_ports/x86_cpuid.c
@@ -38,7 +38,7 @@ vpx_cpu_t vpx_x86_vendor(void) {
int i;
/* Get the Vendor String from the CPU */
- cpuid(0, reg_eax, vs[0], vs[2], vs[1]);
+ cpuid(0, 0, reg_eax, vs[0], vs[2], vs[1]);
for (i = 0; i < VPX_CPU_LAST; i++) {
if (strncmp((const char *)vs, cpuid_vendor_list[i].vendor_string, 12) == 0)
diff --git a/source/libvpx/vpx_scale/generic/yv12config.c b/source/libvpx/vpx_scale/generic/yv12config.c
index a89e29d..a020e19 100644
--- a/source/libvpx/vpx_scale/generic/yv12config.c
+++ b/source/libvpx/vpx_scale/generic/yv12config.c
@@ -19,10 +19,18 @@
/****************************************************************************
*
****************************************************************************/
+
+#define yv12_align_addr(addr, align) \
+ (void*)(((size_t)(addr) + ((align) - 1)) & (size_t)-(align))
+
int
vp8_yv12_de_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
if (ybf) {
- vpx_free(ybf->buffer_alloc);
+ // If libvpx is using external frame buffers then buffer_alloc_sz must
+ // not be set.
+ if (ybf->buffer_alloc_sz > 0) {
+ vpx_free(ybf->buffer_alloc);
+ }
/* buffer_alloc isn't accessed by most functions. Rather y_buffer,
u_buffer and v_buffer point to buffer_alloc and are used. Clear out
@@ -108,7 +116,9 @@ int vp8_yv12_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
if (ybf) {
- vpx_free(ybf->buffer_alloc);
+ if (ybf->buffer_alloc_sz > 0) {
+ vpx_free(ybf->buffer_alloc);
+ }
/* buffer_alloc isn't accessed by most functions. Rather y_buffer,
u_buffer and v_buffer point to buffer_alloc and are used. Clear out
@@ -123,7 +133,10 @@ int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf) {
int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height,
- int ss_x, int ss_y, int border) {
+ int ss_x, int ss_y, int border,
+ vpx_codec_frame_buffer_t *ext_fb,
+ vpx_realloc_frame_buffer_cb_fn_t cb,
+ void *user_priv) {
if (ybf) {
const int aligned_width = (width + 7) & ~7;
const int aligned_height = (height + 7) & ~7;
@@ -148,12 +161,36 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
#else
const int frame_size = yplane_size + 2 * uvplane_size;
#endif
- if (!ybf->buffer_alloc) {
- ybf->buffer_alloc = vpx_memalign(32, frame_size);
- ybf->buffer_alloc_sz = frame_size;
+
+ if (ext_fb != NULL) {
+ const int align_addr_extra_size = 31;
+ const int external_frame_size = frame_size + align_addr_extra_size;
+ if (external_frame_size > ext_fb->size) {
+ // Allocation to hold larger frame, or first allocation.
+ if (cb(user_priv, external_frame_size, ext_fb) < 0) {
+ return -1;
+ }
+
+ if (ext_fb->data == NULL || ext_fb->size < external_frame_size) {
+ return -1;
+ }
+
+ ybf->buffer_alloc = yv12_align_addr(ext_fb->data, 32);
+ }
+ } else {
+ if (frame_size > ybf->buffer_alloc_sz) {
+ // Allocation to hold larger frame, or first allocation.
+ if (ybf->buffer_alloc)
+ vpx_free(ybf->buffer_alloc);
+ ybf->buffer_alloc = vpx_memalign(32, frame_size);
+ ybf->buffer_alloc_sz = frame_size;
+ }
+
+ if (ybf->buffer_alloc_sz < frame_size)
+ return -1;
}
- if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size)
+ if (!ybf->buffer_alloc)
return -1;
/* Only support allocating buffers that have a border that's a multiple
@@ -203,7 +240,8 @@ int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int ss_x, int ss_y, int border) {
if (ybf) {
vp9_free_frame_buffer(ybf);
- return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border);
+ return vp9_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, border,
+ NULL, NULL, NULL);
}
return -2;
}
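
The external-buffer path asks the callback for frame_size + 31 bytes so that yv12_align_addr can round the start of the frame up to a 32-byte boundary without overrunning the allocation. A tiny sketch of that arithmetic (ALIGN_UP and the sizes are illustrative):

#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(addr, align) \
  ((uint8_t *)(((size_t)(addr) + ((align) - 1)) & ~((size_t)(align) - 1)))

int main(void) {
  enum { kFrameSize = 64, kAlign = 32 };
  uint8_t raw[kFrameSize + kAlign - 1];   /* frame_size + 31 extra bytes */
  uint8_t *frame = ALIGN_UP(raw, kAlign); /* 32-byte aligned start */

  /* Between 0 and 31 bytes are skipped, never more than the extra slack. */
  printf("leading bytes skipped: %d\n", (int)(frame - raw));
  return 0;
}
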
diff --git a/source/libvpx/vpx_scale/generic/yv12extend.c b/source/libvpx/vpx_scale/generic/yv12extend.c
index f2aec2b..7896dfe 100644
--- a/source/libvpx/vpx_scale/generic/yv12extend.c
+++ b/source/libvpx/vpx_scale/generic/yv12extend.c
@@ -84,14 +84,13 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) {
static void extend_frame(YV12_BUFFER_CONFIG *const ybf,
int subsampling_x, int subsampling_y,
int ext_size) {
- const int c_w = (ybf->y_crop_width + subsampling_x) >> subsampling_x;
- const int c_h = (ybf->y_crop_height + subsampling_y) >> subsampling_y;
- const int c_et = ext_size >> subsampling_y;
- const int c_el = ext_size >> subsampling_x;
- const int c_eb = (ext_size + ybf->y_height - ybf->y_crop_height +
- subsampling_y) >> subsampling_y;
- const int c_er = (ext_size + ybf->y_width - ybf->y_crop_width +
- subsampling_x) >> subsampling_x;
+ const int c_w = ybf->uv_crop_width;
+ const int c_h = ybf->uv_crop_height;
+ const int c_ext_size = ext_size >> 1;
+ const int c_et = c_ext_size;
+ const int c_el = c_ext_size;
+ const int c_eb = c_ext_size + ybf->uv_height - ybf->uv_crop_height;
+ const int c_er = c_ext_size + ybf->uv_width - ybf->uv_crop_width;
assert(ybf->y_height - ybf->y_crop_height < 16);
assert(ybf->y_width - ybf->y_crop_width < 16);
diff --git a/source/libvpx/vpx_scale/yv12config.h b/source/libvpx/vpx_scale/yv12config.h
index 0e950fb..f23e116 100644
--- a/source/libvpx/vpx_scale/yv12config.h
+++ b/source/libvpx/vpx_scale/yv12config.h
@@ -15,6 +15,7 @@
extern "C" {
#endif
+#include "vpx/vpx_external_frame_buffer.h"
#include "vpx/vpx_integer.h"
#define VP8BORDERINPIXELS 32
@@ -64,9 +65,20 @@ extern "C" {
int vp9_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height, int ss_x, int ss_y,
int border);
+
+  // Updates the yv12 buffer config with the frame buffer. If ext_fb is not
+  // NULL then libvpx is using external frame buffers. The function checks
+  // whether the frame buffer is big enough to fit the decoded frame and
+  // tries to reallocate it if it is not. If ext_fb is not NULL and the frame
+  // buffer is not big enough, libvpx will call cb with the minimum size
+  // needed in bytes.
+ //
+ // Returns 0 on success. Returns < 0 on failure.
int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf,
int width, int height, int ss_x, int ss_y,
- int border);
+ int border,
+ vpx_codec_frame_buffer_t *ext_fb,
+ vpx_realloc_frame_buffer_cb_fn_t cb,
+ void *user_priv);
int vp9_free_frame_buffer(YV12_BUFFER_CONFIG *ybf);
#ifdef __cplusplus
diff --git a/source/libvpx/vpxdec.c b/source/libvpx/vpxdec.c
index 1860474..91d8faf 100644
--- a/source/libvpx/vpxdec.c
+++ b/source/libvpx/vpxdec.c
@@ -8,10 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
-/* This is a simple program that reads ivf files and decodes them
- * using the new interface. Decoded frames are output as YV12 raw.
- */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
@@ -19,54 +15,48 @@
#include <string.h>
#include <limits.h>
+#include "third_party/libyuv/include/libyuv/scale.h"
+
+#include "./args.h"
+#include "./ivfdec.h"
+
#define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx_config.h"
+#include "./vpx_config.h"
#include "vpx/vpx_decoder.h"
#include "vpx_ports/vpx_timer.h"
+
#if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
#include "vpx/vp8dx.h"
#endif
-#if CONFIG_MD5
-#include "md5_utils.h"
-#endif
-#include "tools_common.h"
-#include "nestegg/include/nestegg/nestegg.h"
-#include "third_party/libyuv/include/libyuv/scale.h"
-#if CONFIG_OS_SUPPORT
-#if defined(_MSC_VER)
-#include <io.h>
-#define snprintf _snprintf
-#define isatty _isatty
-#define fileno _fileno
-#else
-#include <unistd.h>
-#endif
+#if CONFIG_MD5
+#include "./md5_utils.h"
#endif
-#ifndef PATH_MAX
-#define PATH_MAX 256
-#endif
+#include "./tools_common.h"
+#include "./webmdec.h"
static const char *exec_name;
-#define VP8_FOURCC (0x00385056)
-#define VP9_FOURCC (0x00395056)
static const struct {
char const *name;
const vpx_codec_iface_t *(*iface)(void);
- unsigned int fourcc;
- unsigned int fourcc_mask;
+ uint32_t fourcc;
+ uint32_t fourcc_mask;
} ifaces[] = {
#if CONFIG_VP8_DECODER
- {"vp8", vpx_codec_vp8_dx, VP8_FOURCC, 0x00FFFFFF},
+ {"vp8", vpx_codec_vp8_dx, VP8_FOURCC_MASK, 0x00FFFFFF},
#endif
#if CONFIG_VP9_DECODER
- {"vp9", vpx_codec_vp9_dx, VP9_FOURCC, 0x00FFFFFF},
+ {"vp9", vpx_codec_vp9_dx, VP9_FOURCC_MASK, 0x00FFFFFF},
#endif
};
-#include "args.h"
+struct VpxDecInputContext {
+ struct VpxInputContext *vpx_input_ctx;
+ struct WebmInputContext *webm_ctx;
+};
+
static const arg_def_t looparg = ARG_DEF(NULL, "loops", 1,
"Number of times to decode the file");
static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1,
@@ -99,6 +89,10 @@ static const arg_def_t error_concealment = ARG_DEF(NULL, "error-concealment", 0,
"Enable decoder error-concealment");
static const arg_def_t scalearg = ARG_DEF("S", "scale", 0,
"Scale output frames uniformly");
+static const arg_def_t fb_arg =
+ ARG_DEF(NULL, "frame-buffers", 1, "Number of frame buffers to use");
+static const arg_def_t fb_lru_arg =
+ ARG_DEF(NULL, "frame-buffers-lru", 1, "Turn on/off frame buffer lru");
#if CONFIG_MD5
@@ -108,7 +102,7 @@ static const arg_def_t md5arg = ARG_DEF(NULL, "md5", 0,
static const arg_def_t *all_args[] = {
&codecarg, &use_yv12, &use_i420, &flipuvarg, &noblitarg,
&progressarg, &limitarg, &skiparg, &postprocarg, &summaryarg, &outputfile,
- &threadsarg, &verbosearg, &scalearg,
+ &threadsarg, &verbosearg, &scalearg, &fb_arg, &fb_lru_arg,
#if CONFIG_MD5
&md5arg,
#endif
@@ -143,7 +137,7 @@ static const arg_def_t *vp8_pp_args[] = {
};
#endif
-static void usage_exit() {
+void usage_exit() {
int i;
fprintf(stderr, "Usage: %s <options> filename\n\n"
@@ -178,131 +172,61 @@ static void usage_exit() {
exit(EXIT_FAILURE);
}
-void die(const char *fmt, ...) {
- va_list ap;
- va_start(ap, fmt);
- vfprintf(stderr, fmt, ap);
- fprintf(stderr, "\n");
- usage_exit();
-}
-
-static unsigned int mem_get_le16(const void *vmem) {
- unsigned int val;
- const unsigned char *mem = (const unsigned char *)vmem;
-
- val = mem[1] << 8;
- val |= mem[0];
- return val;
-}
-
-static unsigned int mem_get_le32(const void *vmem) {
- unsigned int val;
- const unsigned char *mem = (const unsigned char *)vmem;
-
- val = mem[3] << 24;
- val |= mem[2] << 16;
- val |= mem[1] << 8;
- val |= mem[0];
- return val;
-}
-
-enum file_kind {
- RAW_FILE,
- IVF_FILE,
- WEBM_FILE
-};
-
-struct input_ctx {
- enum file_kind kind;
- FILE *infile;
- nestegg *nestegg_ctx;
- nestegg_packet *pkt;
- unsigned int chunk;
- unsigned int chunks;
- unsigned int video_track;
-};
-
-#define IVF_FRAME_HDR_SZ (sizeof(uint32_t) + sizeof(uint64_t))
-#define RAW_FRAME_HDR_SZ (sizeof(uint32_t))
-static int read_frame(struct input_ctx *input,
- uint8_t **buf,
- size_t *buf_sz,
- size_t *buf_alloc_sz) {
- char raw_hdr[IVF_FRAME_HDR_SZ];
- size_t new_buf_sz;
- FILE *infile = input->infile;
- enum file_kind kind = input->kind;
- if (kind == WEBM_FILE) {
- if (input->chunk >= input->chunks) {
- unsigned int track;
-
- do {
- /* End of this packet, get another. */
- if (input->pkt)
- nestegg_free_packet(input->pkt);
-
- if (nestegg_read_packet(input->nestegg_ctx, &input->pkt) <= 0
- || nestegg_packet_track(input->pkt, &track))
- return 1;
-
- } while (track != input->video_track);
-
- if (nestegg_packet_count(input->pkt, &input->chunks))
- return 1;
- input->chunk = 0;
- }
-
- if (nestegg_packet_data(input->pkt, input->chunk, buf, buf_sz))
- return 1;
- input->chunk++;
-
- return 0;
- }
- /* For both the raw and ivf formats, the frame size is the first 4 bytes
- * of the frame header. We just need to special case on the header
- * size.
- */
- else if (fread(raw_hdr, kind == IVF_FILE
- ? IVF_FRAME_HDR_SZ : RAW_FRAME_HDR_SZ, 1, infile) != 1) {
- if (!feof(infile))
- fprintf(stderr, "Failed to read frame size\n");
-
- new_buf_sz = 0;
- } else {
- new_buf_sz = mem_get_le32(raw_hdr);
+static int read_frame(struct VpxDecInputContext *input,
+ uint8_t **buf,
+ size_t *bytes_in_buffer,
+ size_t *buffer_size) {
+ char raw_hdr[RAW_FRAME_HDR_SZ];
+ size_t bytes_to_read = 0;
+ FILE *infile = input->vpx_input_ctx->file;
+ enum VideoFileType kind = input->vpx_input_ctx->file_type;
+ if (kind == FILE_TYPE_WEBM) {
+ return webm_read_frame(input->webm_ctx,
+ buf, bytes_in_buffer, buffer_size);
+ } else if (kind == FILE_TYPE_RAW) {
+ if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) {
+ if (!feof(infile))
+ warn("Failed to read RAW frame size\n");
+ } else {
+ const int kCorruptFrameThreshold = 256 * 1024 * 1024;
+ const int kFrameTooSmallThreshold = 256 * 1024;
+ bytes_to_read = mem_get_le32(raw_hdr);
- if (new_buf_sz > 256 * 1024 * 1024) {
- fprintf(stderr, "Error: Read invalid frame size (%u)\n",
- (unsigned int)new_buf_sz);
- new_buf_sz = 0;
- }
+ if (bytes_to_read > kCorruptFrameThreshold) {
+ warn("Read invalid frame size (%u)\n", (unsigned int)bytes_to_read);
+ bytes_to_read = 0;
+ }
- if (kind == RAW_FILE && new_buf_sz > 256 * 1024)
- fprintf(stderr, "Warning: Read invalid frame size (%u)"
- " - not a raw file?\n", (unsigned int)new_buf_sz);
+ if (kind == FILE_TYPE_RAW && bytes_to_read < kFrameTooSmallThreshold) {
+ warn("Warning: Read invalid frame size (%u) - not a raw file?\n",
+ (unsigned int)bytes_to_read);
+ }
- if (new_buf_sz > *buf_alloc_sz) {
- uint8_t *new_buf = realloc(*buf, 2 * new_buf_sz);
+ if (bytes_to_read > *buffer_size) {
+ uint8_t *new_buf = realloc(*buf, 2 * bytes_to_read);
- if (new_buf) {
- *buf = new_buf;
- *buf_alloc_sz = 2 * new_buf_sz;
- } else {
- fprintf(stderr, "Failed to allocate compressed data buffer\n");
- new_buf_sz = 0;
+ if (new_buf) {
+ *buf = new_buf;
+ *buffer_size = 2 * bytes_to_read;
+ } else {
+ warn("Failed to allocate compressed data buffer\n");
+ bytes_to_read = 0;
+ }
}
}
- }
-
- *buf_sz = new_buf_sz;
- if (!feof(infile)) {
- if (fread(*buf, 1, *buf_sz, infile) != *buf_sz) {
- fprintf(stderr, "Failed to read full frame\n");
- return 1;
+ if (!feof(infile)) {
+ if (fread(*buf, 1, bytes_to_read, infile) != bytes_to_read) {
+ warn("Failed to read full frame\n");
+ return 1;
+ }
+ *bytes_in_buffer = bytes_to_read;
}
return 0;
+ } else if (kind == FILE_TYPE_IVF) {
+ return ivf_read_frame(input->vpx_input_ctx,
+ buf, bytes_in_buffer, buffer_size);
}
return 1;
@@ -322,8 +246,7 @@ void *out_open(const char *out_fn, int do_md5) {
: set_binary_mode(stdout);
if (!outfile) {
- fprintf(stderr, "Failed to output file");
- exit(EXIT_FAILURE);
+ fatal("Failed to output file");
}
}
@@ -359,253 +282,66 @@ void out_close(void *out, const char *out_fn, int do_md5) {
}
}
-unsigned int file_is_ivf(FILE *infile,
- unsigned int *fourcc,
- unsigned int *width,
- unsigned int *height,
- unsigned int *fps_den,
- unsigned int *fps_num) {
- char raw_hdr[32];
- int is_ivf = 0;
-
- if (fread(raw_hdr, 1, 32, infile) == 32) {
- if (raw_hdr[0] == 'D' && raw_hdr[1] == 'K'
- && raw_hdr[2] == 'I' && raw_hdr[3] == 'F') {
- is_ivf = 1;
-
- if (mem_get_le16(raw_hdr + 4) != 0)
- fprintf(stderr, "Error: Unrecognized IVF version! This file may not"
- " decode properly.");
-
- *fourcc = mem_get_le32(raw_hdr + 8);
- *width = mem_get_le16(raw_hdr + 12);
- *height = mem_get_le16(raw_hdr + 14);
- *fps_num = mem_get_le32(raw_hdr + 16);
- *fps_den = mem_get_le32(raw_hdr + 20);
-
- /* Some versions of vpxenc used 1/(2*fps) for the timebase, so
- * we can guess the framerate using only the timebase in this
- * case. Other files would require reading ahead to guess the
- * timebase, like we do for webm.
- */
- if (*fps_num < 1000) {
- /* Correct for the factor of 2 applied to the timebase in the
- * encoder.
- */
- if (*fps_num & 1)*fps_den <<= 1;
- else *fps_num >>= 1;
- } else {
- /* Don't know FPS for sure, and don't have readahead code
- * (yet?), so just default to 30fps.
- */
- *fps_num = 30;
- *fps_den = 1;
- }
- }
- }
-
- if (!is_ivf)
- rewind(infile);
-
- return is_ivf;
-}
-
-
-unsigned int file_is_raw(FILE *infile,
- unsigned int *fourcc,
- unsigned int *width,
- unsigned int *height,
- unsigned int *fps_den,
- unsigned int *fps_num) {
- unsigned char buf[32];
+int file_is_raw(struct VpxInputContext *input) {
+ uint8_t buf[32];
int is_raw = 0;
vpx_codec_stream_info_t si;
si.sz = sizeof(si);
- if (fread(buf, 1, 32, infile) == 32) {
+ if (fread(buf, 1, 32, input->file) == 32) {
int i;
- if (mem_get_le32(buf) < 256 * 1024 * 1024)
- for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
+ if (mem_get_le32(buf) < 256 * 1024 * 1024) {
+ for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++) {
if (!vpx_codec_peek_stream_info(ifaces[i].iface(),
buf + 4, 32 - 4, &si)) {
is_raw = 1;
- *fourcc = ifaces[i].fourcc;
- *width = si.w;
- *height = si.h;
- *fps_num = 30;
- *fps_den = 1;
+ input->fourcc = ifaces[i].fourcc;
+ input->width = si.w;
+ input->height = si.h;
+ input->framerate.numerator = 30;
+ input->framerate.denominator = 1;
break;
}
- }
-
- rewind(infile);
- return is_raw;
-}
-
-
-static int
-nestegg_read_cb(void *buffer, size_t length, void *userdata) {
- FILE *f = userdata;
-
- if (fread(buffer, 1, length, f) < length) {
- if (ferror(f))
- return -1;
- if (feof(f))
- return 0;
- }
- return 1;
-}
-
-
-static int
-nestegg_seek_cb(int64_t offset, int whence, void *userdata) {
- switch (whence) {
- case NESTEGG_SEEK_SET:
- whence = SEEK_SET;
- break;
- case NESTEGG_SEEK_CUR:
- whence = SEEK_CUR;
- break;
- case NESTEGG_SEEK_END:
- whence = SEEK_END;
- break;
- };
- return fseek(userdata, (long)offset, whence) ? -1 : 0;
-}
-
-
-static int64_t
-nestegg_tell_cb(void *userdata) {
- return ftell(userdata);
-}
-
-
-static void
-nestegg_log_cb(nestegg *context, unsigned int severity, char const *format,
- ...) {
- va_list ap;
-
- va_start(ap, format);
- vfprintf(stderr, format, ap);
- fprintf(stderr, "\n");
- va_end(ap);
-}
-
-
-static int
-webm_guess_framerate(struct input_ctx *input,
- unsigned int *fps_den,
- unsigned int *fps_num) {
- unsigned int i;
- uint64_t tstamp = 0;
-
- /* Check to see if we can seek before we parse any data. */
- if (nestegg_track_seek(input->nestegg_ctx, input->video_track, 0)) {
- fprintf(stderr,
- "WARNING: Failed to guess framerate (no Cues), set to 30fps.\n");
- *fps_num = 30;
- *fps_den = 1;
- return 0;
- }
-
- /* Guess the framerate. Read up to 1 second, or 50 video packets,
- * whichever comes first.
- */
- for (i = 0; tstamp < 1000000000 && i < 50;) {
- nestegg_packet *pkt;
- unsigned int track;
-
- if (nestegg_read_packet(input->nestegg_ctx, &pkt) <= 0)
- break;
-
- nestegg_packet_track(pkt, &track);
- if (track == input->video_track) {
- nestegg_packet_tstamp(pkt, &tstamp);
- i++;
+ }
}
-
- nestegg_free_packet(pkt);
- }
-
- if (nestegg_track_seek(input->nestegg_ctx, input->video_track, 0))
- goto fail;
-
- *fps_num = (i - 1) * 1000000;
- *fps_den = (unsigned int)(tstamp / 1000);
- return 0;
-fail:
- nestegg_destroy(input->nestegg_ctx);
- input->nestegg_ctx = NULL;
- rewind(input->infile);
- return 1;
-}
-
-
-static int
-file_is_webm(struct input_ctx *input,
- unsigned int *fourcc,
- unsigned int *width,
- unsigned int *height,
- unsigned int *fps_den,
- unsigned int *fps_num) {
- unsigned int i, n;
- int track_type = -1;
- int codec_id;
-
- nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb, 0};
- nestegg_video_params params;
-
- io.userdata = input->infile;
- if (nestegg_init(&input->nestegg_ctx, io, NULL))
- goto fail;
-
- if (nestegg_track_count(input->nestegg_ctx, &n))
- goto fail;
-
- for (i = 0; i < n; i++) {
- track_type = nestegg_track_type(input->nestegg_ctx, i);
-
- if (track_type == NESTEGG_TRACK_VIDEO)
- break;
- else if (track_type < 0)
- goto fail;
}
- codec_id = nestegg_track_codec_id(input->nestegg_ctx, i);
- if (codec_id == NESTEGG_CODEC_VP8) {
- *fourcc = VP8_FOURCC;
- } else if (codec_id == NESTEGG_CODEC_VP9) {
- *fourcc = VP9_FOURCC;
- } else {
- fprintf(stderr, "Not VPx video, quitting.\n");
- exit(1);
- }
-
- input->video_track = i;
-
- if (nestegg_track_video_params(input->nestegg_ctx, i, &params))
- goto fail;
-
- *fps_den = 0;
- *fps_num = 0;
- *width = params.width;
- *height = params.height;
- return 1;
-fail:
- input->nestegg_ctx = NULL;
- rewind(input->infile);
- return 0;
+ rewind(input->file);
+ return is_raw;
}
-
void show_progress(int frame_in, int frame_out, unsigned long dx_time) {
fprintf(stderr, "%d decoded frames/%d showed frames in %lu us (%.2f fps)\r",
frame_in, frame_out, dx_time,
(float)frame_out * 1000000.0 / (float)dx_time);
}
+// Called by libvpx if the frame buffer size needs to increase.
+//
+// Parameters:
+// user_priv Data passed into libvpx.
+// new_size Minimum size needed by libvpx to decompress the next frame.
+// fb Pointer to the frame buffer to update.
+//
+// Returns 0 on success. Returns < 0 on failure.
+int realloc_vp9_frame_buffer(void *user_priv, size_t new_size,
+ vpx_codec_frame_buffer_t *fb) {
+ (void)user_priv;
+ if (!fb)
+ return -1;
+
+ free(fb->data);
+ fb->data = (uint8_t*)malloc(new_size);
+ if (!fb->data) {
+ fb->size = 0;
+ return -1;
+ }
+
+ fb->size = new_size;
+ return 0;
+}
void generate_filename(const char *pattern, char *out, size_t q_len,
unsigned int d_w, unsigned int d_h,
@@ -688,18 +424,18 @@ void generate_filename(const char *pattern, char *out, size_t q_len,
int main_loop(int argc, const char **argv_) {
- vpx_codec_ctx_t decoder;
+ vpx_codec_ctx_t decoder;
char *fn = NULL;
int i;
uint8_t *buf = NULL;
- size_t buf_sz = 0, buf_alloc_sz = 0;
+ size_t bytes_in_buffer = 0, buffer_size = 0;
FILE *infile;
- int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0, do_md5 = 0, progress = 0;
+ int frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0;
+ int do_md5 = 0, progress = 0;
int stop_after = 0, postproc = 0, summary = 0, quiet = 1;
int arg_skip = 0;
int ec_enabled = 0;
vpx_codec_iface_t *iface = NULL;
- unsigned int fourcc;
unsigned long dx_time = 0;
struct arg arg;
char **argv, **argi, **argj;
@@ -707,10 +443,6 @@ int main_loop(int argc, const char **argv_) {
char outfile[PATH_MAX];
int single_file;
int use_y4m = 1;
- unsigned int width;
- unsigned int height;
- unsigned int fps_den;
- unsigned int fps_num;
void *out = NULL;
vpx_codec_dec_cfg_t cfg = {0};
#if CONFIG_VP8_DECODER
@@ -720,13 +452,20 @@ int main_loop(int argc, const char **argv_) {
int vp8_dbg_color_b_modes = 0;
int vp8_dbg_display_mv = 0;
#endif
- struct input_ctx input = {0};
int frames_corrupted = 0;
int dec_flags = 0;
int do_scale = 0;
- int stream_w = 0, stream_h = 0;
vpx_image_t *scaled_img = NULL;
int frame_avail, got_data;
+ int num_external_frame_buffers = 0;
+ int fb_lru_cache = 0;
+ vpx_codec_frame_buffer_t *frame_buffers = NULL;
+
+ struct VpxDecInputContext input = {0};
+ struct VpxInputContext vpx_input_ctx = {0};
+ struct WebmInputContext webm_ctx = {0};
+ input.vpx_input_ctx = &vpx_input_ctx;
+ input.webm_ctx = &webm_ctx;
/* Parse command line */
exec_name = argv_[0];
@@ -780,6 +519,10 @@ int main_loop(int argc, const char **argv_) {
quiet = 0;
else if (arg_match(&arg, &scalearg, argi))
do_scale = 1;
+ else if (arg_match(&arg, &fb_arg, argi))
+ num_external_frame_buffers = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &fb_lru_arg, argi))
+ fb_lru_cache = arg_parse_uint(&arg);
#if CONFIG_VP8_DECODER
else if (arg_match(&arg, &addnoise_level, argi)) {
@@ -865,14 +608,13 @@ int main_loop(int argc, const char **argv_) {
return EXIT_FAILURE;
}
#endif
- input.infile = infile;
- if (file_is_ivf(infile, &fourcc, &width, &height, &fps_den,
- &fps_num))
- input.kind = IVF_FILE;
- else if (file_is_webm(&input, &fourcc, &width, &height, &fps_den, &fps_num))
- input.kind = WEBM_FILE;
- else if (file_is_raw(infile, &fourcc, &width, &height, &fps_den, &fps_num))
- input.kind = RAW_FILE;
+ input.vpx_input_ctx->file = infile;
+ if (file_is_ivf(input.vpx_input_ctx))
+ input.vpx_input_ctx->file_type = FILE_TYPE_IVF;
+ else if (file_is_webm(input.webm_ctx, input.vpx_input_ctx))
+ input.vpx_input_ctx->file_type = FILE_TYPE_WEBM;
+ else if (file_is_raw(input.vpx_input_ctx))
+ input.vpx_input_ctx->file_type = FILE_TYPE_RAW;
else {
fprintf(stderr, "Unrecognized input file type.\n");
return EXIT_FAILURE;
@@ -899,7 +641,7 @@ int main_loop(int argc, const char **argv_) {
if (single_file && !noblit) {
generate_filename(outfile_pattern, outfile, sizeof(outfile) - 1,
- width, height, 0);
+ vpx_input_ctx.width, vpx_input_ctx.height, 0);
out = out_open(outfile, do_md5);
}
@@ -912,8 +654,8 @@ int main_loop(int argc, const char **argv_) {
return EXIT_FAILURE;
}
- if (input.kind == WEBM_FILE)
- if (webm_guess_framerate(&input, &fps_den, &fps_num)) {
+ if (vpx_input_ctx.file_type == FILE_TYPE_WEBM)
+ if (webm_guess_framerate(input.webm_ctx, input.vpx_input_ctx)) {
fprintf(stderr, "Failed to guess framerate -- error parsing "
"webm file?\n");
return EXIT_FAILURE;
@@ -924,21 +666,23 @@ int main_loop(int argc, const char **argv_) {
store one, and neither does VP8.
That will have to wait until these tools support WebM natively.*/
snprintf(buffer, sizeof(buffer), "YUV4MPEG2 W%u H%u F%u:%u I%c ",
- width, height, fps_num, fps_den, 'p');
+ vpx_input_ctx.width, vpx_input_ctx.height,
+ vpx_input_ctx.framerate.numerator,
+ vpx_input_ctx.framerate.denominator,
+ 'p');
out_put(out, (unsigned char *)buffer,
(unsigned int)strlen(buffer), do_md5);
}
/* Try to determine the codec from the fourcc. */
for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
- if ((fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc) {
- vpx_codec_iface_t *ivf_iface = ifaces[i].iface();
+ if ((vpx_input_ctx.fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc) {
+ vpx_codec_iface_t *vpx_iface = ifaces[i].iface();
- if (iface && iface != ivf_iface)
- fprintf(stderr, "Notice -- IVF header indicates codec: %s\n",
- ifaces[i].name);
+ if (iface && iface != vpx_iface)
+ warn("Header indicates codec: %s\n", ifaces[i].name);
else
- iface = ivf_iface;
+ iface = vpx_iface;
break;
}
@@ -988,14 +732,38 @@ int main_loop(int argc, const char **argv_) {
#endif
- if(arg_skip)
- fprintf(stderr, "Skiping first %d frames.\n", arg_skip);
+ if (arg_skip)
+ fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
while (arg_skip) {
- if (read_frame(&input, &buf, &buf_sz, &buf_alloc_sz))
+ if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size))
break;
arg_skip--;
}
+ if (num_external_frame_buffers > 0) {
+    // Allocate the frame buffer list, setting all of the values to 0,
+    // including the size of the frame buffers. Libvpx will ask the
+    // application to realloc the frame buffer data if the size is too small.
+ frame_buffers = (vpx_codec_frame_buffer_t*)calloc(
+ num_external_frame_buffers, sizeof(*frame_buffers));
+ if (vpx_codec_set_frame_buffers(&decoder, frame_buffers,
+ num_external_frame_buffers,
+ realloc_vp9_frame_buffer,
+ NULL)) {
+ fprintf(stderr, "Failed to configure external frame buffers: %s\n",
+ vpx_codec_error(&decoder));
+ return EXIT_FAILURE;
+ }
+ }
+
+ if (fb_lru_cache > 0 &&
+ vpx_codec_control(&decoder, VP9D_SET_FRAME_BUFFER_LRU_CACHE,
+ fb_lru_cache)) {
+ fprintf(stderr, "Failed to set frame buffer lru cache: %s\n",
+ vpx_codec_error(&decoder));
+ return EXIT_FAILURE;
+ }
+
frame_avail = 1;
got_data = 0;
@@ -1008,19 +776,19 @@ int main_loop(int argc, const char **argv_) {
frame_avail = 0;
if (!stop_after || frame_in < stop_after) {
- if(!read_frame(&input, &buf, &buf_sz, &buf_alloc_sz)) {
+ if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) {
frame_avail = 1;
frame_in++;
vpx_usec_timer_start(&timer);
- if (vpx_codec_decode(&decoder, buf, (unsigned int)buf_sz, NULL, 0)) {
+ if (vpx_codec_decode(&decoder, buf, bytes_in_buffer, NULL, 0)) {
const char *detail = vpx_codec_error_detail(&decoder);
- fprintf(stderr, "Failed to decode frame: %s\n",
- vpx_codec_error(&decoder));
+ warn("Failed to decode frame %d: %s",
+ frame_in, vpx_codec_error(&decoder));
if (detail)
- fprintf(stderr, " Additional information: %s\n", detail);
+ warn("Additional information: %s", detail);
goto fail;
}
@@ -1041,8 +809,7 @@ int main_loop(int argc, const char **argv_) {
dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
if (vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
- fprintf(stderr, "Failed VP8_GET_FRAME_CORRUPTED: %s\n",
- vpx_codec_error(&decoder));
+ warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
goto fail;
}
frames_corrupted += corrupted;
@@ -1063,9 +830,18 @@ int main_loop(int argc, const char **argv_) {
}
if (do_scale) {
+ int stream_w = 0, stream_h = 0;
if (img && frame_out == 1) {
- stream_w = img->d_w;
- stream_h = img->d_h;
+ int display_size[2];
+ if (vpx_codec_control(&decoder, VP9D_GET_DISPLAY_SIZE,
+ display_size)) {
+ // Fallback to use raw image size if display size not available.
+ stream_w = img->d_w;
+ stream_h = img->d_h;
+ } else {
+ stream_w = display_size[0];
+ stream_h = display_size[1];
+ }
scaled_img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420,
stream_w, stream_h, 16);
}
@@ -1086,7 +862,6 @@ int main_loop(int argc, const char **argv_) {
img = scaled_img;
}
}
-
if (img) {
unsigned int y;
char out_fn[PATH_MAX];
@@ -1149,17 +924,25 @@ int main_loop(int argc, const char **argv_) {
fail:
if (vpx_codec_destroy(&decoder)) {
- fprintf(stderr, "Failed to destroy decoder: %s\n", vpx_codec_error(&decoder));
+ fprintf(stderr, "Failed to destroy decoder: %s\n",
+ vpx_codec_error(&decoder));
return EXIT_FAILURE;
}
if (single_file && !noblit)
out_close(out, outfile, do_md5);
- if (input.nestegg_ctx)
- nestegg_destroy(input.nestegg_ctx);
- if (input.kind != WEBM_FILE)
+ if (input.vpx_input_ctx->file_type == FILE_TYPE_WEBM)
+ webm_free(input.webm_ctx);
+ else
free(buf);
+
+ if (scaled_img) vpx_img_free(scaled_img);
+ for (i = 0; i < num_external_frame_buffers; ++i) {
+ free(frame_buffers[i].data);
+ }
+ free(frame_buffers);
+
fclose(infile);
free(argv);
diff --git a/source/libvpx/vpxenc.c b/source/libvpx/vpxenc.c
index aa99c6b..d0ed9b5 100644
--- a/source/libvpx/vpxenc.c
+++ b/source/libvpx/vpxenc.c
@@ -8,31 +8,26 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
-
-#if defined(_WIN32) || defined(__OS2__) || !CONFIG_OS_SUPPORT
-#define USE_POSIX_MMAP 0
-#else
-#define USE_POSIX_MMAP 1
-#endif
+#include "./vpxenc.h"
+#include "./vpx_config.h"
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
-#include <stdarg.h>
#include <string.h>
-#include <limits.h>
-#include <assert.h>
+
#include "vpx/vpx_encoder.h"
#if CONFIG_DECODERS
#include "vpx/vpx_decoder.h"
#endif
-#if USE_POSIX_MMAP
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <unistd.h>
-#endif
+
+#include "third_party/libyuv/include/libyuv/scale.h"
+#include "./args.h"
+#include "./ivfdec.h"
+#include "./ivfenc.h"
#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
#include "vpx/vp8cx.h"
@@ -41,37 +36,13 @@
#include "vpx/vp8dx.h"
#endif
+#include "./tools_common.h"
#include "vpx_ports/mem_ops.h"
#include "vpx_ports/vpx_timer.h"
-#include "tools_common.h"
-#include "y4minput.h"
-#include "third_party/libmkv/EbmlWriter.h"
-#include "third_party/libmkv/EbmlIDs.h"
-#include "third_party/libyuv/include/libyuv/scale.h"
-
-/* Need special handling of these functions on Windows */
-#if defined(_MSC_VER)
-/* MSVS doesn't define off_t, and uses _f{seek,tell}i64 */
-typedef __int64 off_t;
-#define fseeko _fseeki64
-#define ftello _ftelli64
-#elif defined(_WIN32)
-/* MinGW defines off_t as long
- and uses f{seek,tell}o64/off64_t for large files */
-#define fseeko fseeko64
-#define ftello ftello64
-#define off_t off64_t
-#endif
-
-#define LITERALU64(hi,lo) ((((uint64_t)hi)<<32)|lo)
-
-/* We should use 32-bit file operations in WebM file format
- * when building ARM executable file (.axf) with RVCT */
-#if !CONFIG_OS_SUPPORT
-typedef long off_t;
-#define fseeko fseek
-#define ftello ftell
-#endif
+#include "./vpxstats.h"
+#include "./warnings.h"
+#include "./webmenc.h"
+#include "./y4minput.h"
/* Swallow warnings about unused results of fread/fwrite */
static size_t wrap_fread(void *ptr, size_t size, size_t nmemb,
@@ -89,8 +60,6 @@ static size_t wrap_fwrite(const void *ptr, size_t size, size_t nmemb,
static const char *exec_name;
-#define VP8_FOURCC (0x30385056)
-#define VP9_FOURCC (0x30395056)
static const struct codec_item {
char const *name;
const vpx_codec_iface_t *(*iface)(void);
@@ -109,37 +78,6 @@ static const struct codec_item {
#endif
};
-static void usage_exit();
-
-#define LOG_ERROR(label) do \
- {\
- const char *l=label;\
- va_list ap;\
- va_start(ap, fmt);\
- if(l)\
- fprintf(stderr, "%s: ", l);\
- vfprintf(stderr, fmt, ap);\
- fprintf(stderr, "\n");\
- va_end(ap);\
- } while(0)
-
-void die(const char *fmt, ...) {
- LOG_ERROR(NULL);
- usage_exit();
-}
-
-
-void fatal(const char *fmt, ...) {
- LOG_ERROR("Fatal");
- exit(EXIT_FAILURE);
-}
-
-
-void warn(const char *fmt, ...) {
- LOG_ERROR("Warning");
-}
-
-
static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal,
const char *s, va_list ap) {
if (ctx->err) {
@@ -173,699 +111,28 @@ static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal,
va_end(ap);
}
-/* This structure is used to abstract the different ways of handling
- * first pass statistics.
- */
-typedef struct {
- vpx_fixed_buf_t buf;
- int pass;
- FILE *file;
- char *buf_ptr;
- size_t buf_alloc_sz;
-} stats_io_t;
-
-int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
- int res;
-
- stats->pass = pass;
-
- if (pass == 0) {
- stats->file = fopen(fpf, "wb");
- stats->buf.sz = 0;
- stats->buf.buf = NULL,
- res = (stats->file != NULL);
- } else {
-#if 0
-#elif USE_POSIX_MMAP
- struct stat stat_buf;
- int fd;
-
- fd = open(fpf, O_RDONLY);
- stats->file = fdopen(fd, "rb");
- fstat(fd, &stat_buf);
- stats->buf.sz = stat_buf.st_size;
- stats->buf.buf = mmap(NULL, stats->buf.sz, PROT_READ, MAP_PRIVATE,
- fd, 0);
- res = (stats->buf.buf != NULL);
-#else
- size_t nbytes;
-
- stats->file = fopen(fpf, "rb");
-
- if (fseek(stats->file, 0, SEEK_END))
- fatal("First-pass stats file must be seekable!");
-
- stats->buf.sz = stats->buf_alloc_sz = ftell(stats->file);
- rewind(stats->file);
-
- stats->buf.buf = malloc(stats->buf_alloc_sz);
-
- if (!stats->buf.buf)
- fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
- (unsigned long)stats->buf_alloc_sz);
-
- nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
- res = (nbytes == stats->buf.sz);
-#endif
- }
-
- return res;
-}
-
-int stats_open_mem(stats_io_t *stats, int pass) {
- int res;
- stats->pass = pass;
-
- if (!pass) {
- stats->buf.sz = 0;
- stats->buf_alloc_sz = 64 * 1024;
- stats->buf.buf = malloc(stats->buf_alloc_sz);
- }
-
- stats->buf_ptr = stats->buf.buf;
- res = (stats->buf.buf != NULL);
- return res;
-}
-
-
-void stats_close(stats_io_t *stats, int last_pass) {
- if (stats->file) {
- if (stats->pass == last_pass) {
-#if 0
-#elif USE_POSIX_MMAP
- munmap(stats->buf.buf, stats->buf.sz);
-#else
- free(stats->buf.buf);
-#endif
- }
-
- fclose(stats->file);
- stats->file = NULL;
- } else {
- if (stats->pass == last_pass)
- free(stats->buf.buf);
- }
-}
-
-void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
- if (stats->file) {
- (void) fwrite(pkt, 1, len, stats->file);
- } else {
- if (stats->buf.sz + len > stats->buf_alloc_sz) {
- size_t new_sz = stats->buf_alloc_sz + 64 * 1024;
- char *new_ptr = realloc(stats->buf.buf, new_sz);
-
- if (new_ptr) {
- stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
- stats->buf.buf = new_ptr;
- stats->buf_alloc_sz = new_sz;
- } else
- fatal("Failed to realloc firstpass stats buffer.");
- }
-
- memcpy(stats->buf_ptr, pkt, len);
- stats->buf.sz += len;
- stats->buf_ptr += len;
- }
-}
-
-vpx_fixed_buf_t stats_get(stats_io_t *stats) {
- return stats->buf;
-}
-
-/* Stereo 3D packed frame format */
-typedef enum stereo_format {
- STEREO_FORMAT_MONO = 0,
- STEREO_FORMAT_LEFT_RIGHT = 1,
- STEREO_FORMAT_BOTTOM_TOP = 2,
- STEREO_FORMAT_TOP_BOTTOM = 3,
- STEREO_FORMAT_RIGHT_LEFT = 11
-} stereo_format_t;
-
-enum video_file_type {
- FILE_TYPE_RAW,
- FILE_TYPE_IVF,
- FILE_TYPE_Y4M
-};
-
-struct detect_buffer {
- char buf[4];
- size_t buf_read;
- size_t position;
-};
-
-
-struct input_state {
- char *fn;
- FILE *file;
- off_t length;
- y4m_input y4m;
- struct detect_buffer detect;
- enum video_file_type file_type;
- unsigned int w;
- unsigned int h;
- struct vpx_rational framerate;
- int use_i420;
- int only_i420;
-};
-
-
-#define IVF_FRAME_HDR_SZ (4+8) /* 4 byte size + 8 byte timestamp */
-static int read_frame(struct input_state *input, vpx_image_t *img) {
- FILE *f = input->file;
- enum video_file_type file_type = input->file_type;
- y4m_input *y4m = &input->y4m;
- struct detect_buffer *detect = &input->detect;
- int plane = 0;
+int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) {
+ FILE *f = input_ctx->file;
+ y4m_input *y4m = &input_ctx->y4m;
int shortread = 0;
- if (file_type == FILE_TYPE_Y4M) {
+ if (input_ctx->file_type == FILE_TYPE_Y4M) {
if (y4m_input_fetch_frame(y4m, f, img) < 1)
return 0;
} else {
- if (file_type == FILE_TYPE_IVF) {
- char junk[IVF_FRAME_HDR_SZ];
-
- /* Skip the frame header. We know how big the frame should be. See
- * write_ivf_frame_header() for documentation on the frame header
- * layout.
- */
- (void) fread(junk, 1, IVF_FRAME_HDR_SZ, f);
- }
-
- for (plane = 0; plane < 3; plane++) {
- unsigned char *ptr;
- int w = (plane ? (1 + img->d_w) / 2 : img->d_w);
- int h = (plane ? (1 + img->d_h) / 2 : img->d_h);
- int r;
-
- /* Determine the correct plane based on the image format. The for-loop
- * always counts in Y,U,V order, but this may not match the order of
- * the data on disk.
- */
- switch (plane) {
- case 1:
- ptr = img->planes[img->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V : VPX_PLANE_U];
- break;
- case 2:
- ptr = img->planes[img->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_U : VPX_PLANE_V];
- break;
- default:
- ptr = img->planes[plane];
- }
-
- for (r = 0; r < h; r++) {
- size_t needed = w;
- size_t buf_position = 0;
- const size_t left = detect->buf_read - detect->position;
- if (left > 0) {
- const size_t more = (left < needed) ? left : needed;
- memcpy(ptr, detect->buf + detect->position, more);
- buf_position = more;
- needed -= more;
- detect->position += more;
- }
- if (needed > 0) {
- shortread |= (fread(ptr + buf_position, 1, needed, f) < needed);
- }
-
- ptr += img->stride[plane];
- }
- }
+ shortread = read_yuv_frame(input_ctx, img);
}
return !shortread;
}
-
-unsigned int file_is_y4m(FILE *infile,
- y4m_input *y4m,
- char detect[4]) {
+int file_is_y4m(FILE *infile, y4m_input *y4m, const char detect[4]) {
if (memcmp(detect, "YUV4", 4) == 0) {
return 1;
}
return 0;
}
-#define IVF_FILE_HDR_SZ (32)
-unsigned int file_is_ivf(struct input_state *input,
- unsigned int *fourcc) {
- char raw_hdr[IVF_FILE_HDR_SZ];
- int is_ivf = 0;
- FILE *infile = input->file;
- unsigned int *width = &input->w;
- unsigned int *height = &input->h;
- struct detect_buffer *detect = &input->detect;
-
- if (memcmp(detect->buf, "DKIF", 4) != 0)
- return 0;
-
- /* See write_ivf_file_header() for more documentation on the file header
- * layout.
- */
- if (fread(raw_hdr + 4, 1, IVF_FILE_HDR_SZ - 4, infile)
- == IVF_FILE_HDR_SZ - 4) {
- {
- is_ivf = 1;
-
- if (mem_get_le16(raw_hdr + 4) != 0)
- warn("Unrecognized IVF version! This file may not decode "
- "properly.");
-
- *fourcc = mem_get_le32(raw_hdr + 8);
- }
- }
-
- if (is_ivf) {
- *width = mem_get_le16(raw_hdr + 12);
- *height = mem_get_le16(raw_hdr + 14);
- detect->position = 4;
- }
-
- return is_ivf;
-}
-
-
-static void write_ivf_file_header(FILE *outfile,
- const vpx_codec_enc_cfg_t *cfg,
- unsigned int fourcc,
- int frame_cnt) {
- char header[32];
-
- if (cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
- return;
-
- header[0] = 'D';
- header[1] = 'K';
- header[2] = 'I';
- header[3] = 'F';
- mem_put_le16(header + 4, 0); /* version */
- mem_put_le16(header + 6, 32); /* headersize */
- mem_put_le32(header + 8, fourcc); /* headersize */
- mem_put_le16(header + 12, cfg->g_w); /* width */
- mem_put_le16(header + 14, cfg->g_h); /* height */
- mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
- mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
- mem_put_le32(header + 24, frame_cnt); /* length */
- mem_put_le32(header + 28, 0); /* unused */
-
- (void) fwrite(header, 1, 32, outfile);
-}
-
-
-static void write_ivf_frame_header(FILE *outfile,
- const vpx_codec_cx_pkt_t *pkt) {
- char header[12];
- vpx_codec_pts_t pts;
-
- if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
- return;
-
- pts = pkt->data.frame.pts;
- mem_put_le32(header, (int)pkt->data.frame.sz);
- mem_put_le32(header + 4, pts & 0xFFFFFFFF);
- mem_put_le32(header + 8, pts >> 32);
-
- (void) fwrite(header, 1, 12, outfile);
-}
-
-static void write_ivf_frame_size(FILE *outfile, size_t size) {
- char header[4];
- mem_put_le32(header, (int)size);
- (void) fwrite(header, 1, 4, outfile);
-}
-
-
-typedef off_t EbmlLoc;
-
-
-struct cue_entry {
- unsigned int time;
- uint64_t loc;
-};
-
-
-struct EbmlGlobal {
- int debug;
-
- FILE *stream;
- int64_t last_pts_ms;
- vpx_rational_t framerate;
-
- /* These pointers are to the start of an element */
- off_t position_reference;
- off_t seek_info_pos;
- off_t segment_info_pos;
- off_t track_pos;
- off_t cue_pos;
- off_t cluster_pos;
-
- /* This pointer is to a specific element to be serialized */
- off_t track_id_pos;
-
- /* These pointers are to the size field of the element */
- EbmlLoc startSegment;
- EbmlLoc startCluster;
-
- uint32_t cluster_timecode;
- int cluster_open;
-
- struct cue_entry *cue_list;
- unsigned int cues;
-
-};
-
-
-void Ebml_Write(EbmlGlobal *glob, const void *buffer_in, unsigned long len) {
- (void) fwrite(buffer_in, 1, len, glob->stream);
-}
-
-#define WRITE_BUFFER(s) \
- for(i = len-1; i>=0; i--)\
- { \
- x = (char)(*(const s *)buffer_in >> (i * CHAR_BIT)); \
- Ebml_Write(glob, &x, 1); \
- }
-void Ebml_Serialize(EbmlGlobal *glob, const void *buffer_in, int buffer_size, unsigned long len) {
- char x;
- int i;
-
- /* buffer_size:
- * 1 - int8_t;
- * 2 - int16_t;
- * 3 - int32_t;
- * 4 - int64_t;
- */
- switch (buffer_size) {
- case 1:
- WRITE_BUFFER(int8_t)
- break;
- case 2:
- WRITE_BUFFER(int16_t)
- break;
- case 4:
- WRITE_BUFFER(int32_t)
- break;
- case 8:
- WRITE_BUFFER(int64_t)
- break;
- default:
- break;
- }
-}
-#undef WRITE_BUFFER
-
-/* Need a fixed size serializer for the track ID. libmkv provides a 64 bit
- * one, but not a 32 bit one.
- */
-static void Ebml_SerializeUnsigned32(EbmlGlobal *glob, unsigned long class_id, uint64_t ui) {
- unsigned char sizeSerialized = 4 | 0x80;
- Ebml_WriteID(glob, class_id);
- Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
- Ebml_Serialize(glob, &ui, sizeof(ui), 4);
-}
-
-
-static void
-Ebml_StartSubElement(EbmlGlobal *glob, EbmlLoc *ebmlLoc,
- unsigned long class_id) {
- /* todo this is always taking 8 bytes, this may need later optimization */
- /* this is a key that says length unknown */
- uint64_t unknownLen = LITERALU64(0x01FFFFFF, 0xFFFFFFFF);
-
- Ebml_WriteID(glob, class_id);
- *ebmlLoc = ftello(glob->stream);
- Ebml_Serialize(glob, &unknownLen, sizeof(unknownLen), 8);
-}
-
-static void
-Ebml_EndSubElement(EbmlGlobal *glob, EbmlLoc *ebmlLoc) {
- off_t pos;
- uint64_t size;
-
- /* Save the current stream pointer */
- pos = ftello(glob->stream);
-
- /* Calculate the size of this element */
- size = pos - *ebmlLoc - 8;
- size |= LITERALU64(0x01000000, 0x00000000);
-
- /* Seek back to the beginning of the element and write the new size */
- fseeko(glob->stream, *ebmlLoc, SEEK_SET);
- Ebml_Serialize(glob, &size, sizeof(size), 8);
-
- /* Reset the stream pointer */
- fseeko(glob->stream, pos, SEEK_SET);
-}
-
-
-static void
-write_webm_seek_element(EbmlGlobal *ebml, unsigned long id, off_t pos) {
- uint64_t offset = pos - ebml->position_reference;
- EbmlLoc start;
- Ebml_StartSubElement(ebml, &start, Seek);
- Ebml_SerializeBinary(ebml, SeekID, id);
- Ebml_SerializeUnsigned64(ebml, SeekPosition, offset);
- Ebml_EndSubElement(ebml, &start);
-}
-
-
-static void
-write_webm_seek_info(EbmlGlobal *ebml) {
-
- off_t pos;
-
- /* Save the current stream pointer */
- pos = ftello(ebml->stream);
-
- if (ebml->seek_info_pos)
- fseeko(ebml->stream, ebml->seek_info_pos, SEEK_SET);
- else
- ebml->seek_info_pos = pos;
-
- {
- EbmlLoc start;
-
- Ebml_StartSubElement(ebml, &start, SeekHead);
- write_webm_seek_element(ebml, Tracks, ebml->track_pos);
- write_webm_seek_element(ebml, Cues, ebml->cue_pos);
- write_webm_seek_element(ebml, Info, ebml->segment_info_pos);
- Ebml_EndSubElement(ebml, &start);
- }
- {
- /* segment info */
- EbmlLoc startInfo;
- uint64_t frame_time;
- char version_string[64];
-
- /* Assemble version string */
- if (ebml->debug)
- strcpy(version_string, "vpxenc");
- else {
- strcpy(version_string, "vpxenc ");
- strncat(version_string,
- vpx_codec_version_str(),
- sizeof(version_string) - 1 - strlen(version_string));
- }
-
- frame_time = (uint64_t)1000 * ebml->framerate.den
- / ebml->framerate.num;
- ebml->segment_info_pos = ftello(ebml->stream);
- Ebml_StartSubElement(ebml, &startInfo, Info);
- Ebml_SerializeUnsigned(ebml, TimecodeScale, 1000000);
- Ebml_SerializeFloat(ebml, Segment_Duration,
- (double)(ebml->last_pts_ms + frame_time));
- Ebml_SerializeString(ebml, 0x4D80, version_string);
- Ebml_SerializeString(ebml, 0x5741, version_string);
- Ebml_EndSubElement(ebml, &startInfo);
- }
-}
-
-
-static void
-write_webm_file_header(EbmlGlobal *glob,
- const vpx_codec_enc_cfg_t *cfg,
- const struct vpx_rational *fps,
- stereo_format_t stereo_fmt,
- unsigned int fourcc) {
- {
- EbmlLoc start;
- Ebml_StartSubElement(glob, &start, EBML);
- Ebml_SerializeUnsigned(glob, EBMLVersion, 1);
- Ebml_SerializeUnsigned(glob, EBMLReadVersion, 1);
- Ebml_SerializeUnsigned(glob, EBMLMaxIDLength, 4);
- Ebml_SerializeUnsigned(glob, EBMLMaxSizeLength, 8);
- Ebml_SerializeString(glob, DocType, "webm");
- Ebml_SerializeUnsigned(glob, DocTypeVersion, 2);
- Ebml_SerializeUnsigned(glob, DocTypeReadVersion, 2);
- Ebml_EndSubElement(glob, &start);
- }
- {
- Ebml_StartSubElement(glob, &glob->startSegment, Segment);
- glob->position_reference = ftello(glob->stream);
- glob->framerate = *fps;
- write_webm_seek_info(glob);
-
- {
- EbmlLoc trackStart;
- glob->track_pos = ftello(glob->stream);
- Ebml_StartSubElement(glob, &trackStart, Tracks);
- {
- unsigned int trackNumber = 1;
- uint64_t trackID = 0;
-
- EbmlLoc start;
- Ebml_StartSubElement(glob, &start, TrackEntry);
- Ebml_SerializeUnsigned(glob, TrackNumber, trackNumber);
- glob->track_id_pos = ftello(glob->stream);
- Ebml_SerializeUnsigned32(glob, TrackUID, trackID);
- Ebml_SerializeUnsigned(glob, TrackType, 1);
- Ebml_SerializeString(glob, CodecID,
- fourcc == VP8_FOURCC ? "V_VP8" : "V_VP9");
- {
- unsigned int pixelWidth = cfg->g_w;
- unsigned int pixelHeight = cfg->g_h;
-
- EbmlLoc videoStart;
- Ebml_StartSubElement(glob, &videoStart, Video);
- Ebml_SerializeUnsigned(glob, PixelWidth, pixelWidth);
- Ebml_SerializeUnsigned(glob, PixelHeight, pixelHeight);
- Ebml_SerializeUnsigned(glob, StereoMode, stereo_fmt);
- Ebml_EndSubElement(glob, &videoStart);
- }
- Ebml_EndSubElement(glob, &start); /* Track Entry */
- }
- Ebml_EndSubElement(glob, &trackStart);
- }
- /* segment element is open */
- }
-}
-
-
-static void
-write_webm_block(EbmlGlobal *glob,
- const vpx_codec_enc_cfg_t *cfg,
- const vpx_codec_cx_pkt_t *pkt) {
- unsigned long block_length;
- unsigned char track_number;
- unsigned short block_timecode = 0;
- unsigned char flags;
- int64_t pts_ms;
- int start_cluster = 0, is_keyframe;
-
- /* Calculate the PTS of this frame in milliseconds */
- pts_ms = pkt->data.frame.pts * 1000
- * (uint64_t)cfg->g_timebase.num / (uint64_t)cfg->g_timebase.den;
- if (pts_ms <= glob->last_pts_ms)
- pts_ms = glob->last_pts_ms + 1;
- glob->last_pts_ms = pts_ms;
-
- /* Calculate the relative time of this block */
- if (pts_ms - glob->cluster_timecode > SHRT_MAX)
- start_cluster = 1;
- else
- block_timecode = (unsigned short)pts_ms - glob->cluster_timecode;
-
- is_keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY);
- if (start_cluster || is_keyframe) {
- if (glob->cluster_open)
- Ebml_EndSubElement(glob, &glob->startCluster);
-
- /* Open the new cluster */
- block_timecode = 0;
- glob->cluster_open = 1;
- glob->cluster_timecode = (uint32_t)pts_ms;
- glob->cluster_pos = ftello(glob->stream);
- Ebml_StartSubElement(glob, &glob->startCluster, Cluster); /* cluster */
- Ebml_SerializeUnsigned(glob, Timecode, glob->cluster_timecode);
-
- /* Save a cue point if this is a keyframe. */
- if (is_keyframe) {
- struct cue_entry *cue, *new_cue_list;
-
- new_cue_list = realloc(glob->cue_list,
- (glob->cues + 1) * sizeof(struct cue_entry));
- if (new_cue_list)
- glob->cue_list = new_cue_list;
- else
- fatal("Failed to realloc cue list.");
-
- cue = &glob->cue_list[glob->cues];
- cue->time = glob->cluster_timecode;
- cue->loc = glob->cluster_pos;
- glob->cues++;
- }
- }
-
- /* Write the Simple Block */
- Ebml_WriteID(glob, SimpleBlock);
-
- block_length = (unsigned long)pkt->data.frame.sz + 4;
- block_length |= 0x10000000;
- Ebml_Serialize(glob, &block_length, sizeof(block_length), 4);
-
- track_number = 1;
- track_number |= 0x80;
- Ebml_Write(glob, &track_number, 1);
-
- Ebml_Serialize(glob, &block_timecode, sizeof(block_timecode), 2);
-
- flags = 0;
- if (is_keyframe)
- flags |= 0x80;
- if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE)
- flags |= 0x08;
- Ebml_Write(glob, &flags, 1);
-
- Ebml_Write(glob, pkt->data.frame.buf, (unsigned long)pkt->data.frame.sz);
-}
-
-
-static void
-write_webm_file_footer(EbmlGlobal *glob, long hash) {
-
- if (glob->cluster_open)
- Ebml_EndSubElement(glob, &glob->startCluster);
-
- {
- EbmlLoc start;
- unsigned int i;
-
- glob->cue_pos = ftello(glob->stream);
- Ebml_StartSubElement(glob, &start, Cues);
- for (i = 0; i < glob->cues; i++) {
- struct cue_entry *cue = &glob->cue_list[i];
- EbmlLoc start;
-
- Ebml_StartSubElement(glob, &start, CuePoint);
- {
- EbmlLoc start;
-
- Ebml_SerializeUnsigned(glob, CueTime, cue->time);
-
- Ebml_StartSubElement(glob, &start, CueTrackPositions);
- Ebml_SerializeUnsigned(glob, CueTrack, 1);
- Ebml_SerializeUnsigned64(glob, CueClusterPosition,
- cue->loc - glob->position_reference);
- Ebml_EndSubElement(glob, &start);
- }
- Ebml_EndSubElement(glob, &start);
- }
- Ebml_EndSubElement(glob, &start);
- }
-
- Ebml_EndSubElement(glob, &glob->startSegment);
-
- /* Patch up the seek info block */
- write_webm_seek_info(glob);
-
- /* Patch up the track id */
- fseeko(glob->stream, glob->track_id_pos, SEEK_SET);
- Ebml_SerializeUnsigned32(glob, TrackUID, glob->debug ? 0xDEADBEEF : hash);
-
- fseeko(glob->stream, 0, SEEK_END);
-}
-
/* Murmur hash derived from public domain reference implementation at
* http:// sites.google.com/site/murmurhash/
@@ -914,24 +181,7 @@ static unsigned int murmur(const void *key, int len, unsigned int seed) {
return h;
}
-#include "math.h"
-#define MAX_PSNR 100
-static double vp8_mse2psnr(double Samples, double Peak, double Mse) {
- double psnr;
-
- if ((double)Mse > 0.0)
- psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
- else
- psnr = MAX_PSNR; /* Limit to prevent / 0 */
- if (psnr > MAX_PSNR)
- psnr = MAX_PSNR;
-
- return psnr;
-}
-
-
-#include "args.h"
static const arg_def_t debugmode = ARG_DEF("D", "debug", 0,
"Debug mode (makes output deterministic)");
static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
@@ -966,11 +216,7 @@ static const arg_def_t verbosearg = ARG_DEF("v", "verbose", 0,
"Show encoder parameters");
static const arg_def_t psnrarg = ARG_DEF(NULL, "psnr", 0,
"Show PSNR in status line");
-enum TestDecodeFatality {
- TEST_DECODE_OFF,
- TEST_DECODE_FATAL,
- TEST_DECODE_WARN,
-};
+
static const struct arg_enum_list test_decode_enum[] = {
{"off", TEST_DECODE_OFF},
{"fatal", TEST_DECODE_FATAL},
@@ -990,11 +236,19 @@ static const arg_def_t q_hist_n = ARG_DEF(NULL, "q-hist", 1,
"Show quantizer histogram (n-buckets)");
static const arg_def_t rate_hist_n = ARG_DEF(NULL, "rate-hist", 1,
"Show rate histogram (n-buckets)");
+static const arg_def_t disable_warnings =
+ ARG_DEF(NULL, "disable-warnings", 0,
+ "Disable warnings about potentially incorrect encode settings.");
+static const arg_def_t disable_warning_prompt =
+ ARG_DEF("y", "disable-warning-prompt", 0,
+ "Display warnings, but do not prompt user to continue.");
+
static const arg_def_t *main_args[] = {
&debugmode,
&outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &skip,
&deadline, &best_dl, &good_dl, &rt_dl,
- &quietarg, &verbosearg, &psnrarg, &use_ivf, &out_part, &q_hist_n, &rate_hist_n,
+ &quietarg, &verbosearg, &psnrarg, &use_ivf, &out_part, &q_hist_n,
+ &rate_hist_n, &disable_warnings, &disable_warning_prompt,
NULL
};
@@ -1132,6 +386,9 @@ static const arg_def_t lossless = ARG_DEF(NULL, "lossless", 1, "Lossless mode");
#if CONFIG_VP9_ENCODER
static const arg_def_t frame_parallel_decoding = ARG_DEF(
NULL, "frame-parallel", 1, "Enable frame parallel decodability features");
+static const arg_def_t aq_mode = ARG_DEF(
+ NULL, "aq-mode", 1,
+    "Adaptive q mode (0: off (by default), 1: variance, 2: complexity)");
#endif
#if CONFIG_VP8_ENCODER
@@ -1156,7 +413,7 @@ static const arg_def_t *vp9_args[] = {
&cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh,
&tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
&tune_ssim, &cq_level, &max_intra_rate_pct, &lossless,
- &frame_parallel_decoding,
+ &frame_parallel_decoding, &aq_mode,
NULL
};
static const int vp9_arg_ctrl_map[] = {
@@ -1165,14 +422,14 @@ static const int vp9_arg_ctrl_map[] = {
VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS,
VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE,
VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT,
- VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING,
+ VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
0
};
#endif
static const arg_def_t *no_args[] = { NULL };
-static void usage_exit() {
+void usage_exit() {
int i;
fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n",
@@ -1385,9 +642,9 @@ struct rate_hist {
};
-static void init_rate_histogram(struct rate_hist *hist,
+static void init_rate_histogram(struct rate_hist *hist,
const vpx_codec_enc_cfg_t *cfg,
- const vpx_rational_t *fps) {
+ const vpx_rational_t *fps) {
int i;
/* Determine the number of samples in the buffer. Use the file's framerate
@@ -1605,29 +862,6 @@ static int compare_img(vpx_image_t *img1, vpx_image_t *img2)
NELEMENTS(vp9_arg_ctrl_map))
#endif
-/* Configuration elements common to all streams */
-struct global_config {
- const struct codec_item *codec;
- int passes;
- int pass;
- int usage;
- int deadline;
- int use_i420;
- int quiet;
- int verbose;
- int limit;
- int skip_frames;
- int show_psnr;
- enum TestDecodeFatality test_decode;
- int have_framerate;
- struct vpx_rational framerate;
- int out_part;
- int debug;
- int show_q_hist_buckets;
- int show_rate_hist_buckets;
-};
-
-
/* Per-stream configuration */
struct stream_config {
struct vpx_codec_enc_cfg cfg;
@@ -1647,7 +881,7 @@ struct stream_state {
struct stream_config config;
FILE *file;
struct rate_hist rate_hist;
- EbmlGlobal ebml;
+ struct EbmlGlobal ebml;
uint32_t hash;
uint64_t psnr_sse_total;
uint64_t psnr_samples_total;
@@ -1680,7 +914,7 @@ void validate_positive_rational(const char *msg,
}
-static void parse_global_config(struct global_config *global, char **argv) {
+static void parse_global_config(struct VpxEncoderConfig *global, char **argv) {
char **argi, **argj;
struct arg arg;
@@ -1757,6 +991,10 @@ static void parse_global_config(struct global_config *global, char **argv) {
global->show_q_hist_buckets = arg_parse_uint(&arg);
else if (arg_match(&arg, &rate_hist_n, argi))
global->show_rate_hist_buckets = arg_parse_uint(&arg);
+ else if (arg_match(&arg, &disable_warnings, argi))
+ global->disable_warnings = 1;
+ else if (arg_match(&arg, &disable_warning_prompt, argi))
+ global->disable_warning_prompt = 1;
else
argj++;
}
@@ -1783,12 +1021,10 @@ static void parse_global_config(struct global_config *global, char **argv) {
}
-void open_input_file(struct input_state *input) {
- unsigned int fourcc;
-
+void open_input_file(struct VpxInputContext *input) {
/* Parse certain options from the input file, if possible */
- input->file = strcmp(input->fn, "-") ? fopen(input->fn, "rb")
- : set_binary_mode(stdin);
+ input->file = strcmp(input->filename, "-")
+ ? fopen(input->filename, "rb") : set_binary_mode(stdin);
if (!input->file)
fatal("Failed to open input file");
@@ -1812,39 +1048,29 @@ void open_input_file(struct input_state *input) {
if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
input->only_i420) >= 0) {
input->file_type = FILE_TYPE_Y4M;
- input->w = input->y4m.pic_w;
- input->h = input->y4m.pic_h;
- input->framerate.num = input->y4m.fps_n;
- input->framerate.den = input->y4m.fps_d;
+ input->width = input->y4m.pic_w;
+ input->height = input->y4m.pic_h;
+ input->framerate.numerator = input->y4m.fps_n;
+ input->framerate.denominator = input->y4m.fps_d;
input->use_i420 = 0;
} else
fatal("Unsupported Y4M stream.");
- } else if (input->detect.buf_read == 4 && file_is_ivf(input, &fourcc)) {
- input->file_type = FILE_TYPE_IVF;
- switch (fourcc) {
- case 0x32315659:
- input->use_i420 = 0;
- break;
- case 0x30323449:
- input->use_i420 = 1;
- break;
- default:
- fatal("Unsupported fourcc (%08x) in IVF", fourcc);
- }
+ } else if (input->detect.buf_read == 4 && file_is_ivf(input)) {
+ fatal("IVF is not supported as input.");
} else {
input->file_type = FILE_TYPE_RAW;
}
}
-static void close_input_file(struct input_state *input) {
+static void close_input_file(struct VpxInputContext *input) {
fclose(input->file);
if (input->file_type == FILE_TYPE_Y4M)
y4m_input_close(&input->y4m);
}
-static struct stream_state *new_stream(struct global_config *global,
- struct stream_state *prev) {
+static struct stream_state *new_stream(struct VpxEncoderConfig *global,
+ struct stream_state *prev) {
struct stream_state *stream;
stream = calloc(1, sizeof(*stream));
@@ -1892,7 +1118,7 @@ static struct stream_state *new_stream(struct global_config *global,
}
-static int parse_stream_params(struct global_config *global,
+static int parse_stream_params(struct VpxEncoderConfig *global,
struct stream_state *stream,
char **argv) {
char **argi, **argj;
@@ -2038,14 +1264,13 @@ static int parse_stream_params(struct global_config *global,
}
-#define FOREACH_STREAM(func)\
- do\
- {\
- struct stream_state *stream;\
- \
- for(stream = streams; stream; stream = stream->next)\
- func;\
- }while(0)
+#define FOREACH_STREAM(func) \
+ do { \
+ struct stream_state *stream; \
+ for (stream = streams; stream; stream = stream->next) { \
+ func; \
+ } \
+ } while (0)
static void validate_stream_config(struct stream_state *stream) {
@@ -2097,8 +1322,8 @@ static void set_stream_dimensions(struct stream_state *stream,
}
-static void set_default_kf_interval(struct stream_state *stream,
- struct global_config *global) {
+static void set_default_kf_interval(struct stream_state *stream,
+ struct VpxEncoderConfig *global) {
/* Use a max keyframe interval of 5 seconds, if none was
* specified on the command line.
*/
@@ -2110,9 +1335,9 @@ static void set_default_kf_interval(struct stream_state *stream,
}
-static void show_stream_config(struct stream_state *stream,
- struct global_config *global,
- struct input_state *input) {
+static void show_stream_config(struct stream_state *stream,
+ struct VpxEncoderConfig *global,
+ struct VpxInputContext *input) {
#define SHOW(field) \
fprintf(stderr, " %-28s = %d\n", #field, stream->config.cfg.field)
@@ -2120,7 +1345,7 @@ static void show_stream_config(struct stream_state *stream,
if (stream->index == 0) {
fprintf(stderr, "Codec: %s\n",
vpx_codec_iface_name(global->codec->iface()));
- fprintf(stderr, "Source file: %s Format: %s\n", input->fn,
+ fprintf(stderr, "Source file: %s Format: %s\n", input->filename,
input->use_i420 ? "I420" : "YV12");
}
if (stream->next || stream->index)
@@ -2161,7 +1386,7 @@ static void show_stream_config(struct stream_state *stream,
static void open_output_file(struct stream_state *stream,
- struct global_config *global) {
+ struct VpxEncoderConfig *global) {
const char *fn = stream->config.out_fn;
stream->file = strcmp(fn, "-") ? fopen(fn, "wb") : set_binary_mode(stdout);
@@ -2179,7 +1404,7 @@ static void open_output_file(struct stream_state *stream,
stream->config.stereo_fmt,
global->codec->fourcc);
} else
- write_ivf_file_header(stream->file, &stream->config.cfg,
+ ivf_write_file_header(stream->file, &stream->config.cfg,
global->codec->fourcc, 0);
}
@@ -2192,7 +1417,7 @@ static void close_output_file(struct stream_state *stream,
stream->ebml.cue_list = NULL;
} else {
if (!fseek(stream->file, 0, SEEK_SET))
- write_ivf_file_header(stream->file, &stream->config.cfg,
+ ivf_write_file_header(stream->file, &stream->config.cfg,
fourcc,
stream->frames_out);
}
@@ -2201,9 +1426,9 @@ static void close_output_file(struct stream_state *stream,
}
-static void setup_pass(struct stream_state *stream,
- struct global_config *global,
- int pass) {
+static void setup_pass(struct stream_state *stream,
+ struct VpxEncoderConfig *global,
+ int pass) {
if (stream->config.stats_fn) {
if (!stats_open_file(&stream->stats, stream->config.stats_fn,
pass))
@@ -2225,8 +1450,8 @@ static void setup_pass(struct stream_state *stream,
}
-static void initialize_encoder(struct stream_state *stream,
- struct global_config *global) {
+static void initialize_encoder(struct stream_state *stream,
+ struct VpxEncoderConfig *global) {
int i;
int flags = 0;
@@ -2260,10 +1485,10 @@ static void initialize_encoder(struct stream_state *stream,
}
-static void encode_frame(struct stream_state *stream,
- struct global_config *global,
- struct vpx_image *img,
- unsigned int frames_in) {
+static void encode_frame(struct stream_state *stream,
+ struct VpxEncoderConfig *global,
+ struct vpx_image *img,
+ unsigned int frames_in) {
vpx_codec_pts_t frame_start, next_frame_start;
struct vpx_codec_enc_cfg *cfg = &stream->config.cfg;
struct vpx_usec_timer timer;
@@ -2318,9 +1543,9 @@ static void update_quantizer_histogram(struct stream_state *stream) {
}
-static void get_cx_data(struct stream_state *stream,
- struct global_config *global,
- int *got_data) {
+static void get_cx_data(struct stream_state *stream,
+ struct VpxEncoderConfig *global,
+ int *got_data) {
const vpx_codec_cx_pkt_t *pkt;
const struct vpx_codec_enc_cfg *cfg = &stream->config.cfg;
vpx_codec_iter_t iter = NULL;
@@ -2352,14 +1577,14 @@ static void get_cx_data(struct stream_state *stream,
ivf_header_pos = ftello(stream->file);
fsize = pkt->data.frame.sz;
- write_ivf_frame_header(stream->file, pkt);
+ ivf_write_frame_header(stream->file, pkt);
} else {
fsize += pkt->data.frame.sz;
if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {
off_t currpos = ftello(stream->file);
fseeko(stream->file, ivf_header_pos, SEEK_SET);
- write_ivf_frame_size(stream->file, fsize);
+ ivf_write_frame_size(stream->file, fsize);
fseeko(stream->file, currpos, SEEK_SET);
}
}
@@ -2512,18 +1737,19 @@ static void print_time(const char *label, int64_t etl) {
}
}
+
int main(int argc, const char **argv_) {
- int pass;
- vpx_image_t raw;
- int frame_avail, got_data;
-
- struct input_state input = {0};
- struct global_config global;
- struct stream_state *streams = NULL;
- char **argv, **argi;
- uint64_t cx_time = 0;
- int stream_cnt = 0;
- int res = 0;
+ int pass;
+ vpx_image_t raw;
+ int frame_avail, got_data;
+
+ struct VpxInputContext input = {0};
+ struct VpxEncoderConfig global;
+ struct stream_state *streams = NULL;
+ char **argv, **argi;
+ uint64_t cx_time = 0;
+ int stream_cnt = 0;
+ int res = 0;
exec_name = argv_[0];
@@ -2531,8 +1757,8 @@ int main(int argc, const char **argv_) {
usage_exit();
/* Setup default input stream settings */
- input.framerate.num = 30;
- input.framerate.den = 1;
+ input.framerate.numerator = 30;
+ input.framerate.denominator = 1;
input.use_i420 = 1;
input.only_i420 = 1;
@@ -2543,6 +1769,7 @@ int main(int argc, const char **argv_) {
argv = argv_dup(argc - 1, argv_ + 1);
parse_global_config(&global, argv);
+
{
/* Now parse each stream's parameters. Using a local scope here
* due to the use of 'stream' as loop variable in FOREACH_STREAM
@@ -2563,10 +1790,13 @@ int main(int argc, const char **argv_) {
if (argi[0][0] == '-' && argi[0][1])
die("Error: Unrecognized option %s\n", *argi);
+ FOREACH_STREAM(check_encoder_config(global.disable_warning_prompt,
+ &global, &stream->config.cfg););
+
/* Handle non-option arguments */
- input.fn = argv[0];
+ input.filename = argv[0];
- if (!input.fn)
+ if (!input.filename)
usage_exit();
#if CONFIG_NON420
@@ -2586,20 +1816,20 @@ int main(int argc, const char **argv_) {
/* If the input file doesn't specify its w/h (raw files), try to get
* the data from the first stream's configuration.
*/
- if (!input.w || !input.h)
+ if (!input.width || !input.height)
FOREACH_STREAM( {
if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
- input.w = stream->config.cfg.g_w;
- input.h = stream->config.cfg.g_h;
+ input.width = stream->config.cfg.g_w;
+ input.height = stream->config.cfg.g_h;
break;
}
});
/* Update stream configurations from the input file's parameters */
- if (!input.w || !input.h)
+ if (!input.width || !input.height)
fatal("Specify stream dimensions with --width (-w) "
" and --height (-h)");
- FOREACH_STREAM(set_stream_dimensions(stream, input.w, input.h));
+ FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height));
FOREACH_STREAM(validate_stream_config(stream));
/* Ensure that --passes and --pass are consistent. If --pass is set and
@@ -2615,8 +1845,10 @@ int main(int argc, const char **argv_) {
/* Use the frame rate from the file only if none was specified
* on the command-line.
*/
- if (!global.have_framerate)
- global.framerate = input.framerate;
+ if (!global.have_framerate) {
+ global.framerate.num = input.framerate.numerator;
+ global.framerate.den = input.framerate.denominator;
+ }
FOREACH_STREAM(set_default_kf_interval(stream, &global));
@@ -2634,7 +1866,7 @@ int main(int argc, const char **argv_) {
vpx_img_alloc(&raw,
input.use_i420 ? VPX_IMG_FMT_I420
: VPX_IMG_FMT_YV12,
- input.w, input.h, 32);
+ input.width, input.height, 32);
FOREACH_STREAM(init_rate_histogram(&stream->rate_hist,
&stream->config.cfg,
diff --git a/source/libvpx/vpxenc.h b/source/libvpx/vpxenc.h
new file mode 100644
index 0000000..5cb3f85
--- /dev/null
+++ b/source/libvpx/vpxenc.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPXENC_H_
+#define VPXENC_H_
+
+#include "vpx/vpx_encoder.h"
+
+enum TestDecodeFatality {
+ TEST_DECODE_OFF,
+ TEST_DECODE_FATAL,
+ TEST_DECODE_WARN,
+};
+
+/* Configuration elements common to all streams. */
+struct VpxEncoderConfig {
+ const struct codec_item *codec;
+ int passes;
+ int pass;
+ int usage;
+ int deadline;
+ int use_i420;
+ int quiet;
+ int verbose;
+ int limit;
+ int skip_frames;
+ int show_psnr;
+ enum TestDecodeFatality test_decode;
+ int have_framerate;
+ struct vpx_rational framerate;
+ int out_part;
+ int debug;
+ int show_q_hist_buckets;
+ int show_rate_hist_buckets;
+ int disable_warnings;
+ int disable_warning_prompt;
+};
+
+#endif // VPXENC_H_
diff --git a/source/libvpx/vpxstats.c b/source/libvpx/vpxstats.c
new file mode 100644
index 0000000..70cea3e
--- /dev/null
+++ b/source/libvpx/vpxstats.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpxstats.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "./tools_common.h"
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass) {
+ int res;
+ stats->pass = pass;
+
+ if (pass == 0) {
+ stats->file = fopen(fpf, "wb");
+ stats->buf.sz = 0;
+ stats->buf.buf = NULL;
+ res = (stats->file != NULL);
+ } else {
+#if USE_POSIX_MMAP
+ struct stat stat_buf;
+ int fd;
+
+ fd = open(fpf, O_RDONLY);
+ stats->file = fdopen(fd, "rb");
+ fstat(fd, &stat_buf);
+ stats->buf.sz = stat_buf.st_size;
+ stats->buf.buf = mmap(NULL, stats->buf.sz, PROT_READ, MAP_PRIVATE, fd, 0);
+ res = (stats->buf.buf != NULL);
+#else
+ size_t nbytes;
+
+ stats->file = fopen(fpf, "rb");
+
+ if (fseek(stats->file, 0, SEEK_END))
+ fatal("First-pass stats file must be seekable!");
+
+ stats->buf.sz = stats->buf_alloc_sz = ftell(stats->file);
+ rewind(stats->file);
+
+ stats->buf.buf = malloc(stats->buf_alloc_sz);
+
+ if (!stats->buf.buf)
+ fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+          (unsigned long)stats->buf_alloc_sz);
+
+ nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
+ res = (nbytes == stats->buf.sz);
+#endif /* USE_POSIX_MMAP */
+ }
+
+ return res;
+}
+
+int stats_open_mem(stats_io_t *stats, int pass) {
+ int res;
+ stats->pass = pass;
+
+ if (!pass) {
+ stats->buf.sz = 0;
+ stats->buf_alloc_sz = 64 * 1024;
+ stats->buf.buf = malloc(stats->buf_alloc_sz);
+ }
+
+ stats->buf_ptr = stats->buf.buf;
+ res = (stats->buf.buf != NULL);
+ return res;
+}
+
+void stats_close(stats_io_t *stats, int last_pass) {
+ if (stats->file) {
+ if (stats->pass == last_pass) {
+#if USE_POSIX_MMAP
+ munmap(stats->buf.buf, stats->buf.sz);
+#else
+ free(stats->buf.buf);
+#endif /* USE_POSIX_MMAP */
+ }
+
+ fclose(stats->file);
+ stats->file = NULL;
+ } else {
+ if (stats->pass == last_pass)
+ free(stats->buf.buf);
+ }
+}
+
+void stats_write(stats_io_t *stats, const void *pkt, size_t len) {
+ if (stats->file) {
+ (void) fwrite(pkt, 1, len, stats->file);
+ } else {
+ if (stats->buf.sz + len > stats->buf_alloc_sz) {
+ size_t new_sz = stats->buf_alloc_sz + 64 * 1024;
+ char *new_ptr = realloc(stats->buf.buf, new_sz);
+
+ if (new_ptr) {
+ stats->buf_ptr = new_ptr + (stats->buf_ptr - (char *)stats->buf.buf);
+ stats->buf.buf = new_ptr;
+ stats->buf_alloc_sz = new_sz;
+ } else {
+ fatal("Failed to realloc firstpass stats buffer.");
+ }
+ }
+
+ memcpy(stats->buf_ptr, pkt, len);
+ stats->buf.sz += len;
+ stats->buf_ptr += len;
+ }
+}
+
+vpx_fixed_buf_t stats_get(stats_io_t *stats) {
+ return stats->buf;
+}
+
+double vp8_mse2psnr(double samples, double peak, double mse) {
+ const int kMaxPSNR = 100;
+ double psnr = kMaxPSNR;
+
+ if (mse > 0.0)
+ psnr = 10.0 * log10(peak * peak * samples / mse);
+
+ if (psnr > kMaxPSNR)
+ psnr = kMaxPSNR;
+
+ return psnr;
+}
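
A quick worked example, assuming 8-bit content (peak 255) and that the third argument is an accumulated squared error while the first is the sample count, as the psnr_sse_total / psnr_samples_total fields above suggest; the `example_psnr` wrapper and the 640x480 frame size are illustrative only:

#include "./vpxstats.h"

/* 10 * log10(255 * 255 * samples / error) with error == samples reduces to
 * 10 * log10(65025), roughly 48.13 dB; a zero error is clamped to the
 * 100 dB ceiling instead of dividing by zero. */
static void example_psnr(void) {
  const double samples = 640.0 * 480.0;
  const double unit_error = vp8_mse2psnr(samples, 255.0, samples);  /* ~48.13 */
  const double no_error = vp8_mse2psnr(samples, 255.0, 0.0);        /* 100.0 */
  (void)unit_error;
  (void)no_error;
}
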
diff --git a/source/libvpx/vpxstats.h b/source/libvpx/vpxstats.h
new file mode 100644
index 0000000..18b3acd
--- /dev/null
+++ b/source/libvpx/vpxstats.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPXSTATS_H_
+#define VPXSTATS_H_
+
+#include <stdio.h>
+
+#include "vpx/vpx_encoder.h"
+
+/* This structure is used to abstract the different ways of handling
+ * first pass statistics
+ */
+typedef struct {
+ vpx_fixed_buf_t buf;
+ int pass;
+ FILE *file;
+ char *buf_ptr;
+ size_t buf_alloc_sz;
+} stats_io_t;
+
+int stats_open_file(stats_io_t *stats, const char *fpf, int pass);
+int stats_open_mem(stats_io_t *stats, int pass);
+void stats_close(stats_io_t *stats, int last_pass);
+void stats_write(stats_io_t *stats, const void *pkt, size_t len);
+vpx_fixed_buf_t stats_get(stats_io_t *stats);
+
+double vp8_mse2psnr(double samples, double peak, double mse);
+
+#endif // VPXSTATS_H_
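
A minimal sketch, not part of the change, of how this first-pass stats API can drive a two-pass encode; the file name, the single-packet source, and the zeroed config are illustrative assumptions:

#include <string.h>

#include "./vpxstats.h"
#include "vpx/vpx_encoder.h"

static void example_two_pass_stats(const vpx_codec_cx_pkt_t *pkt) {
  stats_io_t stats;

  /* Pass 0: append each VPX_CODEC_STATS_PKT payload as the encoder emits it. */
  if (stats_open_file(&stats, "example.fpf", 0)) {
    if (pkt->kind == VPX_CODEC_STATS_PKT)
      stats_write(&stats, pkt->data.twopass_stats.buf,
                  pkt->data.twopass_stats.sz);
    stats_close(&stats, 1);  /* pass 0 != last pass, so only the file closes */
  }

  /* Pass 1: reload the stats and hand the whole buffer to the encoder. */
  if (stats_open_file(&stats, "example.fpf", 1)) {
    vpx_codec_enc_cfg_t cfg;
    memset(&cfg, 0, sizeof(cfg));  /* vpxenc starts from the codec defaults */
    cfg.rc_twopass_stats_in = stats_get(&stats);
    /* ... run the second encoding pass here ... */
    stats_close(&stats, 1);  /* last pass: the stats buffer is freed */
  }
}
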
diff --git a/source/libvpx/warnings.c b/source/libvpx/warnings.c
new file mode 100644
index 0000000..f76d706
--- /dev/null
+++ b/source/libvpx/warnings.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./warnings.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_encoder.h"
+
+#include "./tools_common.h"
+#include "./vpxenc.h"
+
+static const char quantizer_warning_string[] =
+ "Bad quantizer values. Quantizer values should not be equal, and should "
+ "differ by at least 8.";
+static const char lag_in_frames_with_realtime[] =
+ "Lag in frames is ignored when deadline is set to realtime.";
+
+struct WarningListNode {
+ const char *warning_string;
+ struct WarningListNode *next_warning;
+};
+
+struct WarningList {
+ struct WarningListNode *warning_node;
+};
+
+static void add_warning(const char *warning_string,
+ struct WarningList *warning_list) {
+ struct WarningListNode **node = &warning_list->warning_node;
+
+ struct WarningListNode *new_node = malloc(sizeof(*new_node));
+ if (new_node == NULL) {
+ fatal("Unable to allocate warning node.");
+ }
+
+ new_node->warning_string = warning_string;
+ new_node->next_warning = NULL;
+
+ while (*node != NULL)
+ node = &(*node)->next_warning;
+
+ *node = new_node;
+}
+
+static void free_warning_list(struct WarningList *warning_list) {
+ struct WarningListNode *node = warning_list->warning_node;
+ while (warning_list->warning_node != NULL) {
+ node = warning_list->warning_node->next_warning;
+ free(warning_list->warning_node);
+ warning_list->warning_node = node;
+ }
+}
+
+static int continue_prompt(int num_warnings) {
+ int c;
+ fprintf(stderr,
+ "%d encoder configuration warning(s). Continue? (y to continue) ",
+ num_warnings);
+ c = getchar();
+ return c == 'y';
+}
+
+static void check_lag_in_frames_realtime_deadline(
+ int lag_in_frames,
+ int deadline,
+ struct WarningList *warning_list) {
+ if (deadline == VPX_DL_REALTIME && lag_in_frames != 0)
+ add_warning(lag_in_frames_with_realtime, warning_list);
+}
+
+static void check_quantizer(int min_q, int max_q,
+ struct WarningList *warning_list) {
+ const int lossless = min_q == 0 && max_q == 0;
+ if (!lossless && (min_q == max_q || abs(max_q - min_q) < 8))
+ add_warning(quantizer_warning_string, warning_list);
+}
+
+void check_encoder_config(int disable_prompt,
+ const struct VpxEncoderConfig *global_config,
+ const struct vpx_codec_enc_cfg *stream_config) {
+ int num_warnings = 0;
+ struct WarningListNode *warning = NULL;
+ struct WarningList warning_list = {0};
+
+ check_quantizer(stream_config->rc_min_quantizer,
+ stream_config->rc_max_quantizer,
+ &warning_list);
+ check_lag_in_frames_realtime_deadline(stream_config->g_lag_in_frames,
+ global_config->deadline,
+ &warning_list);
+
+ /* Count and print warnings. */
+ for (warning = warning_list.warning_node;
+ warning != NULL;
+ warning = warning->next_warning,
+ ++num_warnings) {
+ warn(warning->warning_string);
+ }
+
+ free_warning_list(&warning_list);
+
+ if (num_warnings) {
+ if (!disable_prompt && !continue_prompt(num_warnings))
+ exit(EXIT_FAILURE);
+ }
+}
diff --git a/source/libvpx/warnings.h b/source/libvpx/warnings.h
new file mode 100644
index 0000000..ac3a4b6
--- /dev/null
+++ b/source/libvpx/warnings.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef WARNINGS_H_
+#define WARNINGS_H_
+
+struct vpx_codec_enc_cfg;
+struct VpxEncoderConfig;
+
+/*
+ * Checks config for improperly used settings. Warns user upon encountering
+ * settings that will lead to poor output quality. Prompts user to continue
+ * when warnings are issued.
+ */
+void check_encoder_config(int disable_prompt,
+ const struct VpxEncoderConfig *global_config,
+ const struct vpx_codec_enc_cfg *stream_config);
+
+#endif // WARNINGS_H_
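
A minimal usage sketch, assuming a populated struct VpxEncoderConfig named `global` and one stream's vpx_codec_enc_cfg named `cfg`; vpxenc.c wraps the same call in FOREACH_STREAM so it runs once per stream:

#include "./vpxenc.h"
#include "./warnings.h"
#include "vpx/vpx_encoder.h"

static void example_check(const struct VpxEncoderConfig *global,
                          const struct vpx_codec_enc_cfg *cfg) {
  /* Prints each warning via warn(); unless the -y / --disable-warning-prompt
   * flag was given, it then asks whether to continue and exits on anything
   * other than 'y'. */
  check_encoder_config(global->disable_warning_prompt, global, cfg);
}
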
diff --git a/source/libvpx/webmdec.c b/source/libvpx/webmdec.c
new file mode 100644
index 0000000..0c75d7a
--- /dev/null
+++ b/source/libvpx/webmdec.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./webmdec.h"
+
+#include <stdarg.h>
+
+#include "nestegg/include/nestegg/nestegg.h"
+
+static int nestegg_read_cb(void *buffer, size_t length, void *userdata) {
+ FILE *f = userdata;
+
+ if (fread(buffer, 1, length, f) < length) {
+ if (ferror(f))
+ return -1;
+ if (feof(f))
+ return 0;
+ }
+ return 1;
+}
+
+static int nestegg_seek_cb(int64_t offset, int whence, void *userdata) {
+ switch (whence) {
+ case NESTEGG_SEEK_SET:
+ whence = SEEK_SET;
+ break;
+ case NESTEGG_SEEK_CUR:
+ whence = SEEK_CUR;
+ break;
+ case NESTEGG_SEEK_END:
+ whence = SEEK_END;
+ break;
+  }

+ return fseek(userdata, (int32_t)offset, whence) ? -1 : 0;
+}
+
+static int64_t nestegg_tell_cb(void *userdata) {
+ return ftell(userdata);
+}
+
+static void nestegg_log_cb(nestegg *context,
+ unsigned int severity,
+ char const *format, ...) {
+ va_list ap;
+ va_start(ap, format);
+ vfprintf(stderr, format, ap);
+ fprintf(stderr, "\n");
+ va_end(ap);
+}
+
+int file_is_webm(struct WebmInputContext *webm_ctx,
+ struct VpxInputContext *vpx_ctx) {
+ uint32_t i, n;
+ int track_type = -1;
+ int codec_id;
+
+ nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb, 0};
+ nestegg_video_params params;
+
+ io.userdata = vpx_ctx->file;
+ if (nestegg_init(&webm_ctx->nestegg_ctx, io, NULL))
+ goto fail;
+
+ if (nestegg_track_count(webm_ctx->nestegg_ctx, &n))
+ goto fail;
+
+ for (i = 0; i < n; i++) {
+ track_type = nestegg_track_type(webm_ctx->nestegg_ctx, i);
+
+ if (track_type == NESTEGG_TRACK_VIDEO)
+ break;
+ else if (track_type < 0)
+ goto fail;
+ }
+
+ codec_id = nestegg_track_codec_id(webm_ctx->nestegg_ctx, i);
+ if (codec_id == NESTEGG_CODEC_VP8) {
+ vpx_ctx->fourcc = VP8_FOURCC_MASK;
+ } else if (codec_id == NESTEGG_CODEC_VP9) {
+ vpx_ctx->fourcc = VP9_FOURCC_MASK;
+ } else {
+ fatal("Not VPx video, quitting.\n");
+ }
+
+ webm_ctx->video_track = i;
+
+ if (nestegg_track_video_params(webm_ctx->nestegg_ctx, i, &params))
+ goto fail;
+
+ vpx_ctx->framerate.denominator = 0;
+ vpx_ctx->framerate.numerator = 0;
+ vpx_ctx->width = params.width;
+ vpx_ctx->height = params.height;
+
+ return 1;
+
+ fail:
+ webm_ctx->nestegg_ctx = NULL;
+ rewind(vpx_ctx->file);
+
+ return 0;
+}
+
+int webm_read_frame(struct WebmInputContext *webm_ctx,
+ uint8_t **buffer,
+ size_t *bytes_in_buffer,
+ size_t *buffer_size) {
+ if (webm_ctx->chunk >= webm_ctx->chunks) {
+ uint32_t track;
+
+ do {
+ /* End of this packet, get another. */
+ if (webm_ctx->pkt) {
+ nestegg_free_packet(webm_ctx->pkt);
+ webm_ctx->pkt = NULL;
+ }
+
+ if (nestegg_read_packet(webm_ctx->nestegg_ctx, &webm_ctx->pkt) <= 0 ||
+ nestegg_packet_track(webm_ctx->pkt, &track)) {
+ return 1;
+ }
+ } while (track != webm_ctx->video_track);
+
+ if (nestegg_packet_count(webm_ctx->pkt, &webm_ctx->chunks))
+ return 1;
+
+ webm_ctx->chunk = 0;
+ }
+
+ if (nestegg_packet_data(webm_ctx->pkt, webm_ctx->chunk,
+ buffer, bytes_in_buffer)) {
+ return 1;
+ }
+
+ webm_ctx->chunk++;
+ return 0;
+}
+
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+ struct VpxInputContext *vpx_ctx) {
+ uint32_t i;
+ uint64_t tstamp = 0;
+
+ /* Check to see if we can seek before we parse any data. */
+ if (nestegg_track_seek(webm_ctx->nestegg_ctx, webm_ctx->video_track, 0)) {
+ warn("Failed to guess framerate (no Cues), set to 30fps.\n");
+ vpx_ctx->framerate.numerator = 30;
+ vpx_ctx->framerate.denominator = 1;
+ return 0;
+ }
+
+ /* Guess the framerate. Read up to 1 second, or 50 video packets,
+ * whichever comes first.
+ */
+ for (i = 0; tstamp < 1000000000 && i < 50;) {
+ nestegg_packet *pkt;
+ uint32_t track;
+
+ if (nestegg_read_packet(webm_ctx->nestegg_ctx, &pkt) <= 0)
+ break;
+
+ nestegg_packet_track(pkt, &track);
+ if (track == webm_ctx->video_track) {
+ nestegg_packet_tstamp(pkt, &tstamp);
+ ++i;
+ }
+
+ nestegg_free_packet(pkt);
+ }
+
+ if (nestegg_track_seek(webm_ctx->nestegg_ctx, webm_ctx->video_track, 0))
+ goto fail;
+
+ vpx_ctx->framerate.numerator = (i - 1) * 1000000;
+ vpx_ctx->framerate.denominator = (int)(tstamp / 1000);
+ return 0;
+
+ fail:
+ nestegg_destroy(webm_ctx->nestegg_ctx);
+ webm_ctx->nestegg_ctx = NULL;
+ rewind(vpx_ctx->file);
+ return 1;
+}
+
+void webm_free(struct WebmInputContext *webm_ctx) {
+ if (webm_ctx && webm_ctx->nestegg_ctx) {
+ if (webm_ctx->pkt)
+ nestegg_free_packet(webm_ctx->pkt);
+ nestegg_destroy(webm_ctx->nestegg_ctx);
+ }
+}
diff --git a/source/libvpx/webmdec.h b/source/libvpx/webmdec.h
new file mode 100644
index 0000000..002fbe6
--- /dev/null
+++ b/source/libvpx/webmdec.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef WEBMDEC_H_
+#define WEBMDEC_H_
+
+#include "./tools_common.h"
+
+struct nestegg;
+struct nestegg_packet;
+struct VpxInputContext;
+
+struct WebmInputContext {
+ uint32_t chunk;
+ uint32_t chunks;
+ uint32_t video_track;
+ struct nestegg *nestegg_ctx;
+ struct nestegg_packet *pkt;
+};
+
+int file_is_webm(struct WebmInputContext *webm_ctx,
+ struct VpxInputContext *vpx_ctx);
+
+int webm_read_frame(struct WebmInputContext *webm_ctx,
+ uint8_t **buffer,
+ size_t *bytes_in_buffer,
+ size_t *buffer_size);
+
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+ struct VpxInputContext *vpx_ctx);
+
+void webm_free(struct WebmInputContext *webm_ctx);
+
+#endif // WEBMDEC_H_
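
An illustrative read loop, not part of the change; `infile` and the decode step are placeholders, and the returned chunk points into nestegg-owned data, so copy it if it must outlive the loop iteration:

#include <stdio.h>

#include "./webmdec.h"

static void example_webm_read(FILE *infile) {
  struct VpxInputContext vpx_ctx = {0};
  struct WebmInputContext webm_ctx = {0};
  uint8_t *buf = NULL;
  size_t bytes_in_buffer = 0;
  size_t buffer_size = 0;

  vpx_ctx.file = infile;
  if (!file_is_webm(&webm_ctx, &vpx_ctx))
    return;  /* not WebM; file_is_webm() already rewound the file */

  /* Optional: fills vpx_ctx.framerate when the file has Cues. */
  webm_guess_framerate(&webm_ctx, &vpx_ctx);

  while (webm_read_frame(&webm_ctx, &buf, &bytes_in_buffer,
                         &buffer_size) == 0) {
    /* hand (buf, bytes_in_buffer) to the decoder here */
  }

  webm_free(&webm_ctx);
}
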
diff --git a/source/libvpx/webmenc.c b/source/libvpx/webmenc.c
new file mode 100644
index 0000000..17bbeec
--- /dev/null
+++ b/source/libvpx/webmenc.c
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include "webmenc.h"
+
+#include <limits.h>
+#include <string.h>
+
+#include "third_party/libmkv/EbmlWriter.h"
+#include "third_party/libmkv/EbmlIDs.h"
+
+void Ebml_Write(struct EbmlGlobal *glob,
+ const void *buffer_in,
+ unsigned long len) {
+ (void) fwrite(buffer_in, 1, len, glob->stream);
+}
+
+#define WRITE_BUFFER(s) \
+for (i = len - 1; i >= 0; i--) { \
+ x = (char)(*(const s *)buffer_in >> (i * CHAR_BIT)); \
+ Ebml_Write(glob, &x, 1); \
+}
+
+void Ebml_Serialize(struct EbmlGlobal *glob,
+ const void *buffer_in,
+ int buffer_size,
+ unsigned long len) {
+ char x;
+ int i;
+
+ /* buffer_size:
+ * 1 - int8_t;
+ * 2 - int16_t;
+   * 4 - int32_t;
+   * 8 - int64_t;
+ */
+ switch (buffer_size) {
+ case 1:
+ WRITE_BUFFER(int8_t)
+ break;
+ case 2:
+ WRITE_BUFFER(int16_t)
+ break;
+ case 4:
+ WRITE_BUFFER(int32_t)
+ break;
+ case 8:
+ WRITE_BUFFER(int64_t)
+ break;
+ default:
+ break;
+ }
+}
+#undef WRITE_BUFFER
+
+/* Need a fixed size serializer for the track ID. libmkv provides a 64 bit
+ * one, but not a 32 bit one.
+ */
+static void Ebml_SerializeUnsigned32(struct EbmlGlobal *glob,
+ unsigned int class_id,
+ uint64_t ui) {
+ const unsigned char sizeSerialized = 4 | 0x80;
+ Ebml_WriteID(glob, class_id);
+ Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
+ Ebml_Serialize(glob, &ui, sizeof(ui), 4);
+}
+
+static void Ebml_StartSubElement(struct EbmlGlobal *glob,
+ EbmlLoc *ebmlLoc,
+ unsigned int class_id) {
+ const uint64_t kEbmlUnknownLength = LITERALU64(0x01FFFFFF, 0xFFFFFFFF);
+ Ebml_WriteID(glob, class_id);
+ *ebmlLoc = ftello(glob->stream);
+ Ebml_Serialize(glob, &kEbmlUnknownLength, sizeof(kEbmlUnknownLength), 8);
+}
+
+static void Ebml_EndSubElement(struct EbmlGlobal *glob, EbmlLoc *ebmlLoc) {
+ off_t pos;
+ uint64_t size;
+
+ /* Save the current stream pointer. */
+ pos = ftello(glob->stream);
+
+ /* Calculate the size of this element. */
+ size = pos - *ebmlLoc - 8;
+ size |= LITERALU64(0x01000000, 0x00000000);
+
+ /* Seek back to the beginning of the element and write the new size. */
+ fseeko(glob->stream, *ebmlLoc, SEEK_SET);
+ Ebml_Serialize(glob, &size, sizeof(size), 8);
+
+ /* Reset the stream pointer. */
+ fseeko(glob->stream, pos, SEEK_SET);
+}
+
+void write_webm_seek_element(struct EbmlGlobal *ebml,
+ unsigned int id,
+ off_t pos) {
+ uint64_t offset = pos - ebml->position_reference;
+ EbmlLoc start;
+ Ebml_StartSubElement(ebml, &start, Seek);
+ Ebml_SerializeBinary(ebml, SeekID, id);
+ Ebml_SerializeUnsigned64(ebml, SeekPosition, offset);
+ Ebml_EndSubElement(ebml, &start);
+}
+
+void write_webm_seek_info(struct EbmlGlobal *ebml) {
+ off_t pos;
+ EbmlLoc start;
+ EbmlLoc startInfo;
+ uint64_t frame_time;
+ char version_string[64];
+
+ /* Save the current stream pointer. */
+ pos = ftello(ebml->stream);
+
+ if (ebml->seek_info_pos)
+ fseeko(ebml->stream, ebml->seek_info_pos, SEEK_SET);
+ else
+ ebml->seek_info_pos = pos;
+
+ Ebml_StartSubElement(ebml, &start, SeekHead);
+ write_webm_seek_element(ebml, Tracks, ebml->track_pos);
+ write_webm_seek_element(ebml, Cues, ebml->cue_pos);
+ write_webm_seek_element(ebml, Info, ebml->segment_info_pos);
+ Ebml_EndSubElement(ebml, &start);
+
+ /* Create and write the Segment Info. */
+ if (ebml->debug) {
+ strcpy(version_string, "vpxenc");
+ } else {
+ strcpy(version_string, "vpxenc ");
+ strncat(version_string,
+ vpx_codec_version_str(),
+ sizeof(version_string) - 1 - strlen(version_string));
+ }
+
+ frame_time = (uint64_t)1000 * ebml->framerate.den
+ / ebml->framerate.num;
+ ebml->segment_info_pos = ftello(ebml->stream);
+ Ebml_StartSubElement(ebml, &startInfo, Info);
+ Ebml_SerializeUnsigned(ebml, TimecodeScale, 1000000);
+ Ebml_SerializeFloat(ebml, Segment_Duration,
+ (double)(ebml->last_pts_ms + frame_time));
+ Ebml_SerializeString(ebml, 0x4D80, version_string);
+ Ebml_SerializeString(ebml, 0x5741, version_string);
+ Ebml_EndSubElement(ebml, &startInfo);
+}
+
+void write_webm_file_header(struct EbmlGlobal *glob,
+ const vpx_codec_enc_cfg_t *cfg,
+ const struct vpx_rational *fps,
+ stereo_format_t stereo_fmt,
+ unsigned int fourcc) {
+ EbmlLoc start;
+ EbmlLoc trackStart;
+ EbmlLoc videoStart;
+ unsigned int trackNumber = 1;
+ uint64_t trackID = 0;
+ unsigned int pixelWidth = cfg->g_w;
+ unsigned int pixelHeight = cfg->g_h;
+
+ /* Write the EBML header. */
+ Ebml_StartSubElement(glob, &start, EBML);
+ Ebml_SerializeUnsigned(glob, EBMLVersion, 1);
+ Ebml_SerializeUnsigned(glob, EBMLReadVersion, 1);
+ Ebml_SerializeUnsigned(glob, EBMLMaxIDLength, 4);
+ Ebml_SerializeUnsigned(glob, EBMLMaxSizeLength, 8);
+ Ebml_SerializeString(glob, DocType, "webm");
+ Ebml_SerializeUnsigned(glob, DocTypeVersion, 2);
+ Ebml_SerializeUnsigned(glob, DocTypeReadVersion, 2);
+ Ebml_EndSubElement(glob, &start);
+
+ /* Open and begin writing the segment element. */
+ Ebml_StartSubElement(glob, &glob->startSegment, Segment);
+ glob->position_reference = ftello(glob->stream);
+ glob->framerate = *fps;
+ write_webm_seek_info(glob);
+
+ /* Open and write the Tracks element. */
+ glob->track_pos = ftello(glob->stream);
+ Ebml_StartSubElement(glob, &trackStart, Tracks);
+
+ /* Open and write the Track entry. */
+ Ebml_StartSubElement(glob, &start, TrackEntry);
+ Ebml_SerializeUnsigned(glob, TrackNumber, trackNumber);
+ glob->track_id_pos = ftello(glob->stream);
+ Ebml_SerializeUnsigned32(glob, TrackUID, trackID);
+ Ebml_SerializeUnsigned(glob, TrackType, 1);
+ Ebml_SerializeString(glob, CodecID,
+ fourcc == VP8_FOURCC ? "V_VP8" : "V_VP9");
+ Ebml_StartSubElement(glob, &videoStart, Video);
+ Ebml_SerializeUnsigned(glob, PixelWidth, pixelWidth);
+ Ebml_SerializeUnsigned(glob, PixelHeight, pixelHeight);
+ Ebml_SerializeUnsigned(glob, StereoMode, stereo_fmt);
+ Ebml_EndSubElement(glob, &videoStart);
+
+ /* Close Track entry. */
+ Ebml_EndSubElement(glob, &start);
+
+ /* Close Tracks element. */
+ Ebml_EndSubElement(glob, &trackStart);
+
+ /* Segment element remains open. */
+}
+
+void write_webm_block(struct EbmlGlobal *glob,
+ const vpx_codec_enc_cfg_t *cfg,
+ const vpx_codec_cx_pkt_t *pkt) {
+ unsigned int block_length;
+ unsigned char track_number;
+ uint16_t block_timecode = 0;
+ unsigned char flags;
+ int64_t pts_ms;
+ int start_cluster = 0, is_keyframe;
+
+ /* Calculate the PTS of this frame in milliseconds. */
+ pts_ms = pkt->data.frame.pts * 1000
+ * (uint64_t)cfg->g_timebase.num / (uint64_t)cfg->g_timebase.den;
+
+ if (pts_ms <= glob->last_pts_ms)
+ pts_ms = glob->last_pts_ms + 1;
+
+ glob->last_pts_ms = pts_ms;
+
+ /* Calculate the relative time of this block. */
+ if (pts_ms - glob->cluster_timecode > SHRT_MAX)
+ start_cluster = 1;
+ else
+ block_timecode = (uint16_t)pts_ms - glob->cluster_timecode;
+
+ is_keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY);
+ if (start_cluster || is_keyframe) {
+ if (glob->cluster_open)
+ Ebml_EndSubElement(glob, &glob->startCluster);
+
+ /* Open the new cluster. */
+ block_timecode = 0;
+ glob->cluster_open = 1;
+ glob->cluster_timecode = (uint32_t)pts_ms;
+ glob->cluster_pos = ftello(glob->stream);
+ Ebml_StartSubElement(glob, &glob->startCluster, Cluster);
+ Ebml_SerializeUnsigned(glob, Timecode, glob->cluster_timecode);
+
+ /* Save a cue point if this is a keyframe. */
+ if (is_keyframe) {
+ struct cue_entry *cue, *new_cue_list;
+
+ new_cue_list = realloc(glob->cue_list,
+ (glob->cues + 1) * sizeof(struct cue_entry));
+ if (new_cue_list)
+ glob->cue_list = new_cue_list;
+ else
+ fatal("Failed to realloc cue list.");
+
+ cue = &glob->cue_list[glob->cues];
+ cue->time = glob->cluster_timecode;
+ cue->loc = glob->cluster_pos;
+ glob->cues++;
+ }
+ }
+
+ /* Write the Simple Block. */
+ Ebml_WriteID(glob, SimpleBlock);
+
+ block_length = (unsigned int)pkt->data.frame.sz + 4;
+ block_length |= 0x10000000;
+ Ebml_Serialize(glob, &block_length, sizeof(block_length), 4);
+
+ track_number = 1;
+ track_number |= 0x80;
+ Ebml_Write(glob, &track_number, 1);
+
+ Ebml_Serialize(glob, &block_timecode, sizeof(block_timecode), 2);
+
+ flags = 0;
+ if (is_keyframe)
+ flags |= 0x80;
+ if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE)
+ flags |= 0x08;
+ Ebml_Write(glob, &flags, 1);
+
+ Ebml_Write(glob, pkt->data.frame.buf, (unsigned int)pkt->data.frame.sz);
+}
+
+void write_webm_file_footer(struct EbmlGlobal *glob, int hash) {
+ EbmlLoc start_cues;
+ EbmlLoc start_cue_point;
+ EbmlLoc start_cue_tracks;
+ unsigned int i;
+
+ if (glob->cluster_open)
+ Ebml_EndSubElement(glob, &glob->startCluster);
+
+ glob->cue_pos = ftello(glob->stream);
+ Ebml_StartSubElement(glob, &start_cues, Cues);
+
+ for (i = 0; i < glob->cues; i++) {
+ struct cue_entry *cue = &glob->cue_list[i];
+ Ebml_StartSubElement(glob, &start_cue_point, CuePoint);
+ Ebml_SerializeUnsigned(glob, CueTime, cue->time);
+
+ Ebml_StartSubElement(glob, &start_cue_tracks, CueTrackPositions);
+ Ebml_SerializeUnsigned(glob, CueTrack, 1);
+ Ebml_SerializeUnsigned64(glob, CueClusterPosition,
+ cue->loc - glob->position_reference);
+ Ebml_EndSubElement(glob, &start_cue_tracks);
+
+ Ebml_EndSubElement(glob, &start_cue_point);
+ }
+
+ Ebml_EndSubElement(glob, &start_cues);
+
+ /* Close the Segment. */
+ Ebml_EndSubElement(glob, &glob->startSegment);
+
+ /* Patch up the seek info block. */
+ write_webm_seek_info(glob);
+
+ /* Patch up the track id. */
+ fseeko(glob->stream, glob->track_id_pos, SEEK_SET);
+ Ebml_SerializeUnsigned32(glob, TrackUID, glob->debug ? 0xDEADBEEF : hash);
+
+ fseeko(glob->stream, 0, SEEK_END);
+}
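
For reference, Ebml_Serialize() above emits the value most-significant byte first, as EBML expects. A small sketch; the `example_serialize` wrapper and the local prototype are illustrative (webmenc.h does not declare Ebml_Serialize), and glob->stream must already be an open file:

#include "webmenc.h"

void Ebml_Serialize(struct EbmlGlobal *glob, const void *buffer_in,
                    int buffer_size, unsigned long len);

static void example_serialize(struct EbmlGlobal *glob) {
  const unsigned int v = 0x11223344;
  /* WRITE_BUFFER walks from the high byte down, so this writes
   * 0x11 0x22 0x33 0x44 to glob->stream. */
  Ebml_Serialize(glob, &v, sizeof(v), 4);
}
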
diff --git a/source/libvpx/webmenc.h b/source/libvpx/webmenc.h
new file mode 100644
index 0000000..f3bc3ec
--- /dev/null
+++ b/source/libvpx/webmenc.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef WEBMENC_H_
+#define WEBMENC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#if defined(_MSC_VER)
+/* MSVS doesn't define off_t */
+typedef __int64 off_t;
+#else
+#include <stdint.h>
+#endif
+
+#include "tools_common.h"
+#include "vpx/vpx_encoder.h"
+
+typedef off_t EbmlLoc;
+
+struct cue_entry {
+ unsigned int time;
+ uint64_t loc;
+};
+
+struct EbmlGlobal {
+ int debug;
+
+ FILE *stream;
+ int64_t last_pts_ms;
+ vpx_rational_t framerate;
+
+ /* These pointers are to the start of an element */
+ off_t position_reference;
+ off_t seek_info_pos;
+ off_t segment_info_pos;
+ off_t track_pos;
+ off_t cue_pos;
+ off_t cluster_pos;
+
+ /* This pointer is to a specific element to be serialized */
+ off_t track_id_pos;
+
+ /* These pointers are to the size field of the element */
+ EbmlLoc startSegment;
+ EbmlLoc startCluster;
+
+ uint32_t cluster_timecode;
+ int cluster_open;
+
+ struct cue_entry *cue_list;
+ unsigned int cues;
+};
+
+/* Stereo 3D packed frame format */
+typedef enum stereo_format {
+ STEREO_FORMAT_MONO = 0,
+ STEREO_FORMAT_LEFT_RIGHT = 1,
+ STEREO_FORMAT_BOTTOM_TOP = 2,
+ STEREO_FORMAT_TOP_BOTTOM = 3,
+ STEREO_FORMAT_RIGHT_LEFT = 11
+} stereo_format_t;
+
+void write_webm_seek_element(struct EbmlGlobal *ebml,
+ unsigned int id,
+ off_t pos);
+
+void write_webm_file_header(struct EbmlGlobal *glob,
+ const vpx_codec_enc_cfg_t *cfg,
+ const struct vpx_rational *fps,
+ stereo_format_t stereo_fmt,
+ unsigned int fourcc);
+
+void write_webm_block(struct EbmlGlobal *glob,
+ const vpx_codec_enc_cfg_t *cfg,
+ const vpx_codec_cx_pkt_t *pkt);
+
+void write_webm_file_footer(struct EbmlGlobal *glob, int hash);
+
+#endif // WEBMENC_H_
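
A sketch of the call order vpxenc follows when writing WebM output; the names `out`, `cfg`, `fps`, and `pkt`, the mono stereo format, and the zero TrackUID hash are illustrative assumptions, not part of the change:

#include <stdlib.h>

#include "./webmenc.h"

static void example_webm_mux(FILE *out,
                             const vpx_codec_enc_cfg_t *cfg,
                             const struct vpx_rational *fps,
                             const vpx_codec_cx_pkt_t *pkt) {
  struct EbmlGlobal ebml = {0};
  ebml.stream = out;
  ebml.last_pts_ms = -1;  /* so a first frame at pts 0 keeps its timestamp */

  /* Header once: EBML header, open Segment, seek info, and the Tracks data. */
  write_webm_file_header(&ebml, cfg, fps, STEREO_FORMAT_MONO, VP8_FOURCC);

  /* Once per compressed frame packet (VPX_CODEC_CX_FRAME_PKT). */
  write_webm_block(&ebml, cfg, pkt);

  /* Footer once: Cues, Segment close, then the patched seek info and TrackUID. */
  write_webm_file_footer(&ebml, 0);

  free(ebml.cue_list);
  ebml.cue_list = NULL;
}
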