diff options
Diffstat (limited to 'libvpx/vp9/common')
82 files changed, 23634 insertions, 0 deletions
diff --git a/libvpx/vp9/common/generic/vp9_systemdependent.c b/libvpx/vp9/common/generic/vp9_systemdependent.c new file mode 100644 index 000000000..79092cd0e --- /dev/null +++ b/libvpx/vp9/common/generic/vp9_systemdependent.c @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "./vpx_config.h" +#include "vp9_rtcd.h" +#include "vp9/common/vp9_onyxc_int.h" + +void vp9_machine_specific_config(VP9_COMMON *ctx) { + vp9_rtcd(); +} diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c new file mode 100644 index 000000000..2660344d5 --- /dev/null +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -0,0 +1,205 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "./vpx_config.h" +#include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_findnearmv.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_systemdependent.h" + +void vp9_update_mode_info_border(VP9_COMMON *cm, MODE_INFO *mi) { + const int stride = cm->mode_info_stride; + int i; + + // Clear down top border row + vpx_memset(mi, 0, sizeof(MODE_INFO) * stride); + + // Clear left border column + for (i = 1; i < cm->mi_rows + 1; i++) + vpx_memset(&mi[i * stride], 0, sizeof(MODE_INFO)); +} + +void vp9_update_mode_info_in_image(VP9_COMMON *cm, MODE_INFO *mi) { + int i, j; + + // For each in image mode_info element set the in image flag to 1 + for (i = 0; i < cm->mi_rows; i++) { + MODE_INFO *ptr = mi; + for (j = 0; j < cm->mi_cols; j++) { + ptr->mbmi.mb_in_image = 1; + ptr++; // Next element in the row + } + + // Step over border element at start of next row + mi += cm->mode_info_stride; + } +} + +void vp9_free_frame_buffers(VP9_COMMON *oci) { + int i; + + for (i = 0; i < NUM_YV12_BUFFERS; i++) + vp9_free_frame_buffer(&oci->yv12_fb[i]); + + vp9_free_frame_buffer(&oci->temp_scale_frame); + vp9_free_frame_buffer(&oci->post_proc_buffer); + + vpx_free(oci->mip); + vpx_free(oci->prev_mip); + vpx_free(oci->above_seg_context); + + vpx_free(oci->above_context[0]); + for (i = 0; i < MAX_MB_PLANE; i++) + oci->above_context[i] = 0; + oci->mip = 0; + oci->prev_mip = 0; + oci->above_seg_context = 0; +} + +static void set_mb_mi(VP9_COMMON *cm, int aligned_width, int aligned_height) { + cm->mb_cols = (aligned_width + 8) >> 4; + cm->mb_rows = (aligned_height + 8) >> 4; + cm->MBs = cm->mb_rows * cm->mb_cols; + + cm->mi_cols = aligned_width >> LOG2_MI_SIZE; + cm->mi_rows = aligned_height >> LOG2_MI_SIZE; + cm->mode_info_stride = cm->mi_cols + 64 / MI_SIZE; +} + +static void setup_mi(VP9_COMMON *cm) { + cm->mi = cm->mip + cm->mode_info_stride + 
1; + cm->prev_mi = cm->prev_mip + cm->mode_info_stride + 1; + + vpx_memset(cm->mip, 0, + cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + + vp9_update_mode_info_border(cm, cm->mip); + vp9_update_mode_info_in_image(cm, cm->mi); + + vp9_update_mode_info_border(cm, cm->prev_mip); + vp9_update_mode_info_in_image(cm, cm->prev_mi); +} + +int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height) { + int i, mi_cols; + + // Our internal buffers are always multiples of 16 + const int aligned_width = multiple8(width); + const int aligned_height = multiple8(height); + const int ss_x = oci->subsampling_x; + const int ss_y = oci->subsampling_y; + + vp9_free_frame_buffers(oci); + + for (i = 0; i < NUM_YV12_BUFFERS; i++) { + oci->fb_idx_ref_cnt[i] = 0; + if (vp9_alloc_frame_buffer(&oci->yv12_fb[i], width, height, ss_x, ss_y, + VP9BORDERINPIXELS) < 0) + goto fail; + } + + oci->new_fb_idx = NUM_YV12_BUFFERS - 1; + oci->fb_idx_ref_cnt[oci->new_fb_idx] = 1; + + for (i = 0; i < ALLOWED_REFS_PER_FRAME; i++) + oci->active_ref_idx[i] = i; + + for (i = 0; i < NUM_REF_FRAMES; i++) { + oci->ref_frame_map[i] = i; + oci->fb_idx_ref_cnt[i] = 1; + } + + if (vp9_alloc_frame_buffer(&oci->temp_scale_frame, width, 16, ss_x, ss_y, + VP9BORDERINPIXELS) < 0) + goto fail; + + if (vp9_alloc_frame_buffer(&oci->post_proc_buffer, width, height, ss_x, ss_y, + VP9BORDERINPIXELS) < 0) + goto fail; + + set_mb_mi(oci, aligned_width, aligned_height); + + // Allocation + oci->mip = vpx_calloc(oci->mode_info_stride * (oci->mi_rows + 64 / MI_SIZE), + sizeof(MODE_INFO)); + if (!oci->mip) + goto fail; + + oci->prev_mip = vpx_calloc(oci->mode_info_stride * + (oci->mi_rows + 64 / MI_SIZE), + sizeof(MODE_INFO)); + if (!oci->prev_mip) + goto fail; + + setup_mi(oci); + + // FIXME(jkoleszar): allocate subsampled arrays for U/V once subsampling + // information is exposed at this level + mi_cols = mi_cols_aligned_to_sb(oci); + + // 2 contexts per 'mi unit', so that we have one context per 4x4 txfm + 
// block where mi unit size is 8x8. +# if CONFIG_ALPHA + oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 8 * mi_cols, 1); +#else + oci->above_context[0] = vpx_calloc(sizeof(ENTROPY_CONTEXT) * 6 * mi_cols, 1); +#endif + if (!oci->above_context[0]) + goto fail; + + for (i = 1; i < MAX_MB_PLANE; i++) + oci->above_context[i] = + oci->above_context[0] + i * sizeof(ENTROPY_CONTEXT) * 2 * mi_cols; + + oci->above_seg_context = vpx_calloc(sizeof(PARTITION_CONTEXT) * mi_cols, 1); + if (!oci->above_seg_context) + goto fail; + + return 0; + + fail: + vp9_free_frame_buffers(oci); + return 1; +} + +void vp9_create_common(VP9_COMMON *oci) { + vp9_machine_specific_config(oci); + + vp9_init_mbmode_probs(oci); + + oci->txfm_mode = ONLY_4X4; + oci->comp_pred_mode = HYBRID_PREDICTION; + oci->clr_type = REG_YUV; + + // Initialize reference frame sign bias structure to defaults + vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias)); +} + +void vp9_remove_common(VP9_COMMON *oci) { + vp9_free_frame_buffers(oci); +} + +void vp9_initialize_common() { + vp9_coef_tree_initialize(); + vp9_entropy_mode_init(); + vp9_entropy_mv_init(); +} + +void vp9_update_frame_size(VP9_COMMON *cm) { + const int aligned_width = multiple8(cm->width); + const int aligned_height = multiple8(cm->height); + + set_mb_mi(cm, aligned_width, aligned_height); + setup_mi(cm); +} diff --git a/libvpx/vp9/common/vp9_alloccommon.h b/libvpx/vp9/common/vp9_alloccommon.h new file mode 100644 index 000000000..8bf5ed160 --- /dev/null +++ b/libvpx/vp9/common/vp9_alloccommon.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP9_COMMON_VP9_ALLOCCOMMON_H_ +#define VP9_COMMON_VP9_ALLOCCOMMON_H_ + +#include "vp9/common/vp9_onyxc_int.h" + +void vp9_initialize_common(); + +void vp9_update_mode_info_border(VP9_COMMON *cpi, MODE_INFO *mi); +void vp9_update_mode_info_in_image(VP9_COMMON *cpi, MODE_INFO *mi); + +void vp9_create_common(VP9_COMMON *oci); +void vp9_remove_common(VP9_COMMON *oci); + +int vp9_alloc_frame_buffers(VP9_COMMON *oci, int width, int height); +void vp9_free_frame_buffers(VP9_COMMON *oci); + + +void vp9_update_frame_size(VP9_COMMON *cm); + +#endif // VP9_COMMON_VP9_ALLOCCOMMON_H_ diff --git a/libvpx/vp9/common/vp9_asm_com_offsets.c b/libvpx/vp9/common/vp9_asm_com_offsets.c new file mode 100644 index 000000000..94ccb6ebd --- /dev/null +++ b/libvpx/vp9/common/vp9_asm_com_offsets.c @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" +#include "vpx/vpx_codec.h" +#include "vpx_ports/asm_offsets.h" + +BEGIN + +END + +/* add asserts for any offset that is not supported by assembly code */ +/* add asserts for any size that is not supported by assembly code */ diff --git a/libvpx/vp9/common/vp9_blockd.h b/libvpx/vp9/common/vp9_blockd.h new file mode 100644 index 000000000..37d29af17 --- /dev/null +++ b/libvpx/vp9/common/vp9_blockd.h @@ -0,0 +1,904 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_COMMON_VP9_BLOCKD_H_ +#define VP9_COMMON_VP9_BLOCKD_H_ + +#include "./vpx_config.h" +#include "vpx_scale/yv12config.h" +#include "vp9/common/vp9_convolve.h" +#include "vp9/common/vp9_mv.h" +#include "vp9/common/vp9_treecoder.h" +#include "vpx_ports/mem.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" + +#define BLOCK_SIZE_GROUPS 4 +#define MAX_MB_SEGMENTS 8 +#define MB_SEG_TREE_PROBS (MAX_MB_SEGMENTS-1) + +#define PREDICTION_PROBS 3 + +#define MBSKIP_CONTEXTS 3 + +#define MAX_REF_LF_DELTAS 4 +#define MAX_MODE_LF_DELTAS 2 + +/* Segment Feature Masks */ +#define SEGMENT_DELTADATA 0 +#define SEGMENT_ABSDATA 1 +#define MAX_MV_REF_CANDIDATES 2 + +#define INTRA_INTER_CONTEXTS 4 +#define COMP_INTER_CONTEXTS 5 +#define REF_CONTEXTS 5 + +typedef enum { + PLANE_TYPE_Y_WITH_DC, + PLANE_TYPE_UV, +} PLANE_TYPE; + +typedef char ENTROPY_CONTEXT; + +typedef char PARTITION_CONTEXT; + +static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a, + ENTROPY_CONTEXT b) { + return (a != 0) + (b != 0); +} + +typedef enum { + KEY_FRAME = 0, + INTER_FRAME = 1, + NUM_FRAME_TYPES, +} FRAME_TYPE; + +typedef enum { + EIGHTTAP_SMOOTH, + EIGHTTAP, + EIGHTTAP_SHARP, + BILINEAR, + SWITCHABLE /* should be the last one */ +} INTERPOLATIONFILTERTYPE; + +typedef enum { + DC_PRED, // Average of above and left pixels + V_PRED, // Vertical + H_PRED, // Horizontal + D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi) + D135_PRED, // Directional 135 deg = 180 - 45 + D117_PRED, // Directional 117 deg = 180 - 63 + D153_PRED, // Directional 153 deg = 180 - 27 + D27_PRED, // Directional 27 deg = round(arctan(1/2) * 180/pi) + D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi) + TM_PRED, // True-motion + NEARESTMV, + NEARMV, + ZEROMV, + NEWMV, + MB_MODE_COUNT +} MB_PREDICTION_MODE; + +static INLINE int is_inter_mode(MB_PREDICTION_MODE mode) { + 
return mode >= NEARESTMV && mode <= NEWMV; +} + +// Segment level features. +typedef enum { + SEG_LVL_ALT_Q = 0, // Use alternate Quantizer .... + SEG_LVL_ALT_LF = 1, // Use alternate loop filter value... + SEG_LVL_REF_FRAME = 2, // Optional Segment reference frame + SEG_LVL_SKIP = 3, // Optional Segment (0,0) + skip mode + SEG_LVL_MAX = 4 // Number of MB level features supported +} SEG_LVL_FEATURES; + +// Segment level features. +typedef enum { + TX_4X4 = 0, // 4x4 dct transform + TX_8X8 = 1, // 8x8 dct transform + TX_16X16 = 2, // 16x16 dct transform + TX_32X32 = 3, // 32x32 dct transform + TX_SIZE_MAX_SB, // Number of transforms available to SBs +} TX_SIZE; + +typedef enum { + DCT_DCT = 0, // DCT in both horizontal and vertical + ADST_DCT = 1, // ADST in vertical, DCT in horizontal + DCT_ADST = 2, // DCT in vertical, ADST in horizontal + ADST_ADST = 3 // ADST in both directions +} TX_TYPE; + +#define VP9_INTRA_MODES (TM_PRED + 1) + +#define VP9_INTER_MODES (1 + NEWMV - NEARESTMV) + +#define WHT_UPSCALE_FACTOR 2 + +#define TX_SIZE_PROBS 6 // (TX_SIZE_MAX_SB * (TX_SIZE_MAX_SB - 1) / 2) + +#define get_tx_probs(c, b) ((b) < BLOCK_SIZE_MB16X16 ? \ + (c)->fc.tx_probs_8x8p : \ + (b) < BLOCK_SIZE_SB32X32 ? \ + (c)->fc.tx_probs_16x16p : (c)->fc.tx_probs_32x32p) + +/* For keyframes, intra block modes are predicted by the (already decoded) + modes for the Y blocks to the left and above us; for interframes, there + is a single probability table. 
*/ + +union b_mode_info { + struct { + MB_PREDICTION_MODE first; + } as_mode; + int_mv as_mv[2]; // first, second inter predictor motion vectors +}; + +typedef enum { + NONE = -1, + INTRA_FRAME = 0, + LAST_FRAME = 1, + GOLDEN_FRAME = 2, + ALTREF_FRAME = 3, + MAX_REF_FRAMES = 4 +} MV_REFERENCE_FRAME; + +static INLINE int b_width_log2(BLOCK_SIZE_TYPE sb_type) { + switch (sb_type) { + case BLOCK_SIZE_SB4X8: + case BLOCK_SIZE_AB4X4: return 0; + case BLOCK_SIZE_SB8X4: + case BLOCK_SIZE_SB8X8: + case BLOCK_SIZE_SB8X16: return 1; + case BLOCK_SIZE_SB16X8: + case BLOCK_SIZE_MB16X16: + case BLOCK_SIZE_SB16X32: return 2; + case BLOCK_SIZE_SB32X16: + case BLOCK_SIZE_SB32X32: + case BLOCK_SIZE_SB32X64: return 3; + case BLOCK_SIZE_SB64X32: + case BLOCK_SIZE_SB64X64: return 4; + default: assert(0); + return -1; + } +} + +static INLINE int b_height_log2(BLOCK_SIZE_TYPE sb_type) { + switch (sb_type) { + case BLOCK_SIZE_SB8X4: + case BLOCK_SIZE_AB4X4: return 0; + case BLOCK_SIZE_SB4X8: + case BLOCK_SIZE_SB8X8: + case BLOCK_SIZE_SB16X8: return 1; + case BLOCK_SIZE_SB8X16: + case BLOCK_SIZE_MB16X16: + case BLOCK_SIZE_SB32X16: return 2; + case BLOCK_SIZE_SB16X32: + case BLOCK_SIZE_SB32X32: + case BLOCK_SIZE_SB64X32: return 3; + case BLOCK_SIZE_SB32X64: + case BLOCK_SIZE_SB64X64: return 4; + default: assert(0); + return -1; + } +} + +static INLINE int mi_width_log2(BLOCK_SIZE_TYPE sb_type) { + int a = b_width_log2(sb_type) - 1; + // align 4x4 block to mode_info + if (a < 0) + a = 0; + assert(a >= 0); + return a; +} + +static INLINE int mi_height_log2(BLOCK_SIZE_TYPE sb_type) { + int a = b_height_log2(sb_type) - 1; + if (a < 0) + a = 0; + assert(a >= 0); + return a; +} + +typedef struct { + MB_PREDICTION_MODE mode, uv_mode; + MV_REFERENCE_FRAME ref_frame[2]; + TX_SIZE txfm_size; + int_mv mv[2]; // for each reference frame used + int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; + int_mv best_mv, best_second_mv; + + int mb_mode_context[MAX_REF_FRAMES]; + + unsigned char 
mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */ + unsigned char segment_id; // Segment id for current frame + + // Flags used for prediction status of various bistream signals + unsigned char seg_id_predicted; + + // Indicates if the mb is part of the image (1) vs border (0) + // This can be useful in determining whether the MB provides + // a valid predictor + unsigned char mb_in_image; + + INTERPOLATIONFILTERTYPE interp_filter; + + BLOCK_SIZE_TYPE sb_type; +} MB_MODE_INFO; + +typedef struct { + MB_MODE_INFO mbmi; + union b_mode_info bmi[4]; +} MODE_INFO; + +#define VP9_REF_SCALE_SHIFT 14 +struct scale_factors { + int x_scale_fp; // horizontal fixed point scale factor + int y_scale_fp; // vertical fixed point scale factor + int x_offset_q4; + int x_step_q4; + int y_offset_q4; + int y_step_q4; + + int (*scale_value_x)(int val, const struct scale_factors *scale); + int (*scale_value_y)(int val, const struct scale_factors *scale); + void (*set_scaled_offsets)(struct scale_factors *scale, int row, int col); + int_mv32 (*scale_mv_q3_to_q4)(const int_mv *src_mv, + const struct scale_factors *scale); + int32_t (*scale_mv_component_q4)(int mv_q4, int scale_fp, int offset_q4); + + convolve_fn_t predict[2][2][2]; // horiz, vert, avg +}; + +#if CONFIG_ALPHA +enum { MAX_MB_PLANE = 4 }; +#else +enum { MAX_MB_PLANE = 3 }; +#endif + +struct buf_2d { + uint8_t *buf; + int stride; +}; + +struct macroblockd_plane { + DECLARE_ALIGNED(16, int16_t, qcoeff[64 * 64]); + DECLARE_ALIGNED(16, int16_t, dqcoeff[64 * 64]); + DECLARE_ALIGNED(16, uint16_t, eobs[256]); + PLANE_TYPE plane_type; + int subsampling_x; + int subsampling_y; + struct buf_2d dst; + struct buf_2d pre[2]; + int16_t *dequant; + ENTROPY_CONTEXT *above_context; + ENTROPY_CONTEXT *left_context; +}; + +#define BLOCK_OFFSET(x, i, n) ((x) + (i) * (n)) + +typedef struct macroblockd { + struct macroblockd_plane plane[MAX_MB_PLANE]; + + struct scale_factors scale_factor[2]; + 
struct scale_factors scale_factor_uv[2]; + + MODE_INFO *prev_mode_info_context; + MODE_INFO *mode_info_context; + int mode_info_stride; + + FRAME_TYPE frame_type; + + int up_available; + int left_available; + int right_available; + + // partition contexts + PARTITION_CONTEXT *above_seg_context; + PARTITION_CONTEXT *left_seg_context; + + /* 0 (disable) 1 (enable) segmentation */ + unsigned char segmentation_enabled; + + /* 0 (do not update) 1 (update) the macroblock segmentation map. */ + unsigned char update_mb_segmentation_map; + + /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ + unsigned char update_mb_segmentation_data; + + /* 0 (do not update) 1 (update) the macroblock segmentation feature data. */ + unsigned char mb_segment_abs_delta; + + /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */ + /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */ + + // Probability Tree used to code Segment number + vp9_prob mb_segment_tree_probs[MB_SEG_TREE_PROBS]; + + // Segment features + int16_t segment_feature_data[MAX_MB_SEGMENTS][SEG_LVL_MAX]; + unsigned int segment_feature_mask[MAX_MB_SEGMENTS]; + + /* mode_based Loop filter adjustment */ + unsigned char mode_ref_lf_delta_enabled; + unsigned char mode_ref_lf_delta_update; + + /* Delta values have the range +/- MAX_LOOP_FILTER */ + /* 0 = Intra, Last, GF, ARF */ + signed char last_ref_lf_deltas[MAX_REF_LF_DELTAS]; + /* 0 = Intra, Last, GF, ARF */ + signed char ref_lf_deltas[MAX_REF_LF_DELTAS]; + /* 0 = ZERO_MV, MV */ + signed char last_mode_lf_deltas[MAX_MODE_LF_DELTAS]; + /* 0 = ZERO_MV, MV */ + signed char mode_lf_deltas[MAX_MODE_LF_DELTAS]; + + /* Distance of MB away from frame edges */ + int mb_to_left_edge; + int mb_to_right_edge; + int mb_to_top_edge; + int mb_to_bottom_edge; + + unsigned int frames_since_golden; + unsigned int frames_till_alt_ref_frame; + + int lossless; + /* Inverse transform 
function pointers. */ + void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride); + void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride); + void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob); + + struct subpix_fn_table subpix; + + int allow_high_precision_mv; + + int corrupted; + + int sb_index; // index of 32x32 block inside the 64x64 block + int mb_index; // index of 16x16 block inside the 32x32 block + int b_index; // index of 8x8 block inside the 16x16 block + int ab_index; // index of 4x4 block inside the 8x8 block + int q_index; + +} MACROBLOCKD; + +static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { + switch (subsize) { + case BLOCK_SIZE_SB64X64: + case BLOCK_SIZE_SB64X32: + case BLOCK_SIZE_SB32X64: + case BLOCK_SIZE_SB32X32: + return &xd->sb_index; + case BLOCK_SIZE_SB32X16: + case BLOCK_SIZE_SB16X32: + case BLOCK_SIZE_MB16X16: + return &xd->mb_index; + case BLOCK_SIZE_SB16X8: + case BLOCK_SIZE_SB8X16: + case BLOCK_SIZE_SB8X8: + return &xd->b_index; + case BLOCK_SIZE_SB8X4: + case BLOCK_SIZE_SB4X8: + case BLOCK_SIZE_AB4X4: + return &xd->ab_index; + default: + assert(0); + return NULL; + } +} + +static INLINE void update_partition_context(MACROBLOCKD *xd, + BLOCK_SIZE_TYPE sb_type, + BLOCK_SIZE_TYPE sb_size) { + int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2; + int bwl = b_width_log2(sb_type); + int bhl = b_height_log2(sb_type); + int boffset = b_width_log2(BLOCK_SIZE_SB64X64) - bsl; + int i; + + // update the partition context at the end notes. set partition bits + // of block sizes larger than the current one to be one, and partition + // bits of smaller block sizes to be zero. 
+ if ((bwl == bsl) && (bhl == bsl)) { + for (i = 0; i < bs; i++) + xd->left_seg_context[i] = ~(0xf << boffset); + for (i = 0; i < bs; i++) + xd->above_seg_context[i] = ~(0xf << boffset); + } else if ((bwl == bsl) && (bhl < bsl)) { + for (i = 0; i < bs; i++) + xd->left_seg_context[i] = ~(0xe << boffset); + for (i = 0; i < bs; i++) + xd->above_seg_context[i] = ~(0xf << boffset); + } else if ((bwl < bsl) && (bhl == bsl)) { + for (i = 0; i < bs; i++) + xd->left_seg_context[i] = ~(0xf << boffset); + for (i = 0; i < bs; i++) + xd->above_seg_context[i] = ~(0xe << boffset); + } else if ((bwl < bsl) && (bhl < bsl)) { + for (i = 0; i < bs; i++) + xd->left_seg_context[i] = ~(0xe << boffset); + for (i = 0; i < bs; i++) + xd->above_seg_context[i] = ~(0xe << boffset); + } else { + assert(0); + } +} + +static INLINE int partition_plane_context(MACROBLOCKD *xd, + BLOCK_SIZE_TYPE sb_type) { + int bsl = mi_width_log2(sb_type), bs = 1 << bsl; + int above = 0, left = 0, i; + int boffset = mi_width_log2(BLOCK_SIZE_SB64X64) - bsl; + + assert(mi_width_log2(sb_type) == mi_height_log2(sb_type)); + assert(bsl >= 0); + assert(boffset >= 0); + + for (i = 0; i < bs; i++) + above |= (xd->above_seg_context[i] & (1 << boffset)); + for (i = 0; i < bs; i++) + left |= (xd->left_seg_context[i] & (1 << boffset)); + + above = (above > 0); + left = (left > 0); + + return (left * 2 + above) + bsl * PARTITION_PLOFFSET; +} + +static BLOCK_SIZE_TYPE get_subsize(BLOCK_SIZE_TYPE bsize, + PARTITION_TYPE partition) { + BLOCK_SIZE_TYPE subsize; + switch (partition) { + case PARTITION_NONE: + subsize = bsize; + break; + case PARTITION_HORZ: + if (bsize == BLOCK_SIZE_SB64X64) + subsize = BLOCK_SIZE_SB64X32; + else if (bsize == BLOCK_SIZE_SB32X32) + subsize = BLOCK_SIZE_SB32X16; + else if (bsize == BLOCK_SIZE_MB16X16) + subsize = BLOCK_SIZE_SB16X8; + else if (bsize == BLOCK_SIZE_SB8X8) + subsize = BLOCK_SIZE_SB8X4; + else + assert(0); + break; + case PARTITION_VERT: + if (bsize == BLOCK_SIZE_SB64X64) + subsize = 
BLOCK_SIZE_SB32X64; + else if (bsize == BLOCK_SIZE_SB32X32) + subsize = BLOCK_SIZE_SB16X32; + else if (bsize == BLOCK_SIZE_MB16X16) + subsize = BLOCK_SIZE_SB8X16; + else if (bsize == BLOCK_SIZE_SB8X8) + subsize = BLOCK_SIZE_SB4X8; + else + assert(0); + break; + case PARTITION_SPLIT: + if (bsize == BLOCK_SIZE_SB64X64) + subsize = BLOCK_SIZE_SB32X32; + else if (bsize == BLOCK_SIZE_SB32X32) + subsize = BLOCK_SIZE_MB16X16; + else if (bsize == BLOCK_SIZE_MB16X16) + subsize = BLOCK_SIZE_SB8X8; + else if (bsize == BLOCK_SIZE_SB8X8) + subsize = BLOCK_SIZE_AB4X4; + else + assert(0); + break; + default: + assert(0); + } + return subsize; +} + +// transform mapping +static TX_TYPE txfm_map(MB_PREDICTION_MODE bmode) { + switch (bmode) { + case TM_PRED : + case D135_PRED : + return ADST_ADST; + + case V_PRED : + case D117_PRED : + case D63_PRED: + return ADST_DCT; + + case H_PRED : + case D153_PRED : + case D27_PRED : + return DCT_ADST; + + default: + return DCT_DCT; + } +} + +static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) { + TX_TYPE tx_type; + MODE_INFO *mi = xd->mode_info_context; + MB_MODE_INFO *const mbmi = &mi->mbmi; + if (xd->lossless || mbmi->ref_frame[0] != INTRA_FRAME) + return DCT_DCT; + if (mbmi->sb_type < BLOCK_SIZE_SB8X8) { + tx_type = txfm_map(mi->bmi[ib].as_mode.first); + } else { + assert(mbmi->mode <= TM_PRED); + tx_type = txfm_map(mbmi->mode); + } + return tx_type; +} + +static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) { + TX_TYPE tx_type = DCT_DCT; + if (xd->mode_info_context->mbmi.mode <= TM_PRED) { + tx_type = txfm_map(xd->mode_info_context->mbmi.mode); + } + return tx_type; +} + +static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) { + TX_TYPE tx_type = DCT_DCT; + if (xd->mode_info_context->mbmi.mode <= TM_PRED) { + tx_type = txfm_map(xd->mode_info_context->mbmi.mode); + } + return tx_type; +} + +void vp9_setup_block_dptrs(MACROBLOCKD *xd, + int subsampling_x, int subsampling_y); + +static TX_SIZE 
get_uv_tx_size(const MB_MODE_INFO *mbmi) { + const TX_SIZE size = mbmi->txfm_size; + + switch (mbmi->sb_type) { + case BLOCK_SIZE_SB64X64: + return size; + case BLOCK_SIZE_SB64X32: + case BLOCK_SIZE_SB32X64: + case BLOCK_SIZE_SB32X32: + if (size == TX_32X32) + return TX_16X16; + else + return size; + case BLOCK_SIZE_SB32X16: + case BLOCK_SIZE_SB16X32: + case BLOCK_SIZE_MB16X16: + if (size == TX_16X16) + return TX_8X8; + else + return size; + default: + return TX_4X4; + } + + return size; +} + +struct plane_block_idx { + int plane; + int block; +}; + +// TODO(jkoleszar): returning a struct so it can be used in a const context, +// expect to refactor this further later. +static INLINE struct plane_block_idx plane_block_idx(int y_blocks, + int b_idx) { + const int v_offset = y_blocks * 5 / 4; + struct plane_block_idx res; + + if (b_idx < y_blocks) { + res.plane = 0; + res.block = b_idx; + } else if (b_idx < v_offset) { + res.plane = 1; + res.block = b_idx - y_blocks; + } else { + assert(b_idx < y_blocks * 3 / 2); + res.plane = 2; + res.block = b_idx - v_offset; + } + return res; +} + +static INLINE int plane_block_width(BLOCK_SIZE_TYPE bsize, + const struct macroblockd_plane* plane) { + return 4 << (b_width_log2(bsize) - plane->subsampling_x); +} + +static INLINE int plane_block_height(BLOCK_SIZE_TYPE bsize, + const struct macroblockd_plane* plane) { + return 4 << (b_height_log2(bsize) - plane->subsampling_y); +} + +typedef void (*foreach_transformed_block_visitor)(int plane, int block, + BLOCK_SIZE_TYPE bsize, + int ss_txfrm_size, + void *arg); + +static INLINE void foreach_transformed_block_in_plane( + const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane, + foreach_transformed_block_visitor visit, void *arg) { + const int bw = b_width_log2(bsize), bh = b_height_log2(bsize); + + // block and transform sizes, in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // transform size varies per plane, look it up in a common way. 
+ const MB_MODE_INFO* mbmi = &xd->mode_info_context->mbmi; + const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) + : mbmi->txfm_size; + const int block_size_b = bw + bh; + const int txfrm_size_b = tx_size * 2; + + // subsampled size of the block + const int ss_sum = xd->plane[plane].subsampling_x + + xd->plane[plane].subsampling_y; + const int ss_block_size = block_size_b - ss_sum; + + const int step = 1 << txfrm_size_b; + + int i; + + assert(txfrm_size_b <= block_size_b); + assert(txfrm_size_b <= ss_block_size); + + // If mb_to_right_edge is < 0 we are in a situation in which + // the current block size extends into the UMV and we won't + // visit the sub blocks that are wholly within the UMV. + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) { + int r, c; + const int sw = bw - xd->plane[plane].subsampling_x; + const int sh = bh - xd->plane[plane].subsampling_y; + int max_blocks_wide = 1 << sw; + int max_blocks_high = 1 << sh; + + // xd->mb_to_right_edge is in units of pixels * 8. This converts + // it to 4x4 block sizes. + if (xd->mb_to_right_edge < 0) + max_blocks_wide += + + (xd->mb_to_right_edge >> (5 + xd->plane[plane].subsampling_x)); + + if (xd->mb_to_bottom_edge < 0) + max_blocks_high += + + (xd->mb_to_bottom_edge >> (5 + xd->plane[plane].subsampling_y)); + + i = 0; + // Unlike the normal case - in here we have to keep track of the + // row and column of the blocks we use so that we know if we are in + // the unrestricted motion border.. 
+ for (r = 0; r < (1 << sh); r += (1 << tx_size)) { + for (c = 0; c < (1 << sw); c += (1 << tx_size)) { + if (r < max_blocks_high && c < max_blocks_wide) + visit(plane, i, bsize, txfrm_size_b, arg); + i += step; + } + } + } else { + for (i = 0; i < (1 << ss_block_size); i += step) { + visit(plane, i, bsize, txfrm_size_b, arg); + } + } +} + +static INLINE void foreach_transformed_block( + const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, + foreach_transformed_block_visitor visit, void *arg) { + int plane; + + for (plane = 0; plane < MAX_MB_PLANE; plane++) { + foreach_transformed_block_in_plane(xd, bsize, plane, + visit, arg); + } +} + +static INLINE void foreach_transformed_block_uv( + const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, + foreach_transformed_block_visitor visit, void *arg) { + int plane; + + for (plane = 1; plane < MAX_MB_PLANE; plane++) { + foreach_transformed_block_in_plane(xd, bsize, plane, + visit, arg); + } +} + +// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could +// calculate the subsampled BLOCK_SIZE_TYPE, but that type isn't defined for +// sizes smaller than 16x16 yet. +typedef void (*foreach_predicted_block_visitor)(int plane, int block, + BLOCK_SIZE_TYPE bsize, + int pred_w, int pred_h, + void *arg); +static INLINE void foreach_predicted_block_in_plane( + const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, int plane, + foreach_predicted_block_visitor visit, void *arg) { + int i, x, y; + + // block sizes in number of 4x4 blocks log 2 ("*_b") + // 4x4=0, 8x8=2, 16x16=4, 32x32=6, 64x64=8 + // subsampled size of the block + const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; + const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y; + + // size of the predictor to use. 
+ int pred_w, pred_h; + + if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) { + assert(bsize == BLOCK_SIZE_SB8X8); + pred_w = 0; + pred_h = 0; + } else { + pred_w = bwl; + pred_h = bhl; + } + assert(pred_w <= bwl); + assert(pred_h <= bhl); + + // visit each subblock in raster order + i = 0; + for (y = 0; y < 1 << bhl; y += 1 << pred_h) { + for (x = 0; x < 1 << bwl; x += 1 << pred_w) { + visit(plane, i, bsize, pred_w, pred_h, arg); + i += 1 << pred_w; + } + i += (1 << (bwl + pred_h)) - (1 << bwl); + } +} +static INLINE void foreach_predicted_block( + const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, + foreach_predicted_block_visitor visit, void *arg) { + int plane; + + for (plane = 0; plane < MAX_MB_PLANE; plane++) { + foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg); + } +} +static INLINE void foreach_predicted_block_uv( + const MACROBLOCKD* const xd, BLOCK_SIZE_TYPE bsize, + foreach_predicted_block_visitor visit, void *arg) { + int plane; + + for (plane = 1; plane < MAX_MB_PLANE; plane++) { + foreach_predicted_block_in_plane(xd, bsize, plane, visit, arg); + } +} +static int raster_block_offset(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, + int plane, int block, int stride) { + const int bw = b_width_log2(bsize) - xd->plane[plane].subsampling_x; + const int y = 4 * (block >> bw), x = 4 * (block & ((1 << bw) - 1)); + return y * stride + x; +} +static int16_t* raster_block_offset_int16(MACROBLOCKD *xd, + BLOCK_SIZE_TYPE bsize, + int plane, int block, int16_t *base) { + const int stride = plane_block_width(bsize, &xd->plane[plane]); + return base + raster_block_offset(xd, bsize, plane, block, stride); +} +static uint8_t* raster_block_offset_uint8(MACROBLOCKD *xd, + BLOCK_SIZE_TYPE bsize, + int plane, int block, + uint8_t *base, int stride) { + return base + raster_block_offset(xd, bsize, plane, block, stride); +} + +static int txfrm_block_to_raster_block(MACROBLOCKD *xd, + BLOCK_SIZE_TYPE bsize, + int plane, int block, + int ss_txfrm_size) { + 
const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; + const int txwl = ss_txfrm_size / 2; + const int tx_cols_lg2 = bwl - txwl; + const int tx_cols = 1 << tx_cols_lg2; + const int raster_mb = block >> ss_txfrm_size; + const int x = (raster_mb & (tx_cols - 1)) << (txwl); + const int y = raster_mb >> tx_cols_lg2 << (txwl); + return x + (y << bwl); +} + +static void txfrm_block_to_raster_xy(MACROBLOCKD *xd, + BLOCK_SIZE_TYPE bsize, + int plane, int block, + int ss_txfrm_size, + int *x, int *y) { + const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x; + const int txwl = ss_txfrm_size / 2; + const int tx_cols_lg2 = bwl - txwl; + const int tx_cols = 1 << tx_cols_lg2; + const int raster_mb = block >> ss_txfrm_size; + *x = (raster_mb & (tx_cols - 1)) << (txwl); + *y = raster_mb >> tx_cols_lg2 << (txwl); +} + +static void extend_for_intra(MACROBLOCKD* const xd, int plane, int block, + BLOCK_SIZE_TYPE bsize, int ss_txfrm_size) { + const int bw = plane_block_width(bsize, &xd->plane[plane]); + const int bh = plane_block_height(bsize, &xd->plane[plane]); + int x, y; + txfrm_block_to_raster_xy(xd, bsize, plane, block, ss_txfrm_size, &x, &y); + x = x * 4 - 1; + y = y * 4 - 1; + // Copy a pixel into the umv if we are in a situation where the block size + // extends into the UMV. + // TODO(JBB): Should be able to do the full extend in place so we don't have + // to do this multiple times. 
+ if (xd->mb_to_right_edge < 0) { + int umv_border_start = bw + + (xd->mb_to_right_edge >> (3 + xd->plane[plane].subsampling_x)); + + if (x + bw > umv_border_start) + vpx_memset( + xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride + + umv_border_start, + *(xd->plane[plane].dst.buf + y * xd->plane[plane].dst.stride + + umv_border_start - 1), + bw); + } + if (xd->mb_to_bottom_edge < 0) { + int umv_border_start = bh + + (xd->mb_to_bottom_edge >> (3 + xd->plane[plane].subsampling_y)); + int i; + uint8_t c = *(xd->plane[plane].dst.buf + + (umv_border_start - 1) * xd->plane[plane].dst.stride + x); + + uint8_t *d = xd->plane[plane].dst.buf + + umv_border_start * xd->plane[plane].dst.stride + x; + + if (y + bh > umv_border_start) + for (i = 0; i < bh; i++, d += xd->plane[plane].dst.stride) + *d = c; + } +} +static void set_contexts_on_border(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, + int plane, int ss_tx_size, int eob, int aoff, + int loff, ENTROPY_CONTEXT *A, + ENTROPY_CONTEXT *L) { + const int bw = b_width_log2(bsize), bh = b_height_log2(bsize); + const int sw = bw - xd->plane[plane].subsampling_x; + const int sh = bh - xd->plane[plane].subsampling_y; + int mi_blocks_wide = 1 << sw; + int mi_blocks_high = 1 << sh; + int tx_size_in_blocks = (1 << ss_tx_size); + int above_contexts = tx_size_in_blocks; + int left_contexts = tx_size_in_blocks; + int pt; + + // xd->mb_to_right_edge is in units of pixels * 8. This converts + // it to 4x4 block sizes. + if (xd->mb_to_right_edge < 0) { + mi_blocks_wide += (xd->mb_to_right_edge + >> (5 + xd->plane[plane].subsampling_x)); + } + + // this code attempts to avoid copying into contexts that are outside + // our border. Any blocks that do are set to 0... 
+ if (above_contexts + aoff > mi_blocks_wide) + above_contexts = mi_blocks_wide - aoff; + + if (xd->mb_to_bottom_edge < 0) { + mi_blocks_high += (xd->mb_to_bottom_edge + >> (5 + xd->plane[plane].subsampling_y)); + } + if (left_contexts + loff > mi_blocks_high) { + left_contexts = mi_blocks_high - loff; + } + + for (pt = 0; pt < above_contexts; pt++) + A[pt] = eob > 0; + for (pt = above_contexts; pt < (1 << ss_tx_size); pt++) + A[pt] = 0; + for (pt = 0; pt < left_contexts; pt++) + L[pt] = eob > 0; + for (pt = left_contexts; pt < (1 << ss_tx_size); pt++) + L[pt] = 0; +} + + +#endif // VP9_COMMON_VP9_BLOCKD_H_ diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h new file mode 100644 index 000000000..0d7babf97 --- /dev/null +++ b/libvpx/vp9/common/vp9_common.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_COMMON_H_ +#define VP9_COMMON_VP9_COMMON_H_ + +/* Interface header for common constant data structures and lookup tables */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx/vpx_integer.h" + +#define MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) + +#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n)) + +/* If we don't want to use ROUND_POWER_OF_TWO macro +static INLINE int16_t round_power_of_two(int16_t value, int n) { + return (value + (1 << (n - 1))) >> n; +}*/ + +// Only need this for fixed-size arrays, for structs just assign. 
+#define vp9_copy(dest, src) { \ + assert(sizeof(dest) == sizeof(src)); \ + vpx_memcpy(dest, src, sizeof(src)); \ + } + +// Use this for variably-sized arrays. +#define vp9_copy_array(dest, src, n) { \ + assert(sizeof(*dest) == sizeof(*src)); \ + vpx_memcpy(dest, src, n * sizeof(*src)); \ + } + +#define vp9_zero(dest) vpx_memset(&dest, 0, sizeof(dest)); +#define vp9_zero_array(dest, n) vpx_memset(dest, 0, n * sizeof(*dest)); + +static INLINE uint8_t clip_pixel(int val) { + return (val > 255) ? 255u : (val < 0) ? 0u : val; +} + +static INLINE int clamp(int value, int low, int high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE double fclamp(double value, double low, double high) { + return value < low ? low : (value > high ? high : value); +} + +static INLINE int multiple8(int value) { + return (value + 7) & ~7; +} + +#define SYNC_CODE_0 0x49 +#define SYNC_CODE_1 0x83 +#define SYNC_CODE_2 0x42 + + +#endif // VP9_COMMON_VP9_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_convolve.c b/libvpx/vp9/common/vp9_convolve.c new file mode 100644 index 000000000..46ae50349 --- /dev/null +++ b/libvpx/vp9/common/vp9_convolve.c @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include "vp9/common/vp9_convolve.h" + +#include <assert.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_common.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define VP9_FILTER_WEIGHT 128 +#define VP9_FILTER_SHIFT 7 + +/* Assume a bank of 16 filters to choose from. 
There are two implementations + * for filter wrapping behavior, since we want to be able to pick which filter + * to start with. We could either: + * + * 1) make filter_ a pointer to the base of the filter array, and then add an + * additional offset parameter, to choose the starting filter. + * 2) use a pointer to 2 periods worth of filters, so that even if the original + * phase offset is at 15/16, we'll have valid data to read. The filter + * tables become [32][8], and the second half is duplicated. + * 3) fix the alignment of the filter tables, so that we know the 0/16 is + * always 256 byte aligned. + * + * Implementations 2 and 3 are likely preferable, as they avoid an extra 2 + * parameters, and switching between them is trivial, with the + * ALIGN_FILTERS_256 macro, below. + */ + #define ALIGN_FILTERS_256 1 + +static void convolve_horiz_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x0, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int taps) { + int x, y, k, sum; + const int16_t *filter_x_base = filter_x0; + +#if ALIGN_FILTERS_256 + filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); +#endif + + /* Adjust base pointer address for this source line */ + src -= taps / 2 - 1; + + for (y = 0; y < h; ++y) { + /* Pointer to filter to use */ + const int16_t *filter_x = filter_x0; + + /* Initial phase offset */ + int x0_q4 = (filter_x - filter_x_base) / taps; + int x_q4 = x0_q4; + + for (x = 0; x < w; ++x) { + /* Per-pixel src offset */ + int src_x = (x_q4 - x0_q4) >> 4; + + for (sum = 0, k = 0; k < taps; ++k) { + sum += src[src_x + k] * filter_x[k]; + } + sum += (VP9_FILTER_WEIGHT >> 1); + dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT); + + /* Adjust source and filter to use for the next pixel */ + x_q4 += x_step_q4; + filter_x = filter_x_base + (x_q4 & 0xf) * taps; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_avg_horiz_c(const uint8_t 
*src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x0, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int taps) { + int x, y, k, sum; + const int16_t *filter_x_base = filter_x0; + +#if ALIGN_FILTERS_256 + filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff); +#endif + + /* Adjust base pointer address for this source line */ + src -= taps / 2 - 1; + + for (y = 0; y < h; ++y) { + /* Pointer to filter to use */ + const int16_t *filter_x = filter_x0; + + /* Initial phase offset */ + int x0_q4 = (filter_x - filter_x_base) / taps; + int x_q4 = x0_q4; + + for (x = 0; x < w; ++x) { + /* Per-pixel src offset */ + int src_x = (x_q4 - x0_q4) >> 4; + + for (sum = 0, k = 0; k < taps; ++k) { + sum += src[src_x + k] * filter_x[k]; + } + sum += (VP9_FILTER_WEIGHT >> 1); + dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1; + + /* Adjust source and filter to use for the next pixel */ + x_q4 += x_step_q4; + filter_x = filter_x_base + (x_q4 & 0xf) * taps; + } + src += src_stride; + dst += dst_stride; + } +} + +static void convolve_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y0, int y_step_q4, + int w, int h, int taps) { + int x, y, k, sum; + + const int16_t *filter_y_base = filter_y0; + +#if ALIGN_FILTERS_256 + filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); +#endif + + /* Adjust base pointer address for this source column */ + src -= src_stride * (taps / 2 - 1); + for (x = 0; x < w; ++x) { + /* Pointer to filter to use */ + const int16_t *filter_y = filter_y0; + + /* Initial phase offset */ + int y0_q4 = (filter_y - filter_y_base) / taps; + int y_q4 = y0_q4; + + for (y = 0; y < h; ++y) { + /* Per-pixel src offset */ + int src_y = (y_q4 - y0_q4) >> 4; + + for (sum = 0, k = 0; k < taps; ++k) { + sum += src[(src_y + k) * src_stride] * filter_y[k]; + } + sum += 
(VP9_FILTER_WEIGHT >> 1); + dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT); + + /* Adjust source and filter to use for the next pixel */ + y_q4 += y_step_q4; + filter_y = filter_y_base + (y_q4 & 0xf) * taps; + } + ++src; + ++dst; + } +} + +static void convolve_avg_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y0, int y_step_q4, + int w, int h, int taps) { + int x, y, k, sum; + + const int16_t *filter_y_base = filter_y0; + +#if ALIGN_FILTERS_256 + filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff); +#endif + + /* Adjust base pointer address for this source column */ + src -= src_stride * (taps / 2 - 1); + for (x = 0; x < w; ++x) { + /* Pointer to filter to use */ + const int16_t *filter_y = filter_y0; + + /* Initial phase offset */ + int y0_q4 = (filter_y - filter_y_base) / taps; + int y_q4 = y0_q4; + + for (y = 0; y < h; ++y) { + /* Per-pixel src offset */ + int src_y = (y_q4 - y0_q4) >> 4; + + for (sum = 0, k = 0; k < taps; ++k) { + sum += src[(src_y + k) * src_stride] * filter_y[k]; + } + sum += (VP9_FILTER_WEIGHT >> 1); + dst[y * dst_stride] = + (dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1; + + /* Adjust source and filter to use for the next pixel */ + y_q4 += y_step_q4; + filter_y = filter_y_base + (y_q4 & 0xf) * taps; + } + ++src; + ++dst; + } +} + +static void convolve_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int taps) { + /* Fixed size intermediate buffer places limits on parameters. + * Maximum intermediate_height is 135, for y_step_q4 == 32, + * h == 64, taps == 8. 
+ */ + uint8_t temp[64 * 135]; + int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1; + + assert(w <= 64); + assert(h <= 64); + assert(taps <= 8); + assert(y_step_q4 <= 32); + + if (intermediate_height < h) + intermediate_height = h; + + convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, + temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, intermediate_height, taps); + convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, taps); +} + +static void convolve_avg_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int taps) { + /* Fixed size intermediate buffer places limits on parameters. + * Maximum intermediate_height is 135, for y_step_q4 == 32, + * h == 64, taps == 8. + */ + uint8_t temp[64 * 135]; + int intermediate_height = ((h * y_step_q4) >> 4) + taps - 1; + + assert(w <= 64); + assert(h <= 64); + assert(taps <= 8); + assert(y_step_q4 <= 32); + + if (intermediate_height < h) + intermediate_height = h; + + convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, + temp, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, intermediate_height, taps); + convolve_avg_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, taps); +} + +void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8); +} + +void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_avg_horiz_c(src, src_stride, dst, dst_stride, + 
filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8); +} + +void vp9_convolve8_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8); +} + +void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8); +} + +void vp9_convolve8_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + convolve_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, 8); +} + +void vp9_convolve8_avg_c(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + /* Fixed size intermediate buffer places limits on parameters. */ + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp, 64 * 64); + assert(w <= 64); + assert(h <= 64); + + vp9_convolve8(src, src_stride, + temp, 64, + filter_x, x_step_q4, + filter_y, y_step_q4, + w, h); + vp9_convolve_avg(temp, 64, + dst, dst_stride, + NULL, 0, /* These unused parameter should be removed! */ + NULL, 0, /* These unused parameter should be removed! 
*/ + w, h); +} + +void vp9_convolve_copy(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + if (w == 16 && h == 16) { + vp9_copy_mem16x16(src, src_stride, dst, dst_stride); + } else if (w == 8 && h == 8) { + vp9_copy_mem8x8(src, src_stride, dst, dst_stride); + } else if (w == 8 && h == 4) { + vp9_copy_mem8x4(src, src_stride, dst, dst_stride); + } else { + int r; + + for (r = h; r > 0; --r) { + memcpy(dst, src, w); + src += src_stride; + dst += dst_stride; + } + } +} + +void vp9_convolve_avg(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h) { + int x, y; + + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + dst[x] = (dst[x] + src[x] + 1) >> 1; + } + src += src_stride; + dst += dst_stride; + } +} diff --git a/libvpx/vp9/common/vp9_convolve.h b/libvpx/vp9/common/vp9_convolve.h new file mode 100644 index 000000000..0596080c0 --- /dev/null +++ b/libvpx/vp9/common/vp9_convolve.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2013 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#ifndef VP9_COMMON_CONVOLVE_H_ +#define VP9_COMMON_CONVOLVE_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +// Not a convolution, a block copy conforming to the convolution prototype +void vp9_convolve_copy(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +// Not a convolution, a block average conforming to the convolution prototype +void vp9_convolve_avg(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h); + +struct subpix_fn_table { + const int16_t (*filter_x)[8]; + const int16_t (*filter_y)[8]; +}; + +#endif // VP9_COMMON_CONVOLVE_H_ diff --git a/libvpx/vp9/common/vp9_debugmodes.c b/libvpx/vp9/common/vp9_debugmodes.c new file mode 100644 index 000000000..5841f8091 --- /dev/null +++ b/libvpx/vp9/common/vp9_debugmodes.c @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <stdio.h> + +#include "vp9/common/vp9_blockd.h" + +void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, + int frame, char *file) { + int mi_row; + int mi_col; + int mi_index = 0; + FILE *mvs = fopen(file, "a"); + + // Print out the macroblock Y modes + fprintf(mvs, "SB Types for Frame %d\n", frame); + + for (mi_row = 0; mi_row < rows; mi_row++) { + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%2d ", mi[mi_index].mbmi.sb_type); + + mi_index++; + } + + fprintf(mvs, "\n"); + mi_index += 8; + } + + // Print out the macroblock Y modes + fprintf(mvs, "Mb Modes for Frame %d\n", frame); + mi_index = 0; + for (mi_row = 0; mi_row < rows; mi_row++) { + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%2d ", mi[mi_index].mbmi.mode); + + mi_index++; + } + + fprintf(mvs, "\n"); + mi_index += 8; + } + + fprintf(mvs, "\n"); + + mi_index = 0; + fprintf(mvs, "Mb mv ref for Frame %d\n", frame); + + for (mi_row = 0; mi_row < rows; mi_row++) { + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%2d ", mi[mi_index].mbmi.ref_frame[0]); + + mi_index++; + } + + fprintf(mvs, "\n"); + mi_index += 8; + } + fprintf(mvs, "\n"); + + mi_index = 0; + fprintf(mvs, "Mb mv ref for Frame %d\n", frame); + + for (mi_row = 0; mi_row < rows; mi_row++) { + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row, + mi[mi_index].mbmi.mv[0].as_mv.col); + + mi_index++; + } + + fprintf(mvs, "\n"); + mi_index += 8; + } + + fprintf(mvs, "\n"); + + /* print out the macroblock txform sizes */ + mi_index = 0; + fprintf(mvs, "TXFM size for Frame %d\n", frame); + + for (mi_row = 0; mi_row < rows; mi_row++) { + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%2d ", mi[mi_index].mbmi.txfm_size); + + mi_index++; + } + + mi_index += 8; + fprintf(mvs, "\n"); + } + + fprintf(mvs, "\n"); + + /* print out the macroblock UV modes */ + mi_index = 0; + fprintf(mvs, "UV Modes for Frame %d\n", 
frame); + + for (mi_row = 0; mi_row < rows; mi_row++) { + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%2d ", mi[mi_index].mbmi.uv_mode); + + mi_index++; + } + + mi_index += 8; + fprintf(mvs, "\n"); + } + + fprintf(mvs, "\n"); + + /* print out the macroblock mvs */ + mi_index = 0; + fprintf(mvs, "MVs for Frame %d\n", frame); + + for (mi_row = 0; mi_row < rows; mi_row++) { + for (mi_col = 0; mi_col < cols; mi_col++) { + fprintf(mvs, "%5d:%-5d", mi[mi_index].mbmi.mv[0].as_mv.row / 2, + mi[mi_index].mbmi.mv[0].as_mv.col / 2); + + mi_index++; + } + + mi_index += 8; + fprintf(mvs, "\n"); + } + + fprintf(mvs, "\n"); + + fclose(mvs); +} diff --git a/libvpx/vp9/common/vp9_default_coef_probs.h b/libvpx/vp9/common/vp9_default_coef_probs.h new file mode 100644 index 000000000..1954093f5 --- /dev/null +++ b/libvpx/vp9/common/vp9_default_coef_probs.h @@ -0,0 +1,1384 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+*/ + + +/*Generated file, included by vp9_entropy.c*/ + +#if CONFIG_BALANCED_COEFTREE +static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = { + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 6, 213, 178 }, + { 26, 113, 132 }, + { 34, 17, 68 } + }, { /* Coeff Band 1 */ + { 66, 96, 178 }, + { 63, 96, 174 }, + { 67, 54, 154 }, + { 62, 28, 126 }, + { 48, 9, 84 }, + { 20, 1, 32 } + }, { /* Coeff Band 2 */ + { 64, 144, 206 }, + { 70, 99, 191 }, + { 69, 36, 152 }, + { 55, 9, 106 }, + { 35, 1, 60 }, + { 14, 1, 22 } + }, { /* Coeff Band 3 */ + { 82, 154, 222 }, + { 83, 112, 205 }, + { 81, 31, 164 }, + { 62, 7, 118 }, + { 42, 1, 74 }, + { 18, 1, 30 } + }, { /* Coeff Band 4 */ + { 52, 179, 233 }, + { 64, 132, 214 }, + { 73, 36, 170 }, + { 59, 8, 116 }, + { 38, 1, 65 }, + { 15, 1, 26 } + }, { /* Coeff Band 5 */ + { 29, 175, 238 }, + { 26, 169, 223 }, + { 41, 80, 182 }, + { 39, 32, 127 }, + { 26, 10, 69 }, + { 11, 2, 28 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 21, 226, 234 }, + { 52, 182, 212 }, + { 80, 112, 177 } + }, { /* Coeff Band 1 */ + { 111, 164, 243 }, + { 88, 152, 231 }, + { 90, 43, 186 }, + { 70, 12, 132 }, + { 44, 2, 76 }, + { 19, 1, 33 } + }, { /* Coeff Band 2 */ + { 96, 185, 246 }, + { 99, 127, 231 }, + { 88, 21, 177 }, + { 64, 5, 122 }, + { 38, 1, 69 }, + { 18, 1, 30 } + }, { /* Coeff Band 3 */ + { 84, 206, 249 }, + { 94, 147, 237 }, + { 95, 33, 187 }, + { 71, 8, 131 }, + { 47, 1, 83 }, + { 26, 1, 44 } + }, { /* Coeff Band 4 */ + { 38, 221, 252 }, + { 58, 177, 241 }, + { 78, 46, 188 }, + { 59, 9, 122 }, + { 34, 1, 66 }, + { 18, 1, 34 } + }, { /* Coeff Band 5 */ + { 21, 216, 253 }, + { 21, 206, 244 }, + { 42, 93, 200 }, + { 43, 41, 146 }, + { 36, 13, 93 }, + { 31, 1, 55 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 7, 213, 219 }, + { 23, 139, 182 }, + { 38, 60, 125 } + }, { /* Coeff Band 1 */ + { 69, 156, 220 }, + { 52, 178, 213 }, + { 69, 111, 190 }, + { 69, 58, 155 }, + { 58, 21, 
104 }, + { 39, 7, 60 } + }, { /* Coeff Band 2 */ + { 68, 189, 228 }, + { 70, 158, 221 }, + { 83, 64, 189 }, + { 73, 18, 141 }, + { 48, 4, 88 }, + { 23, 1, 41 } + }, { /* Coeff Band 3 */ + { 99, 194, 236 }, + { 91, 138, 224 }, + { 91, 53, 189 }, + { 74, 20, 142 }, + { 48, 6, 90 }, + { 22, 1, 41 } + }, { /* Coeff Band 4 */ + { 52, 203, 244 }, + { 60, 168, 231 }, + { 75, 62, 189 }, + { 61, 18, 132 }, + { 38, 4, 72 }, + { 17, 1, 39 } + }, { /* Coeff Band 5 */ + { 33, 192, 247 }, + { 31, 185, 234 }, + { 46, 85, 185 }, + { 39, 35, 132 }, + { 28, 15, 80 }, + { 13, 5, 38 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 5, 247, 246 }, + { 28, 209, 228 }, + { 65, 137, 203 } + }, { /* Coeff Band 1 */ + { 69, 208, 250 }, + { 54, 207, 242 }, + { 81, 92, 204 }, + { 70, 54, 153 }, + { 58, 40, 108 }, + { 58, 35, 71 } + }, { /* Coeff Band 2 */ + { 65, 215, 250 }, + { 72, 185, 239 }, + { 92, 50, 197 }, + { 75, 14, 147 }, + { 49, 2, 99 }, + { 26, 1, 53 } + }, { /* Coeff Band 3 */ + { 70, 220, 251 }, + { 76, 186, 241 }, + { 90, 65, 198 }, + { 75, 26, 151 }, + { 58, 12, 112 }, + { 34, 6, 49 } + }, { /* Coeff Band 4 */ + { 34, 224, 253 }, + { 44, 204, 245 }, + { 69, 85, 204 }, + { 64, 31, 150 }, + { 44, 2, 78 }, + { 1, 1, 128 } + }, { /* Coeff Band 5 */ + { 25, 216, 253 }, + { 21, 215, 248 }, + { 47, 108, 214 }, + { 47, 48, 160 }, + { 26, 20, 90 }, + { 64, 171, 128 } + } + } + } +}; +static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = { + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 9, 203, 199 }, + { 26, 92, 128 }, + { 28, 11, 55 } + }, { /* Coeff Band 1 */ + { 99, 54, 160 }, + { 78, 99, 155 }, + { 80, 44, 138 }, + { 71, 17, 115 }, + { 51, 5, 80 }, + { 27, 1, 40 } + }, { /* Coeff Band 2 */ + { 135, 81, 190 }, + { 113, 61, 182 }, + { 93, 16, 153 }, + { 70, 4, 115 }, + { 41, 1, 68 }, + { 16, 1, 27 } + }, { /* Coeff Band 3 */ + { 155, 103, 214 }, + { 129, 48, 199 }, + { 95, 10, 159 }, + { 63, 1, 110 }, + { 32, 1, 58 }, + { 12, 1, 21 } + }, { /* 
Coeff Band 4 */ + { 163, 149, 231 }, + { 137, 69, 213 }, + { 95, 11, 164 }, + { 62, 3, 108 }, + { 32, 1, 57 }, + { 13, 1, 22 } + }, { /* Coeff Band 5 */ + { 136, 189, 239 }, + { 123, 102, 223 }, + { 97, 19, 170 }, + { 66, 4, 111 }, + { 38, 1, 60 }, + { 18, 1, 26 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 24, 226, 244 }, + { 54, 178, 211 }, + { 80, 74, 152 } + }, { /* Coeff Band 1 */ + { 145, 153, 236 }, + { 101, 163, 223 }, + { 108, 50, 187 }, + { 90, 22, 145 }, + { 66, 8, 97 }, + { 42, 4, 50 } + }, { /* Coeff Band 2 */ + { 150, 159, 238 }, + { 128, 90, 218 }, + { 94, 9, 163 }, + { 64, 3, 110 }, + { 34, 1, 61 }, + { 13, 1, 24 } + }, { /* Coeff Band 3 */ + { 151, 162, 242 }, + { 135, 80, 222 }, + { 93, 9, 166 }, + { 61, 3, 111 }, + { 31, 1, 59 }, + { 12, 1, 22 } + }, { /* Coeff Band 4 */ + { 161, 170, 245 }, + { 140, 84, 228 }, + { 99, 8, 174 }, + { 64, 1, 116 }, + { 34, 1, 63 }, + { 14, 1, 26 } + }, { /* Coeff Band 5 */ + { 138, 197, 246 }, + { 127, 109, 233 }, + { 100, 16, 179 }, + { 66, 3, 119 }, + { 37, 1, 66 }, + { 16, 1, 30 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 6, 216, 212 }, + { 25, 134, 171 }, + { 43, 48, 118 } + }, { /* Coeff Band 1 */ + { 93, 112, 209 }, + { 66, 159, 206 }, + { 82, 78, 184 }, + { 75, 28, 148 }, + { 46, 4, 82 }, + { 18, 1, 28 } + }, { /* Coeff Band 2 */ + { 108, 148, 220 }, + { 90, 130, 216 }, + { 92, 40, 186 }, + { 73, 10, 135 }, + { 46, 1, 79 }, + { 20, 1, 35 } + }, { /* Coeff Band 3 */ + { 125, 173, 232 }, + { 109, 117, 223 }, + { 97, 31, 183 }, + { 71, 7, 127 }, + { 44, 1, 76 }, + { 21, 1, 36 } + }, { /* Coeff Band 4 */ + { 133, 195, 236 }, + { 112, 121, 224 }, + { 97, 23, 178 }, + { 69, 3, 122 }, + { 42, 1, 72 }, + { 19, 1, 34 } + }, { /* Coeff Band 5 */ + { 132, 180, 238 }, + { 119, 102, 225 }, + { 101, 18, 179 }, + { 71, 3, 124 }, + { 42, 1, 70 }, + { 17, 1, 28 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 5, 242, 250 }, + { 26, 198, 226 }, + { 58, 98, 168 } + }, { /* Coeff Band 
1 */ + { 82, 201, 246 }, + { 50, 219, 237 }, + { 94, 107, 205 }, + { 89, 61, 167 }, + { 77, 31, 131 }, + { 57, 14, 91 } + }, { /* Coeff Band 2 */ + { 99, 202, 247 }, + { 96, 165, 234 }, + { 100, 31, 190 }, + { 72, 8, 131 }, + { 41, 1, 72 }, + { 14, 1, 24 } + }, { /* Coeff Band 3 */ + { 108, 204, 248 }, + { 107, 156, 235 }, + { 103, 27, 186 }, + { 71, 4, 124 }, + { 39, 1, 66 }, + { 14, 1, 19 } + }, { /* Coeff Band 4 */ + { 120, 211, 248 }, + { 118, 149, 234 }, + { 107, 19, 182 }, + { 72, 3, 126 }, + { 40, 1, 69 }, + { 16, 1, 24 } + }, { /* Coeff Band 5 */ + { 127, 199, 245 }, + { 122, 125, 232 }, + { 112, 20, 186 }, + { 82, 3, 136 }, + { 55, 1, 88 }, + { 10, 1, 38 } + } + } + } +}; +static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = { + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 25, 9, 101 }, + { 25, 2, 67 }, + { 15, 1, 28 } + }, { /* Coeff Band 1 */ + { 67, 30, 118 }, + { 61, 56, 116 }, + { 60, 31, 105 }, + { 52, 11, 85 }, + { 34, 2, 54 }, + { 14, 1, 22 } + }, { /* Coeff Band 2 */ + { 107, 58, 149 }, + { 92, 53, 147 }, + { 78, 14, 123 }, + { 56, 3, 87 }, + { 35, 1, 56 }, + { 17, 1, 27 } + }, { /* Coeff Band 3 */ + { 142, 61, 171 }, + { 111, 30, 162 }, + { 80, 4, 128 }, + { 53, 1, 87 }, + { 31, 1, 52 }, + { 14, 1, 24 } + }, { /* Coeff Band 4 */ + { 171, 73, 200 }, + { 129, 28, 184 }, + { 86, 3, 140 }, + { 54, 1, 90 }, + { 28, 1, 49 }, + { 12, 1, 21 } + }, { /* Coeff Band 5 */ + { 193, 129, 227 }, + { 148, 28, 200 }, + { 90, 2, 144 }, + { 53, 1, 90 }, + { 28, 1, 50 }, + { 13, 1, 22 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 60, 7, 234 }, + { 64, 4, 184 }, + { 56, 1, 104 } + }, { /* Coeff Band 1 */ + { 150, 111, 210 }, + { 87, 185, 202 }, + { 101, 81, 177 }, + { 90, 34, 142 }, + { 67, 11, 95 }, + { 38, 2, 51 } + }, { /* Coeff Band 2 */ + { 153, 139, 218 }, + { 120, 72, 195 }, + { 90, 11, 147 }, + { 63, 3, 101 }, + { 39, 1, 61 }, + { 20, 1, 33 } + }, { /* Coeff Band 3 */ + { 171, 132, 223 }, + { 131, 56, 200 }, + { 
92, 6, 147 }, + { 58, 1, 95 }, + { 32, 1, 52 }, + { 14, 1, 23 } + }, { /* Coeff Band 4 */ + { 183, 137, 227 }, + { 139, 48, 204 }, + { 91, 3, 148 }, + { 55, 1, 91 }, + { 28, 1, 47 }, + { 13, 1, 21 } + }, { /* Coeff Band 5 */ + { 198, 149, 234 }, + { 153, 32, 208 }, + { 95, 2, 148 }, + { 55, 1, 90 }, + { 30, 1, 51 }, + { 16, 1, 25 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 7, 209, 217 }, + { 31, 106, 151 }, + { 40, 21, 86 } + }, { /* Coeff Band 1 */ + { 101, 71, 184 }, + { 74, 131, 177 }, + { 88, 50, 158 }, + { 78, 16, 129 }, + { 51, 2, 82 }, + { 18, 1, 29 } + }, { /* Coeff Band 2 */ + { 116, 115, 199 }, + { 102, 88, 191 }, + { 94, 22, 160 }, + { 74, 6, 122 }, + { 47, 1, 77 }, + { 18, 1, 30 } + }, { /* Coeff Band 3 */ + { 157, 124, 210 }, + { 130, 53, 201 }, + { 102, 10, 165 }, + { 73, 1, 120 }, + { 42, 1, 69 }, + { 16, 1, 27 } + }, { /* Coeff Band 4 */ + { 174, 147, 225 }, + { 134, 67, 212 }, + { 100, 10, 168 }, + { 66, 1, 111 }, + { 36, 1, 60 }, + { 16, 1, 27 } + }, { /* Coeff Band 5 */ + { 185, 165, 232 }, + { 147, 56, 214 }, + { 105, 5, 165 }, + { 66, 1, 108 }, + { 35, 1, 59 }, + { 16, 1, 27 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 3, 232, 245 }, + { 18, 162, 210 }, + { 38, 64, 131 } + }, { /* Coeff Band 1 */ + { 84, 187, 239 }, + { 35, 231, 231 }, + { 82, 150, 209 }, + { 87, 97, 181 }, + { 81, 64, 151 }, + { 67, 60, 119 } + }, { /* Coeff Band 2 */ + { 107, 185, 239 }, + { 100, 149, 224 }, + { 107, 34, 185 }, + { 83, 12, 141 }, + { 49, 4, 92 }, + { 21, 1, 40 } + }, { /* Coeff Band 3 */ + { 125, 184, 243 }, + { 121, 127, 228 }, + { 113, 25, 185 }, + { 82, 6, 134 }, + { 48, 1, 82 }, + { 26, 1, 38 } + }, { /* Coeff Band 4 */ + { 143, 185, 245 }, + { 133, 115, 231 }, + { 114, 14, 184 }, + { 77, 3, 126 }, + { 43, 1, 68 }, + { 34, 1, 40 } + }, { /* Coeff Band 5 */ + { 170, 194, 241 }, + { 151, 80, 226 }, + { 118, 9, 180 }, + { 81, 1, 130 }, + { 51, 1, 78 }, + { 18, 1, 49 } + } + } + } +}; +static const 
vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = { + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 29, 42, 137 }, + { 26, 3, 60 }, + { 13, 1, 23 } + }, { /* Coeff Band 1 */ + { 69, 36, 122 }, + { 63, 57, 123 }, + { 60, 33, 112 }, + { 52, 11, 90 }, + { 32, 2, 52 }, + { 10, 1, 15 } + }, { /* Coeff Band 2 */ + { 107, 55, 143 }, + { 86, 69, 143 }, + { 74, 24, 116 }, + { 52, 5, 78 }, + { 29, 1, 44 }, + { 12, 1, 18 } + }, { /* Coeff Band 3 */ + { 137, 71, 160 }, + { 107, 34, 152 }, + { 73, 6, 114 }, + { 44, 1, 69 }, + { 25, 1, 40 }, + { 12, 1, 18 } + }, { /* Coeff Band 4 */ + { 165, 70, 174 }, + { 118, 24, 159 }, + { 74, 3, 117 }, + { 45, 1, 73 }, + { 26, 1, 43 }, + { 12, 1, 19 } + }, { /* Coeff Band 5 */ + { 220, 93, 223 }, + { 153, 10, 187 }, + { 86, 2, 131 }, + { 49, 1, 79 }, + { 26, 1, 43 }, + { 12, 1, 20 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 30, 58, 227 }, + { 35, 10, 172 }, + { 24, 23, 112 } + }, { /* Coeff Band 1 */ + { 117, 145, 219 }, + { 51, 221, 216 }, + { 75, 169, 196 }, + { 88, 96, 165 }, + { 77, 43, 117 }, + { 53, 18, 60 } + }, { /* Coeff Band 2 */ + { 128, 176, 225 }, + { 108, 114, 202 }, + { 92, 19, 152 }, + { 65, 4, 103 }, + { 38, 1, 61 }, + { 19, 1, 30 } + }, { /* Coeff Band 3 */ + { 146, 184, 228 }, + { 122, 95, 205 }, + { 92, 11, 149 }, + { 62, 1, 98 }, + { 35, 1, 57 }, + { 17, 1, 26 } + }, { /* Coeff Band 4 */ + { 165, 192, 230 }, + { 132, 81, 206 }, + { 93, 6, 147 }, + { 58, 1, 94 }, + { 32, 1, 52 }, + { 15, 1, 24 } + }, { /* Coeff Band 5 */ + { 204, 223, 234 }, + { 156, 49, 204 }, + { 97, 3, 145 }, + { 59, 1, 92 }, + { 33, 1, 52 }, + { 15, 1, 24 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 7, 184, 200 }, + { 25, 67, 113 }, + { 30, 9, 59 } + }, { /* Coeff Band 1 */ + { 92, 42, 158 }, + { 65, 121, 159 }, + { 77, 56, 146 }, + { 70, 22, 120 }, + { 47, 4, 76 }, + { 18, 1, 26 } + }, { /* Coeff Band 2 */ + { 113, 81, 177 }, + { 96, 75, 167 }, + { 84, 24, 136 }, + { 63, 8, 100 
}, + { 37, 1, 58 }, + { 13, 1, 19 } + }, { /* Coeff Band 3 */ + { 147, 85, 194 }, + { 119, 36, 178 }, + { 88, 8, 139 }, + { 59, 1, 93 }, + { 31, 1, 49 }, + { 10, 1, 18 } + }, { /* Coeff Band 4 */ + { 169, 108, 210 }, + { 131, 41, 191 }, + { 92, 5, 144 }, + { 56, 1, 88 }, + { 29, 1, 47 }, + { 14, 1, 22 } + }, { /* Coeff Band 5 */ + { 210, 106, 223 }, + { 148, 14, 192 }, + { 89, 2, 138 }, + { 52, 1, 84 }, + { 29, 1, 47 }, + { 14, 1, 23 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 3, 207, 245 }, + { 12, 102, 213 }, + { 18, 33, 144 } + }, { /* Coeff Band 1 */ + { 85, 205, 245 }, + { 18, 249, 242 }, + { 59, 221, 229 }, + { 91, 166, 213 }, + { 88, 117, 183 }, + { 70, 95, 149 } + }, { /* Coeff Band 2 */ + { 114, 193, 241 }, + { 104, 155, 221 }, + { 100, 33, 181 }, + { 78, 10, 132 }, + { 43, 2, 75 }, + { 15, 1, 48 } + }, { /* Coeff Band 3 */ + { 118, 198, 244 }, + { 117, 142, 224 }, + { 111, 25, 179 }, + { 83, 4, 134 }, + { 57, 1, 84 }, + { 1, 1, 1 } + }, { /* Coeff Band 4 */ + { 144, 201, 248 }, + { 136, 130, 234 }, + { 124, 12, 188 }, + { 83, 1, 130 }, + { 61, 1, 66 }, + { 64, 171, 128 } + }, { /* Coeff Band 5 */ + { 174, 227, 250 }, + { 165, 118, 242 }, + { 132, 21, 197 }, + { 84, 3, 134 }, + { 70, 1, 69 }, + { 1, 1, 1 } + } + } + } +}; +#else +static const vp9_coeff_probs_model default_coef_probs_4x4[BLOCK_TYPES] = { + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 195, 29, 183 }, + { 84, 49, 136 }, + { 8, 42, 71 } + }, { /* Coeff Band 1 */ + { 31, 107, 169 }, + { 35, 99, 159 }, + { 17, 82, 140 }, + { 8, 66, 114 }, + { 2, 44, 76 }, + { 1, 19, 32 } + }, { /* Coeff Band 2 */ + { 40, 132, 201 }, + { 29, 114, 187 }, + { 13, 91, 157 }, + { 7, 75, 127 }, + { 3, 58, 95 }, + { 1, 28, 47 } + }, { /* Coeff Band 3 */ + { 69, 142, 221 }, + { 42, 122, 201 }, + { 15, 91, 159 }, + { 6, 67, 121 }, + { 1, 42, 77 }, + { 1, 17, 31 } + }, { /* Coeff Band 4 */ + { 102, 148, 228 }, + { 67, 117, 204 }, + { 17, 82, 154 }, + { 6, 59, 114 }, + { 2, 39, 75 }, + { 1, 15, 
29 } + }, { /* Coeff Band 5 */ + { 156, 57, 233 }, + { 119, 57, 212 }, + { 58, 48, 163 }, + { 29, 40, 124 }, + { 12, 30, 81 }, + { 3, 12, 31 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 191, 107, 226 }, + { 124, 117, 204 }, + { 25, 99, 155 } + }, { /* Coeff Band 1 */ + { 29, 148, 210 }, + { 37, 126, 194 }, + { 8, 93, 157 }, + { 2, 68, 118 }, + { 1, 39, 69 }, + { 1, 17, 33 } + }, { /* Coeff Band 2 */ + { 41, 151, 213 }, + { 27, 123, 193 }, + { 3, 82, 144 }, + { 1, 58, 105 }, + { 1, 32, 60 }, + { 1, 13, 26 } + }, { /* Coeff Band 3 */ + { 59, 159, 220 }, + { 23, 126, 198 }, + { 4, 88, 151 }, + { 1, 66, 114 }, + { 1, 38, 71 }, + { 1, 18, 34 } + }, { /* Coeff Band 4 */ + { 114, 136, 232 }, + { 51, 114, 207 }, + { 11, 83, 155 }, + { 3, 56, 105 }, + { 1, 33, 65 }, + { 1, 17, 34 } + }, { /* Coeff Band 5 */ + { 149, 65, 234 }, + { 121, 57, 215 }, + { 61, 49, 166 }, + { 28, 36, 114 }, + { 12, 25, 76 }, + { 3, 16, 42 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 214, 49, 220 }, + { 132, 63, 188 }, + { 42, 65, 137 } + }, { /* Coeff Band 1 */ + { 85, 137, 221 }, + { 104, 131, 216 }, + { 49, 111, 192 }, + { 21, 87, 155 }, + { 2, 49, 87 }, + { 1, 16, 28 } + }, { /* Coeff Band 2 */ + { 89, 163, 230 }, + { 90, 137, 220 }, + { 29, 100, 183 }, + { 10, 70, 135 }, + { 2, 42, 81 }, + { 1, 17, 33 } + }, { /* Coeff Band 3 */ + { 108, 167, 237 }, + { 55, 133, 222 }, + { 15, 97, 179 }, + { 4, 72, 135 }, + { 1, 45, 85 }, + { 1, 19, 38 } + }, { /* Coeff Band 4 */ + { 124, 146, 240 }, + { 66, 124, 224 }, + { 17, 88, 175 }, + { 4, 58, 122 }, + { 1, 36, 75 }, + { 1, 18, 37 } + }, { /* Coeff Band 5 */ + { 141, 79, 241 }, + { 126, 70, 227 }, + { 66, 58, 182 }, + { 30, 44, 136 }, + { 12, 34, 96 }, + { 2, 20, 47 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 229, 99, 249 }, + { 143, 111, 235 }, + { 46, 109, 192 } + }, { /* Coeff Band 1 */ + { 82, 158, 236 }, + { 94, 146, 224 }, + { 25, 117, 191 }, + { 9, 87, 149 }, + { 3, 56, 99 }, + { 1, 33, 57 } + }, { 
/* Coeff Band 2 */ + { 83, 167, 237 }, + { 68, 145, 222 }, + { 10, 103, 177 }, + { 2, 72, 131 }, + { 1, 41, 79 }, + { 1, 20, 39 } + }, { /* Coeff Band 3 */ + { 99, 167, 239 }, + { 47, 141, 224 }, + { 10, 104, 178 }, + { 2, 73, 133 }, + { 1, 44, 85 }, + { 1, 22, 47 } + }, { /* Coeff Band 4 */ + { 127, 145, 243 }, + { 71, 129, 228 }, + { 17, 93, 177 }, + { 3, 61, 124 }, + { 1, 41, 84 }, + { 1, 21, 52 } + }, { /* Coeff Band 5 */ + { 157, 78, 244 }, + { 140, 72, 231 }, + { 69, 58, 184 }, + { 31, 44, 137 }, + { 14, 38, 105 }, + { 8, 23, 61 } + } + } + } +}; +static const vp9_coeff_probs_model default_coef_probs_8x8[BLOCK_TYPES] = { + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 125, 34, 187 }, + { 52, 41, 133 }, + { 6, 31, 56 } + }, { /* Coeff Band 1 */ + { 37, 109, 153 }, + { 51, 102, 147 }, + { 23, 87, 128 }, + { 8, 67, 101 }, + { 1, 41, 63 }, + { 1, 19, 29 } + }, { /* Coeff Band 2 */ + { 31, 154, 185 }, + { 17, 127, 175 }, + { 6, 96, 145 }, + { 2, 73, 114 }, + { 1, 51, 82 }, + { 1, 28, 45 } + }, { /* Coeff Band 3 */ + { 23, 163, 200 }, + { 10, 131, 185 }, + { 2, 93, 148 }, + { 1, 67, 111 }, + { 1, 41, 69 }, + { 1, 14, 24 } + }, { /* Coeff Band 4 */ + { 29, 176, 217 }, + { 12, 145, 201 }, + { 3, 101, 156 }, + { 1, 69, 111 }, + { 1, 39, 63 }, + { 1, 14, 23 } + }, { /* Coeff Band 5 */ + { 57, 192, 233 }, + { 25, 154, 215 }, + { 6, 109, 167 }, + { 3, 78, 118 }, + { 1, 48, 69 }, + { 1, 21, 29 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 202, 105, 245 }, + { 108, 106, 216 }, + { 18, 90, 144 } + }, { /* Coeff Band 1 */ + { 33, 172, 219 }, + { 64, 149, 206 }, + { 14, 117, 177 }, + { 5, 90, 141 }, + { 2, 61, 95 }, + { 1, 37, 57 } + }, { /* Coeff Band 2 */ + { 33, 179, 220 }, + { 11, 140, 198 }, + { 1, 89, 148 }, + { 1, 60, 104 }, + { 1, 33, 57 }, + { 1, 12, 21 } + }, { /* Coeff Band 3 */ + { 30, 181, 221 }, + { 8, 141, 198 }, + { 1, 87, 145 }, + { 1, 58, 100 }, + { 1, 31, 55 }, + { 1, 12, 20 } + }, { /* Coeff Band 4 */ + { 32, 186, 224 }, + { 7, 142, 
198 }, + { 1, 86, 143 }, + { 1, 58, 100 }, + { 1, 31, 55 }, + { 1, 12, 22 } + }, { /* Coeff Band 5 */ + { 57, 192, 227 }, + { 20, 143, 204 }, + { 3, 96, 154 }, + { 1, 68, 112 }, + { 1, 42, 69 }, + { 1, 19, 32 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 212, 35, 215 }, + { 113, 47, 169 }, + { 29, 48, 105 } + }, { /* Coeff Band 1 */ + { 74, 129, 203 }, + { 106, 120, 203 }, + { 49, 107, 178 }, + { 19, 84, 144 }, + { 4, 50, 84 }, + { 1, 15, 25 } + }, { /* Coeff Band 2 */ + { 71, 172, 217 }, + { 44, 141, 209 }, + { 15, 102, 173 }, + { 6, 76, 133 }, + { 2, 51, 89 }, + { 1, 24, 42 } + }, { /* Coeff Band 3 */ + { 64, 185, 231 }, + { 31, 148, 216 }, + { 8, 103, 175 }, + { 3, 74, 131 }, + { 1, 46, 81 }, + { 1, 18, 30 } + }, { /* Coeff Band 4 */ + { 65, 196, 235 }, + { 25, 157, 221 }, + { 5, 105, 174 }, + { 1, 67, 120 }, + { 1, 38, 69 }, + { 1, 15, 30 } + }, { /* Coeff Band 5 */ + { 65, 204, 238 }, + { 30, 156, 224 }, + { 7, 107, 177 }, + { 2, 70, 124 }, + { 1, 42, 73 }, + { 1, 18, 34 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 225, 86, 251 }, + { 144, 104, 235 }, + { 42, 99, 181 } + }, { /* Coeff Band 1 */ + { 85, 175, 239 }, + { 112, 165, 229 }, + { 29, 136, 200 }, + { 12, 103, 162 }, + { 6, 77, 123 }, + { 2, 53, 84 } + }, { /* Coeff Band 2 */ + { 75, 183, 239 }, + { 30, 155, 221 }, + { 3, 106, 171 }, + { 1, 74, 128 }, + { 1, 44, 76 }, + { 1, 17, 28 } + }, { /* Coeff Band 3 */ + { 73, 185, 240 }, + { 27, 159, 222 }, + { 2, 107, 172 }, + { 1, 75, 127 }, + { 1, 42, 73 }, + { 1, 17, 29 } + }, { /* Coeff Band 4 */ + { 62, 190, 238 }, + { 21, 159, 222 }, + { 2, 107, 172 }, + { 1, 72, 122 }, + { 1, 40, 71 }, + { 1, 18, 32 } + }, { /* Coeff Band 5 */ + { 61, 199, 240 }, + { 27, 161, 226 }, + { 4, 113, 180 }, + { 1, 76, 129 }, + { 1, 46, 80 }, + { 1, 23, 41 } + } + } + } +}; +static const vp9_coeff_probs_model default_coef_probs_16x16[BLOCK_TYPES] = { + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 7, 27, 153 }, + { 5, 
30, 95 }, + { 1, 16, 30 } + }, { /* Coeff Band 1 */ + { 50, 75, 127 }, + { 57, 75, 124 }, + { 27, 67, 108 }, + { 10, 54, 86 }, + { 1, 33, 52 }, + { 1, 12, 18 } + }, { /* Coeff Band 2 */ + { 43, 125, 151 }, + { 26, 108, 148 }, + { 7, 83, 122 }, + { 2, 59, 89 }, + { 1, 38, 60 }, + { 1, 17, 27 } + }, { /* Coeff Band 3 */ + { 23, 144, 163 }, + { 13, 112, 154 }, + { 2, 75, 117 }, + { 1, 50, 81 }, + { 1, 31, 51 }, + { 1, 14, 23 } + }, { /* Coeff Band 4 */ + { 18, 162, 185 }, + { 6, 123, 171 }, + { 1, 78, 125 }, + { 1, 51, 86 }, + { 1, 31, 54 }, + { 1, 14, 23 } + }, { /* Coeff Band 5 */ + { 15, 199, 227 }, + { 3, 150, 204 }, + { 1, 91, 146 }, + { 1, 55, 95 }, + { 1, 30, 53 }, + { 1, 11, 20 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 19, 55, 240 }, + { 19, 59, 196 }, + { 3, 52, 105 } + }, { /* Coeff Band 1 */ + { 41, 166, 207 }, + { 104, 153, 199 }, + { 31, 123, 181 }, + { 14, 101, 152 }, + { 5, 72, 106 }, + { 1, 36, 52 } + }, { /* Coeff Band 2 */ + { 35, 176, 211 }, + { 12, 131, 190 }, + { 2, 88, 144 }, + { 1, 60, 101 }, + { 1, 36, 60 }, + { 1, 16, 28 } + }, { /* Coeff Band 3 */ + { 28, 183, 213 }, + { 8, 134, 191 }, + { 1, 86, 142 }, + { 1, 56, 96 }, + { 1, 30, 53 }, + { 1, 12, 20 } + }, { /* Coeff Band 4 */ + { 20, 190, 215 }, + { 4, 135, 192 }, + { 1, 84, 139 }, + { 1, 53, 91 }, + { 1, 28, 49 }, + { 1, 11, 20 } + }, { /* Coeff Band 5 */ + { 13, 196, 216 }, + { 2, 137, 192 }, + { 1, 86, 143 }, + { 1, 57, 99 }, + { 1, 32, 56 }, + { 1, 13, 24 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 211, 29, 217 }, + { 96, 47, 156 }, + { 22, 43, 87 } + }, { /* Coeff Band 1 */ + { 78, 120, 193 }, + { 111, 116, 186 }, + { 46, 102, 164 }, + { 15, 80, 128 }, + { 2, 49, 76 }, + { 1, 18, 28 } + }, { /* Coeff Band 2 */ + { 71, 161, 203 }, + { 42, 132, 192 }, + { 10, 98, 150 }, + { 3, 69, 109 }, + { 1, 44, 70 }, + { 1, 18, 29 } + }, { /* Coeff Band 3 */ + { 57, 186, 211 }, + { 30, 140, 196 }, + { 4, 93, 146 }, + { 1, 62, 102 }, + { 1, 38, 65 }, + { 1, 
16, 27 } + }, { /* Coeff Band 4 */ + { 47, 199, 217 }, + { 14, 145, 196 }, + { 1, 88, 142 }, + { 1, 57, 98 }, + { 1, 36, 62 }, + { 1, 15, 26 } + }, { /* Coeff Band 5 */ + { 26, 219, 229 }, + { 5, 155, 207 }, + { 1, 94, 151 }, + { 1, 60, 104 }, + { 1, 36, 62 }, + { 1, 16, 28 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 233, 29, 248 }, + { 146, 47, 220 }, + { 43, 52, 140 } + }, { /* Coeff Band 1 */ + { 100, 163, 232 }, + { 179, 161, 222 }, + { 63, 142, 204 }, + { 37, 113, 174 }, + { 26, 89, 137 }, + { 18, 68, 97 } + }, { /* Coeff Band 2 */ + { 85, 181, 230 }, + { 32, 146, 209 }, + { 7, 100, 164 }, + { 3, 71, 121 }, + { 1, 45, 77 }, + { 1, 18, 30 } + }, { /* Coeff Band 3 */ + { 65, 187, 230 }, + { 20, 148, 207 }, + { 2, 97, 159 }, + { 1, 68, 116 }, + { 1, 40, 70 }, + { 1, 14, 29 } + }, { /* Coeff Band 4 */ + { 40, 194, 227 }, + { 8, 147, 204 }, + { 1, 94, 155 }, + { 1, 65, 112 }, + { 1, 39, 66 }, + { 1, 14, 26 } + }, { /* Coeff Band 5 */ + { 16, 208, 228 }, + { 3, 151, 207 }, + { 1, 98, 160 }, + { 1, 67, 117 }, + { 1, 41, 74 }, + { 1, 17, 31 } + } + } + } +}; +static const vp9_coeff_probs_model default_coef_probs_32x32[BLOCK_TYPES] = { + { /* block Type 0 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 17, 38, 140 }, + { 7, 34, 80 }, + { 1, 17, 29 } + }, { /* Coeff Band 1 */ + { 37, 75, 128 }, + { 41, 76, 128 }, + { 26, 66, 116 }, + { 12, 52, 94 }, + { 2, 32, 55 }, + { 1, 10, 16 } + }, { /* Coeff Band 2 */ + { 50, 127, 154 }, + { 37, 109, 152 }, + { 16, 82, 121 }, + { 5, 59, 85 }, + { 1, 35, 54 }, + { 1, 13, 20 } + }, { /* Coeff Band 3 */ + { 40, 142, 167 }, + { 17, 110, 157 }, + { 2, 71, 112 }, + { 1, 44, 72 }, + { 1, 27, 45 }, + { 1, 11, 17 } + }, { /* Coeff Band 4 */ + { 30, 175, 188 }, + { 9, 124, 169 }, + { 1, 74, 116 }, + { 1, 48, 78 }, + { 1, 30, 49 }, + { 1, 11, 18 } + }, { /* Coeff Band 5 */ + { 10, 222, 223 }, + { 2, 150, 194 }, + { 1, 83, 128 }, + { 1, 48, 79 }, + { 1, 27, 45 }, + { 1, 11, 17 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 36, 41, 
235 }, + { 29, 36, 193 }, + { 10, 27, 111 } + }, { /* Coeff Band 1 */ + { 85, 165, 222 }, + { 177, 162, 215 }, + { 110, 135, 195 }, + { 57, 113, 168 }, + { 23, 83, 120 }, + { 10, 49, 61 } + }, { /* Coeff Band 2 */ + { 85, 190, 223 }, + { 36, 139, 200 }, + { 5, 90, 146 }, + { 1, 60, 103 }, + { 1, 38, 65 }, + { 1, 18, 30 } + }, { /* Coeff Band 3 */ + { 72, 202, 223 }, + { 23, 141, 199 }, + { 2, 86, 140 }, + { 1, 56, 97 }, + { 1, 36, 61 }, + { 1, 16, 27 } + }, { /* Coeff Band 4 */ + { 55, 218, 225 }, + { 13, 145, 200 }, + { 1, 86, 141 }, + { 1, 57, 99 }, + { 1, 35, 61 }, + { 1, 13, 22 } + }, { /* Coeff Band 5 */ + { 15, 235, 212 }, + { 1, 132, 184 }, + { 1, 84, 139 }, + { 1, 57, 97 }, + { 1, 34, 56 }, + { 1, 14, 23 } + } + } + }, { /* block Type 1 */ + { /* Intra */ + { /* Coeff Band 0 */ + { 181, 21, 201 }, + { 61, 37, 123 }, + { 10, 38, 71 } + }, { /* Coeff Band 1 */ + { 47, 106, 172 }, + { 95, 104, 173 }, + { 42, 93, 159 }, + { 18, 77, 131 }, + { 4, 50, 81 }, + { 1, 17, 23 } + }, { /* Coeff Band 2 */ + { 62, 147, 199 }, + { 44, 130, 189 }, + { 28, 102, 154 }, + { 18, 75, 115 }, + { 2, 44, 65 }, + { 1, 12, 19 } + }, { /* Coeff Band 3 */ + { 55, 153, 210 }, + { 24, 130, 194 }, + { 3, 93, 146 }, + { 1, 61, 97 }, + { 1, 31, 50 }, + { 1, 10, 16 } + }, { /* Coeff Band 4 */ + { 49, 186, 223 }, + { 17, 148, 204 }, + { 1, 96, 142 }, + { 1, 53, 83 }, + { 1, 26, 44 }, + { 1, 11, 17 } + }, { /* Coeff Band 5 */ + { 13, 217, 212 }, + { 2, 136, 180 }, + { 1, 78, 124 }, + { 1, 50, 83 }, + { 1, 29, 49 }, + { 1, 14, 23 } + } + }, { /* Inter */ + { /* Coeff Band 0 */ + { 197, 13, 247 }, + { 82, 17, 222 }, + { 25, 17, 162 } + }, { /* Coeff Band 1 */ + { 126, 186, 247 }, + { 234, 191, 243 }, + { 176, 177, 234 }, + { 104, 158, 220 }, + { 66, 128, 186 }, + { 55, 90, 137 } + }, { /* Coeff Band 2 */ + { 111, 197, 242 }, + { 46, 158, 219 }, + { 9, 104, 171 }, + { 2, 65, 125 }, + { 1, 44, 80 }, + { 1, 17, 91 } + }, { /* Coeff Band 3 */ + { 104, 208, 245 }, + { 39, 168, 224 }, + { 3, 109, 162 
}, + { 1, 79, 124 }, + { 1, 50, 102 }, + { 1, 43, 102 } + }, { /* Coeff Band 4 */ + { 84, 220, 246 }, + { 31, 177, 231 }, + { 2, 115, 180 }, + { 1, 79, 134 }, + { 1, 55, 77 }, + { 1, 60, 79 } + }, { /* Coeff Band 5 */ + { 43, 243, 240 }, + { 8, 180, 217 }, + { 1, 115, 166 }, + { 1, 84, 121 }, + { 1, 51, 67 }, + { 1, 16, 6 } + } + } + } +}; +#endif diff --git a/libvpx/vp9/common/vp9_entropy.c b/libvpx/vp9/common/vp9_entropy.c new file mode 100644 index 000000000..080867e7a --- /dev/null +++ b/libvpx/vp9/common/vp9_entropy.c @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_entropymode.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx/vpx_integer.h" + +DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = { + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +DECLARE_ALIGNED(16, const uint8_t, + vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]) = { + 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 5 +}; + +DECLARE_ALIGNED(16, const uint8_t, + vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]) = { + 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 5, 5, 5, 5, 5, 5 +}; + +DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = { + 0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5 +}; + +DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]) = { + 0, 4, 1, 5, + 8, 2, 12, 9, + 3, 6, 13, 10, + 7, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]) = { + 0, 4, 8, 1, + 12, 5, 9, 2, + 13, 6, 10, 3, + 7, 14, 11, 15, +}; + +DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]) = { + 0, 1, 4, 2, + 5, 3, 6, 8, + 9, 7, 12, 10, + 13, 11, 14, 15, +}; + +DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]) = { + 0, 8, 1, 16, 9, 2, 17, 24, + 10, 3, 18, 25, 32, 11, 4, 26, + 33, 19, 40, 12, 34, 27, 5, 41, + 20, 48, 13, 35, 42, 28, 21, 6, + 49, 56, 36, 43, 29, 7, 14, 50, + 57, 44, 22, 37, 15, 51, 58, 30, + 45, 23, 52, 59, 38, 31, 60, 53, + 46, 39, 61, 54, 47, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]) = { + 0, 8, 16, 1, 24, 9, 32, 17, + 2, 40, 25, 10, 33, 18, 48, 3, + 26, 41, 11, 56, 19, 34, 4, 49, + 27, 42, 12, 35, 20, 57, 50, 28, + 5, 43, 13, 36, 58, 51, 21, 44, + 6, 29, 59, 37, 14, 52, 22, 7, + 45, 60, 30, 15, 38, 53, 23, 46, + 31, 61, 39, 54, 47, 62, 55, 63, +}; + +DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]) = { + 0, 1, 2, 8, 9, 3, 16, 10, + 4, 17, 11, 24, 5, 18, 25, 12, + 19, 26, 32, 6, 13, 20, 33, 27, + 7, 34, 40, 21, 28, 41, 14, 35, + 48, 42, 29, 36, 49, 22, 43, 15, + 56, 37, 50, 44, 30, 57, 23, 51, + 58, 45, 38, 52, 31, 59, 53, 46, + 60, 39, 61, 47, 54, 55, 62, 63, +}; + +DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]) = { + 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 34, 49, 19, 65, 80, + 50, 4, 35, 66, 20, 
81, 96, 51, 5, 36, 82, 97, 67, 112, 21, 52, + 98, 37, 83, 113, 6, 68, 128, 53, 22, 99, 114, 84, 7, 129, 38, 69, + 100, 115, 144, 130, 85, 54, 23, 8, 145, 39, 70, 116, 101, 131, 160, 146, + 55, 86, 24, 71, 132, 117, 161, 40, 9, 102, 147, 176, 162, 87, 56, 25, + 133, 118, 177, 148, 72, 103, 41, 163, 10, 192, 178, 88, 57, 134, 149, 119, + 26, 164, 73, 104, 193, 42, 179, 208, 11, 135, 89, 165, 120, 150, 58, 194, + 180, 27, 74, 209, 105, 151, 136, 43, 90, 224, 166, 195, 181, 121, 210, 59, + 12, 152, 106, 167, 196, 75, 137, 225, 211, 240, 182, 122, 91, 28, 197, 13, + 226, 168, 183, 153, 44, 212, 138, 107, 241, 60, 29, 123, 198, 184, 227, 169, + 242, 76, 213, 154, 45, 92, 14, 199, 139, 61, 228, 214, 170, 185, 243, 108, + 77, 155, 30, 15, 200, 229, 124, 215, 244, 93, 46, 186, 171, 201, 109, 140, + 230, 62, 216, 245, 31, 125, 78, 156, 231, 47, 187, 202, 217, 94, 246, 141, + 63, 232, 172, 110, 247, 157, 79, 218, 203, 126, 233, 188, 248, 95, 173, 142, + 219, 111, 249, 234, 158, 127, 189, 204, 250, 235, 143, 174, 220, 205, 159, 251, + 190, 221, 175, 236, 237, 191, 206, 252, 222, 253, 207, 238, 223, 254, 239, 255, +}; + +DECLARE_ALIGNED(16, const int, vp9_col_scan_16x16[256]) = { + 0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81, + 34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4, + 67, 176, 20, 114, 145, 83, 36, 99, 130, 52, 192, 5, 161, 68, 115, 21, + 146, 84, 208, 177, 37, 131, 100, 53, 162, 224, 69, 6, 116, 193, 147, 85, + 22, 240, 132, 38, 178, 101, 163, 54, 209, 117, 70, 7, 148, 194, 86, 179, + 225, 23, 133, 39, 164, 8, 102, 210, 241, 55, 195, 118, 149, 71, 180, 24, + 87, 226, 134, 165, 211, 40, 103, 56, 72, 150, 196, 242, 119, 9, 181, 227, + 88, 166, 25, 135, 41, 104, 212, 57, 151, 197, 120, 73, 243, 182, 136, 167, + 213, 89, 10, 228, 105, 152, 198, 26, 42, 121, 183, 244, 168, 58, 137, 229, + 74, 214, 90, 153, 199, 184, 11, 106, 245, 27, 122, 230, 169, 43, 215, 59, + 200, 138, 185, 246, 75, 12, 91, 154, 216, 231, 107, 28, 44, 201, 123, 170, 
+ 60, 247, 232, 76, 139, 13, 92, 217, 186, 248, 155, 108, 29, 124, 45, 202, + 233, 171, 61, 14, 77, 140, 15, 249, 93, 30, 187, 156, 218, 46, 109, 125, + 62, 172, 78, 203, 31, 141, 234, 94, 47, 188, 63, 157, 110, 250, 219, 79, + 126, 204, 173, 142, 95, 189, 111, 235, 158, 220, 251, 127, 174, 143, 205, 236, + 159, 190, 221, 252, 175, 206, 237, 191, 253, 222, 238, 207, 254, 223, 239, 255, +}; + +DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]) = { + 0, 1, 2, 16, 3, 17, 4, 18, 32, 5, 33, 19, 6, 34, 48, 20, + 49, 7, 35, 21, 50, 64, 8, 36, 65, 22, 51, 37, 80, 9, 66, 52, + 23, 38, 81, 67, 10, 53, 24, 82, 68, 96, 39, 11, 54, 83, 97, 69, + 25, 98, 84, 40, 112, 55, 12, 70, 99, 113, 85, 26, 41, 56, 114, 100, + 13, 71, 128, 86, 27, 115, 101, 129, 42, 57, 72, 116, 14, 87, 130, 102, + 144, 73, 131, 117, 28, 58, 15, 88, 43, 145, 103, 132, 146, 118, 74, 160, + 89, 133, 104, 29, 59, 147, 119, 44, 161, 148, 90, 105, 134, 162, 120, 176, + 75, 135, 149, 30, 60, 163, 177, 45, 121, 91, 106, 164, 178, 150, 192, 136, + 165, 179, 31, 151, 193, 76, 122, 61, 137, 194, 107, 152, 180, 208, 46, 166, + 167, 195, 92, 181, 138, 209, 123, 153, 224, 196, 77, 168, 210, 182, 240, 108, + 197, 62, 154, 225, 183, 169, 211, 47, 139, 93, 184, 226, 212, 241, 198, 170, + 124, 155, 199, 78, 213, 185, 109, 227, 200, 63, 228, 242, 140, 214, 171, 186, + 156, 229, 243, 125, 94, 201, 244, 215, 216, 230, 141, 187, 202, 79, 172, 110, + 157, 245, 217, 231, 95, 246, 232, 126, 203, 247, 233, 173, 218, 142, 111, 158, + 188, 248, 127, 234, 219, 249, 189, 204, 143, 174, 159, 250, 235, 205, 220, 175, + 190, 251, 221, 191, 206, 236, 207, 237, 252, 222, 253, 223, 238, 239, 254, 255, +}; + +DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]) = { + 0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160, 129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193, 68, 131, 37, 100, + 225, 194, 256, 163, 69, 132, 6, 226, 257, 288, 195, 101, 164, 38, 258, 7, 227, 289, 133, 320, 70, 196, 165, 290, 259, 228, 39, 321, 
102, 352, 8, 197, + 71, 134, 322, 291, 260, 353, 384, 229, 166, 103, 40, 354, 323, 292, 135, 385, 198, 261, 72, 9, 416, 167, 386, 355, 230, 324, 104, 293, 41, 417, 199, 136, + 262, 387, 448, 325, 356, 10, 73, 418, 231, 168, 449, 294, 388, 105, 419, 263, 42, 200, 357, 450, 137, 480, 74, 326, 232, 11, 389, 169, 295, 420, 106, 451, + 481, 358, 264, 327, 201, 43, 138, 512, 482, 390, 296, 233, 170, 421, 75, 452, 359, 12, 513, 265, 483, 328, 107, 202, 514, 544, 422, 391, 453, 139, 44, 234, + 484, 297, 360, 171, 76, 515, 545, 266, 329, 454, 13, 423, 203, 108, 546, 485, 576, 298, 235, 140, 361, 330, 172, 547, 45, 455, 267, 577, 486, 77, 204, 362, + 608, 14, 299, 578, 109, 236, 487, 609, 331, 141, 579, 46, 15, 173, 610, 363, 78, 205, 16, 110, 237, 611, 142, 47, 174, 79, 206, 17, 111, 238, 48, 143, + 80, 175, 112, 207, 49, 18, 239, 81, 113, 19, 50, 82, 114, 51, 83, 115, 640, 516, 392, 268, 144, 20, 672, 641, 548, 517, 424, 393, 300, 269, 176, 145, + 52, 21, 704, 673, 642, 580, 549, 518, 456, 425, 394, 332, 301, 270, 208, 177, 146, 84, 53, 22, 736, 705, 674, 643, 612, 581, 550, 519, 488, 457, 426, 395, + 364, 333, 302, 271, 240, 209, 178, 147, 116, 85, 54, 23, 737, 706, 675, 613, 582, 551, 489, 458, 427, 365, 334, 303, 241, 210, 179, 117, 86, 55, 738, 707, + 614, 583, 490, 459, 366, 335, 242, 211, 118, 87, 739, 615, 491, 367, 243, 119, 768, 644, 520, 396, 272, 148, 24, 800, 769, 676, 645, 552, 521, 428, 397, 304, + 273, 180, 149, 56, 25, 832, 801, 770, 708, 677, 646, 584, 553, 522, 460, 429, 398, 336, 305, 274, 212, 181, 150, 88, 57, 26, 864, 833, 802, 771, 740, 709, + 678, 647, 616, 585, 554, 523, 492, 461, 430, 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 865, 834, 803, 741, 710, 679, 617, 586, 555, 493, + 462, 431, 369, 338, 307, 245, 214, 183, 121, 90, 59, 866, 835, 742, 711, 618, 587, 494, 463, 370, 339, 246, 215, 122, 91, 867, 743, 619, 495, 371, 247, 123, + 896, 772, 648, 524, 400, 276, 152, 28, 928, 897, 804, 773, 680, 649, 556, 525, 432, 401, 308, 
277, 184, 153, 60, 29, 960, 929, 898, 836, 805, 774, 712, 681, + 650, 588, 557, 526, 464, 433, 402, 340, 309, 278, 216, 185, 154, 92, 61, 30, 992, 961, 930, 899, 868, 837, 806, 775, 744, 713, 682, 651, 620, 589, 558, 527, + 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93, 62, 31, 993, 962, 931, 869, 838, 807, 745, 714, 683, 621, 590, 559, 497, 466, 435, 373, + 342, 311, 249, 218, 187, 125, 94, 63, 994, 963, 870, 839, 746, 715, 622, 591, 498, 467, 374, 343, 250, 219, 126, 95, 995, 871, 747, 623, 499, 375, 251, 127, + 900, 776, 652, 528, 404, 280, 156, 932, 901, 808, 777, 684, 653, 560, 529, 436, 405, 312, 281, 188, 157, 964, 933, 902, 840, 809, 778, 716, 685, 654, 592, 561, + 530, 468, 437, 406, 344, 313, 282, 220, 189, 158, 996, 965, 934, 903, 872, 841, 810, 779, 748, 717, 686, 655, 624, 593, 562, 531, 500, 469, 438, 407, 376, 345, + 314, 283, 252, 221, 190, 159, 997, 966, 935, 873, 842, 811, 749, 718, 687, 625, 594, 563, 501, 470, 439, 377, 346, 315, 253, 222, 191, 998, 967, 874, 843, 750, + 719, 626, 595, 502, 471, 378, 347, 254, 223, 999, 875, 751, 627, 503, 379, 255, 904, 780, 656, 532, 408, 284, 936, 905, 812, 781, 688, 657, 564, 533, 440, 409, + 316, 285, 968, 937, 906, 844, 813, 782, 720, 689, 658, 596, 565, 534, 472, 441, 410, 348, 317, 286, 1000, 969, 938, 907, 876, 845, 814, 783, 752, 721, 690, 659, + 628, 597, 566, 535, 504, 473, 442, 411, 380, 349, 318, 287, 1001, 970, 939, 877, 846, 815, 753, 722, 691, 629, 598, 567, 505, 474, 443, 381, 350, 319, 1002, 971, + 878, 847, 754, 723, 630, 599, 506, 475, 382, 351, 1003, 879, 755, 631, 507, 383, 908, 784, 660, 536, 412, 940, 909, 816, 785, 692, 661, 568, 537, 444, 413, 972, + 941, 910, 848, 817, 786, 724, 693, 662, 600, 569, 538, 476, 445, 414, 1004, 973, 942, 911, 880, 849, 818, 787, 756, 725, 694, 663, 632, 601, 570, 539, 508, 477, + 446, 415, 1005, 974, 943, 881, 850, 819, 757, 726, 695, 633, 602, 571, 509, 478, 447, 1006, 975, 882, 851, 758, 727, 634, 603, 510, 479, 1007, 883, 759, 
635, 511, + 912, 788, 664, 540, 944, 913, 820, 789, 696, 665, 572, 541, 976, 945, 914, 852, 821, 790, 728, 697, 666, 604, 573, 542, 1008, 977, 946, 915, 884, 853, 822, 791, + 760, 729, 698, 667, 636, 605, 574, 543, 1009, 978, 947, 885, 854, 823, 761, 730, 699, 637, 606, 575, 1010, 979, 886, 855, 762, 731, 638, 607, 1011, 887, 763, 639, + 916, 792, 668, 948, 917, 824, 793, 700, 669, 980, 949, 918, 856, 825, 794, 732, 701, 670, 1012, 981, 950, 919, 888, 857, 826, 795, 764, 733, 702, 671, 1013, 982, + 951, 889, 858, 827, 765, 734, 703, 1014, 983, 890, 859, 766, 735, 1015, 891, 767, 920, 796, 952, 921, 828, 797, 984, 953, 922, 860, 829, 798, 1016, 985, 954, 923, + 892, 861, 830, 799, 1017, 986, 955, 893, 862, 831, 1018, 987, 894, 863, 1019, 895, 924, 956, 925, 988, 957, 926, 1020, 989, 958, 927, 1021, 990, 959, 1022, 991, 1023, +}; + +/* Array indices are identical to previously-existing CONTEXT_NODE indices */ + +const vp9_tree_index vp9_coef_tree[ 22] = /* corresponding _CONTEXT_NODEs */ +{ +#if CONFIG_BALANCED_COEFTREE + -ZERO_TOKEN, 2, /* 0 = ZERO */ + -DCT_EOB_TOKEN, 4, /* 1 = EOB */ +#else + -DCT_EOB_TOKEN, 2, /* 0 = EOB */ + -ZERO_TOKEN, 4, /* 1 = ZERO */ +#endif + -ONE_TOKEN, 6, /* 2 = ONE */ + 8, 12, /* 3 = LOW_VAL */ + -TWO_TOKEN, 10, /* 4 = TWO */ + -THREE_TOKEN, -FOUR_TOKEN, /* 5 = THREE */ + 14, 16, /* 6 = HIGH_LOW */ + -DCT_VAL_CATEGORY1, -DCT_VAL_CATEGORY2, /* 7 = CAT_ONE */ + 18, 20, /* 8 = CAT_THREEFOUR */ + -DCT_VAL_CATEGORY3, -DCT_VAL_CATEGORY4, /* 9 = CAT_THREE */ + -DCT_VAL_CATEGORY5, -DCT_VAL_CATEGORY6 /* 10 = CAT_FIVE */ +}; + +struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; + +/* Trees for extra bits. 
Probabilities are constant and + do not depend on previously encoded bits */ + +static const vp9_prob Pcat1[] = { 159}; +static const vp9_prob Pcat2[] = { 165, 145}; +static const vp9_prob Pcat3[] = { 173, 148, 140}; +static const vp9_prob Pcat4[] = { 176, 155, 140, 135}; +static const vp9_prob Pcat5[] = { 180, 157, 141, 134, 130}; +static const vp9_prob Pcat6[] = { + 254, 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129 +}; + +const vp9_tree_index vp9_coefmodel_tree[6] = { +#if CONFIG_BALANCED_COEFTREE + -ZERO_TOKEN, 2, + -DCT_EOB_MODEL_TOKEN, 4, +#else + -DCT_EOB_MODEL_TOKEN, 2, /* 0 = EOB */ + -ZERO_TOKEN, 4, /* 1 = ZERO */ +#endif + -ONE_TOKEN, -TWO_TOKEN, +}; + +// Model obtained from a 2-sided zero-centerd distribuition derived +// from a Pareto distribution. The cdf of the distribution is: +// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta] +// +// For a given beta and a given probablity of the 1-node, the alpha +// is first solved, and then the {alpha, beta} pair is used to generate +// the probabilities for the rest of the nodes. 
+ +// beta = 8 +const vp9_prob vp9_modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = { + { 3, 86, 128, 6, 86, 23, 88, 29}, + { 9, 86, 129, 17, 88, 61, 94, 76}, + { 15, 87, 129, 28, 89, 93, 100, 110}, + { 20, 88, 130, 38, 91, 118, 106, 136}, + { 26, 89, 131, 48, 92, 139, 111, 156}, + { 31, 90, 131, 58, 94, 156, 117, 171}, + { 37, 90, 132, 66, 95, 171, 122, 184}, + { 42, 91, 132, 75, 97, 183, 127, 194}, + { 47, 92, 133, 83, 98, 193, 132, 202}, + { 52, 93, 133, 90, 100, 201, 137, 208}, + { 57, 94, 134, 98, 101, 208, 142, 214}, + { 62, 94, 135, 105, 103, 214, 146, 218}, + { 66, 95, 135, 111, 104, 219, 151, 222}, + { 71, 96, 136, 117, 106, 224, 155, 225}, + { 76, 97, 136, 123, 107, 227, 159, 228}, + { 80, 98, 137, 129, 109, 231, 162, 231}, + { 84, 98, 138, 134, 110, 234, 166, 233}, + { 89, 99, 138, 140, 112, 236, 170, 235}, + { 93, 100, 139, 145, 113, 238, 173, 236}, + { 97, 101, 140, 149, 115, 240, 176, 238}, + {101, 102, 140, 154, 116, 242, 179, 239}, + {105, 103, 141, 158, 118, 243, 182, 240}, + {109, 104, 141, 162, 119, 244, 185, 241}, + {113, 104, 142, 166, 120, 245, 187, 242}, + {116, 105, 143, 170, 122, 246, 190, 243}, + {120, 106, 143, 173, 123, 247, 192, 244}, + {123, 107, 144, 177, 125, 248, 195, 244}, + {127, 108, 145, 180, 126, 249, 197, 245}, + {130, 109, 145, 183, 128, 249, 199, 245}, + {134, 110, 146, 186, 129, 250, 201, 246}, + {137, 111, 147, 189, 131, 251, 203, 246}, + {140, 112, 147, 192, 132, 251, 205, 247}, + {143, 113, 148, 194, 133, 251, 207, 247}, + {146, 114, 149, 197, 135, 252, 208, 248}, + {149, 115, 149, 199, 136, 252, 210, 248}, + {152, 115, 150, 201, 138, 252, 211, 248}, + {155, 116, 151, 204, 139, 253, 213, 249}, + {158, 117, 151, 206, 140, 253, 214, 249}, + {161, 118, 152, 208, 142, 253, 216, 249}, + {163, 119, 153, 210, 143, 253, 217, 249}, + {166, 120, 153, 212, 144, 254, 218, 250}, + {168, 121, 154, 213, 146, 254, 220, 250}, + {171, 122, 155, 215, 147, 254, 221, 250}, + {173, 123, 155, 217, 148, 254, 222, 250}, + {176, 124, 156, 
218, 150, 254, 223, 250}, + {178, 125, 157, 220, 151, 254, 224, 251}, + {180, 126, 157, 221, 152, 254, 225, 251}, + {183, 127, 158, 222, 153, 254, 226, 251}, + {185, 128, 159, 224, 155, 255, 227, 251}, + {187, 129, 160, 225, 156, 255, 228, 251}, + {189, 131, 160, 226, 157, 255, 228, 251}, + {191, 132, 161, 227, 159, 255, 229, 251}, + {193, 133, 162, 228, 160, 255, 230, 252}, + {195, 134, 163, 230, 161, 255, 231, 252}, + {197, 135, 163, 231, 162, 255, 231, 252}, + {199, 136, 164, 232, 163, 255, 232, 252}, + {201, 137, 165, 233, 165, 255, 233, 252}, + {202, 138, 166, 233, 166, 255, 233, 252}, + {204, 139, 166, 234, 167, 255, 234, 252}, + {206, 140, 167, 235, 168, 255, 235, 252}, + {207, 141, 168, 236, 169, 255, 235, 252}, + {209, 142, 169, 237, 171, 255, 236, 252}, + {210, 144, 169, 237, 172, 255, 236, 252}, + {212, 145, 170, 238, 173, 255, 237, 252}, + {214, 146, 171, 239, 174, 255, 237, 253}, + {215, 147, 172, 240, 175, 255, 238, 253}, + {216, 148, 173, 240, 176, 255, 238, 253}, + {218, 149, 173, 241, 177, 255, 239, 253}, + {219, 150, 174, 241, 179, 255, 239, 253}, + {220, 152, 175, 242, 180, 255, 240, 253}, + {222, 153, 176, 242, 181, 255, 240, 253}, + {223, 154, 177, 243, 182, 255, 240, 253}, + {224, 155, 178, 244, 183, 255, 241, 253}, + {225, 156, 178, 244, 184, 255, 241, 253}, + {226, 158, 179, 244, 185, 255, 242, 253}, + {228, 159, 180, 245, 186, 255, 242, 253}, + {229, 160, 181, 245, 187, 255, 242, 253}, + {230, 161, 182, 246, 188, 255, 243, 253}, + {231, 163, 183, 246, 189, 255, 243, 253}, + {232, 164, 184, 247, 190, 255, 243, 253}, + {233, 165, 185, 247, 191, 255, 244, 253}, + {234, 166, 185, 247, 192, 255, 244, 253}, + {235, 168, 186, 248, 193, 255, 244, 253}, + {236, 169, 187, 248, 194, 255, 244, 253}, + {236, 170, 188, 248, 195, 255, 245, 253}, + {237, 171, 189, 249, 196, 255, 245, 254}, + {238, 173, 190, 249, 197, 255, 245, 254}, + {239, 174, 191, 249, 198, 255, 245, 254}, + {240, 175, 192, 249, 199, 255, 246, 254}, + {240, 177, 193, 250, 200, 255, 246, 
254}, + {241, 178, 194, 250, 201, 255, 246, 254}, + {242, 179, 195, 250, 202, 255, 246, 254}, + {242, 181, 196, 250, 203, 255, 247, 254}, + {243, 182, 197, 251, 204, 255, 247, 254}, + {244, 184, 198, 251, 205, 255, 247, 254}, + {244, 185, 199, 251, 206, 255, 247, 254}, + {245, 186, 200, 251, 207, 255, 247, 254}, + {246, 188, 201, 252, 207, 255, 248, 254}, + {246, 189, 202, 252, 208, 255, 248, 254}, + {247, 191, 203, 252, 209, 255, 248, 254}, + {247, 192, 204, 252, 210, 255, 248, 254}, + {248, 194, 205, 252, 211, 255, 248, 254}, + {248, 195, 206, 252, 212, 255, 249, 254}, + {249, 197, 207, 253, 213, 255, 249, 254}, + {249, 198, 208, 253, 214, 255, 249, 254}, + {250, 200, 210, 253, 215, 255, 249, 254}, + {250, 201, 211, 253, 215, 255, 249, 254}, + {250, 203, 212, 253, 216, 255, 249, 254}, + {251, 204, 213, 253, 217, 255, 250, 254}, + {251, 206, 214, 254, 218, 255, 250, 254}, + {252, 207, 216, 254, 219, 255, 250, 254}, + {252, 209, 217, 254, 220, 255, 250, 254}, + {252, 211, 218, 254, 221, 255, 250, 254}, + {253, 213, 219, 254, 222, 255, 250, 254}, + {253, 214, 221, 254, 223, 255, 250, 254}, + {253, 216, 222, 254, 224, 255, 251, 254}, + {253, 218, 224, 254, 225, 255, 251, 254}, + {254, 220, 225, 254, 225, 255, 251, 254}, + {254, 222, 227, 255, 226, 255, 251, 254}, + {254, 224, 228, 255, 227, 255, 251, 254}, + {254, 226, 230, 255, 228, 255, 251, 254}, + {255, 228, 231, 255, 230, 255, 251, 254}, + {255, 230, 233, 255, 231, 255, 252, 254}, + {255, 232, 235, 255, 232, 255, 252, 254}, + {255, 235, 237, 255, 233, 255, 252, 254}, + {255, 238, 240, 255, 235, 255, 252, 255}, + {255, 241, 243, 255, 236, 255, 252, 254}, + {255, 246, 247, 255, 239, 255, 253, 255} +}; + +static void extend_model_to_full_distribution(vp9_prob p, + vp9_prob *tree_probs) { + const int l = ((p - 1) / 2); + const vp9_prob (*model)[MODEL_NODES]; + model = vp9_modelcoefprobs_pareto8; + if (p & 1) { + vpx_memcpy(tree_probs + UNCONSTRAINED_NODES, + model[l], MODEL_NODES * sizeof(vp9_prob)); + } else { + // 
interpolate + int i; + for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i) + tree_probs[i] = (model[l][i - UNCONSTRAINED_NODES] + + model[l + 1][i - UNCONSTRAINED_NODES]) >> 1; + } +} + +void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) { + if (full != model) + vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES); + extend_model_to_full_distribution(model[PIVOT_NODE], full); +} + +void vp9_model_to_full_probs_sb( + vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES], + vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]) { + int c, p; + for (c = 0; c < COEF_BANDS; ++c) + for (p = 0; p < PREV_COEF_CONTEXTS; ++p) { + vp9_model_to_full_probs(model[c][p], full[c][p]); + } +} + +static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28]; + +static void init_bit_tree(vp9_tree_index *p, int n) { + int i = 0; + + while (++i < n) { + p[0] = p[1] = i << 1; + p += 2; + } + + p[0] = p[1] = 0; +} + +static void init_bit_trees() { + init_bit_tree(cat1, 1); + init_bit_tree(cat2, 2); + init_bit_tree(cat3, 3); + init_bit_tree(cat4, 4); + init_bit_tree(cat5, 5); + init_bit_tree(cat6, 14); +} + +vp9_extra_bit vp9_extra_bits[12] = { + { 0, 0, 0, 0}, + { 0, 0, 0, 1}, + { 0, 0, 0, 2}, + { 0, 0, 0, 3}, + { 0, 0, 0, 4}, + { cat1, Pcat1, 1, 5}, + { cat2, Pcat2, 2, 7}, + { cat3, Pcat3, 3, 11}, + { cat4, Pcat4, 4, 19}, + { cat5, Pcat5, 5, 35}, + { cat6, Pcat6, 14, 67}, + { 0, 0, 0, 0} +}; + +#include "vp9/common/vp9_default_coef_probs.h" + +// This function updates and then returns n AC coefficient context +// This is currently a placeholder function to allow experimentation +// using various context models based on the energy earlier tokens +// within the current block. +// +// For now it just returns the previously used context. 
+#define MAX_NEIGHBORS 2 +int vp9_get_coef_context(const int *scan, const int *neighbors, + int nb_pad, uint8_t *token_cache, int c, int l) { + int eob = l; + assert(nb_pad == MAX_NEIGHBORS); + if (c == eob) { + return 0; + } else { + int ctx; + assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0); + if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) { + ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] + + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1; + } else { + ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]]; + } + return ctx; + } +}; + +void vp9_default_coef_probs(VP9_COMMON *pc) { + vpx_memcpy(pc->fc.coef_probs[TX_4X4], default_coef_probs_4x4, + sizeof(pc->fc.coef_probs[TX_4X4])); + vpx_memcpy(pc->fc.coef_probs[TX_8X8], default_coef_probs_8x8, + sizeof(pc->fc.coef_probs[TX_8X8])); + vpx_memcpy(pc->fc.coef_probs[TX_16X16], default_coef_probs_16x16, + sizeof(pc->fc.coef_probs[TX_16X16])); + vpx_memcpy(pc->fc.coef_probs[TX_32X32], default_coef_probs_32x32, + sizeof(pc->fc.coef_probs[TX_32X32])); +} + +// Neighborhood 5-tuples for various scans and blocksizes, +// in {top, left, topleft, topright, bottomleft} order +// for each position in raster scan order. +// -1 indicates the neighbor does not exist. 
+DECLARE_ALIGNED(16, int, + vp9_default_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_col_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_default_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_default_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); +DECLARE_ALIGNED(16, int, + vp9_default_scan_32x32_neighbors[1024 * MAX_NEIGHBORS]); + +static int find_in_scan(const int *scan, int l, int idx) { + int n, l2 = l * l; + for (n = 0; n < l2; n++) { + int rc = scan[n]; + if (rc == idx) + return n; + } + assert(0); + return -1; +} +static void init_scan_neighbors(const int *scan, int l, int *neighbors, + int max_neighbors) { + int l2 = l * l; + int n, i, j; + + for (n = 0; n < l2; n++) { + int rc = scan[n]; + assert(max_neighbors == MAX_NEIGHBORS); + i = rc / l; + j = rc % l; + if (i > 0 && j > 0) { + // col/row scan is used for adst/dct, and generally means that + // energy decreases to zero much faster in the dimension in + // which ADST is used compared to the direction in which DCT + // is used. Likewise, we find much higher correlation between + // coefficients within the direction in which DCT is used. + // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff + // as a context. If ADST or DCT is used in both directions, we + // use the combination of the two as a context. 
+ int a = find_in_scan(scan, l, (i - 1) * l + j); + int b = find_in_scan(scan, l, i * l + j - 1); + if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 || + scan == vp9_col_scan_16x16) { + neighbors[max_neighbors * n + 0] = a; + neighbors[max_neighbors * n + 1] = -1; + } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 || + scan == vp9_row_scan_16x16) { + neighbors[max_neighbors * n + 0] = b; + neighbors[max_neighbors * n + 1] = -1; + } else { + neighbors[max_neighbors * n + 0] = a; + neighbors[max_neighbors * n + 1] = b; + } + } else if (i > 0) { + neighbors[max_neighbors * n + 0] = find_in_scan(scan, l, (i - 1) * l + j); + neighbors[max_neighbors * n + 1] = -1; + } else if (j > 0) { + neighbors[max_neighbors * n + 0] = + find_in_scan(scan, l, i * l + j - 1); + neighbors[max_neighbors * n + 1] = -1; + } else { + assert(n == 0); + // dc predictor doesn't use previous tokens + neighbors[max_neighbors * n + 0] = -1; + } + assert(neighbors[max_neighbors * n + 0] < n); + } +} + +void vp9_init_neighbors() { + init_scan_neighbors(vp9_default_scan_4x4, 4, + vp9_default_scan_4x4_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_row_scan_4x4, 4, + vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_col_scan_4x4, 4, + vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_default_scan_8x8, 8, + vp9_default_scan_8x8_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_row_scan_8x8, 8, + vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_col_scan_8x8, 8, + vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_default_scan_16x16, 16, + vp9_default_scan_16x16_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_row_scan_16x16, 16, + vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_col_scan_16x16, 16, + vp9_col_scan_16x16_neighbors, MAX_NEIGHBORS); + init_scan_neighbors(vp9_default_scan_32x32, 32, + vp9_default_scan_32x32_neighbors, MAX_NEIGHBORS); +} + +const int 
*vp9_get_coef_neighbors_handle(const int *scan, int *pad) { + if (scan == vp9_default_scan_4x4) { + *pad = MAX_NEIGHBORS; + return vp9_default_scan_4x4_neighbors; + } else if (scan == vp9_row_scan_4x4) { + *pad = MAX_NEIGHBORS; + return vp9_row_scan_4x4_neighbors; + } else if (scan == vp9_col_scan_4x4) { + *pad = MAX_NEIGHBORS; + return vp9_col_scan_4x4_neighbors; + } else if (scan == vp9_default_scan_8x8) { + *pad = MAX_NEIGHBORS; + return vp9_default_scan_8x8_neighbors; + } else if (scan == vp9_row_scan_8x8) { + *pad = 2; + return vp9_row_scan_8x8_neighbors; + } else if (scan == vp9_col_scan_8x8) { + *pad = 2; + return vp9_col_scan_8x8_neighbors; + } else if (scan == vp9_default_scan_16x16) { + *pad = MAX_NEIGHBORS; + return vp9_default_scan_16x16_neighbors; + } else if (scan == vp9_row_scan_16x16) { + *pad = 2; + return vp9_row_scan_16x16_neighbors; + } else if (scan == vp9_col_scan_16x16) { + *pad = 2; + return vp9_col_scan_16x16_neighbors; + } else if (scan == vp9_default_scan_32x32) { + *pad = MAX_NEIGHBORS; + return vp9_default_scan_32x32_neighbors; + } else { + assert(0); + return NULL; + } +} + +void vp9_coef_tree_initialize() { + vp9_init_neighbors(); + init_bit_trees(); + vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree); +} + +// #define COEF_COUNT_TESTING + +#define COEF_COUNT_SAT 24 +#define COEF_MAX_UPDATE_FACTOR 112 +#define COEF_COUNT_SAT_KEY 24 +#define COEF_MAX_UPDATE_FACTOR_KEY 112 +#define COEF_COUNT_SAT_AFTER_KEY 24 +#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128 + +void vp9_full_to_model_count(unsigned int *model_count, + unsigned int *full_count) { + int n; + model_count[ZERO_TOKEN] = full_count[ZERO_TOKEN]; + model_count[ONE_TOKEN] = full_count[ONE_TOKEN]; + model_count[TWO_TOKEN] = full_count[TWO_TOKEN]; + for (n = THREE_TOKEN; n < DCT_EOB_TOKEN; ++n) + model_count[TWO_TOKEN] += full_count[n]; + model_count[DCT_EOB_MODEL_TOKEN] = full_count[DCT_EOB_TOKEN]; +} + +void vp9_full_to_model_counts( + vp9_coeff_count_model *model_count, 
vp9_coeff_count *full_count) { + int i, j, k, l; + for (i = 0; i < BLOCK_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + if (l >= 3 && k == 0) + continue; + vp9_full_to_model_count(model_count[i][j][k][l], + full_count[i][j][k][l]); + } +} + +static void adapt_coef_probs(VP9_COMMON *cm, TX_SIZE txfm_size, + int count_sat, int update_factor) { + vp9_coeff_probs_model *dst_coef_probs = cm->fc.coef_probs[txfm_size]; + vp9_coeff_probs_model *pre_coef_probs = cm->fc.pre_coef_probs[txfm_size]; + vp9_coeff_count_model *coef_counts = cm->fc.coef_counts[txfm_size]; + unsigned int (*eob_branch_count)[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] = + cm->fc.eob_branch_counts[txfm_size]; + int t, i, j, k, l, count; + int factor; + unsigned int branch_ct[UNCONSTRAINED_NODES][2]; + vp9_prob coef_probs[UNCONSTRAINED_NODES]; + int entropy_nodes_adapt = UNCONSTRAINED_NODES; + + for (i = 0; i < BLOCK_TYPES; ++i) + for (j = 0; j < REF_TYPES; ++j) + for (k = 0; k < COEF_BANDS; ++k) + for (l = 0; l < PREV_COEF_CONTEXTS; ++l) { + if (l >= 3 && k == 0) + continue; + vp9_tree_probs_from_distribution( + vp9_coefmodel_tree, + coef_probs, branch_ct, + coef_counts[i][j][k][l], 0); +#if CONFIG_BALANCED_COEFTREE + branch_ct[1][1] = eob_branch_count[i][j][k][l] - branch_ct[1][0]; + coef_probs[1] = get_binary_prob(branch_ct[1][0], branch_ct[1][1]); +#else + branch_ct[0][1] = eob_branch_count[i][j][k][l] - branch_ct[0][0]; + coef_probs[0] = get_binary_prob(branch_ct[0][0], branch_ct[0][1]); +#endif + for (t = 0; t < entropy_nodes_adapt; ++t) { + count = branch_ct[t][0] + branch_ct[t][1]; + count = count > count_sat ? 
count_sat : count; + factor = (update_factor * count / count_sat); + dst_coef_probs[i][j][k][l][t] = + weighted_prob(pre_coef_probs[i][j][k][l][t], + coef_probs[t], factor); + } + } +} + +void vp9_adapt_coef_probs(VP9_COMMON *cm) { + TX_SIZE t; + int count_sat; + int update_factor; /* denominator 256 */ + + if ((cm->frame_type == KEY_FRAME) || cm->intra_only) { + update_factor = COEF_MAX_UPDATE_FACTOR_KEY; + count_sat = COEF_COUNT_SAT_KEY; + } else if (cm->last_frame_type == KEY_FRAME) { + update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY; /* adapt quickly */ + count_sat = COEF_COUNT_SAT_AFTER_KEY; + } else { + update_factor = COEF_MAX_UPDATE_FACTOR; + count_sat = COEF_COUNT_SAT; + } + for (t = TX_4X4; t <= TX_32X32; t++) + adapt_coef_probs(cm, t, count_sat, update_factor); +} diff --git a/libvpx/vp9/common/vp9_entropy.h b/libvpx/vp9/common/vp9_entropy.h new file mode 100644 index 000000000..7f2bf3d6e --- /dev/null +++ b/libvpx/vp9/common/vp9_entropy.h @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_ENTROPY_H_ +#define VP9_COMMON_VP9_ENTROPY_H_ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_treecoder.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_common.h" + +/* Coefficient token alphabet */ + +#define ZERO_TOKEN 0 /* 0 Extra Bits 0+0 */ +#define ONE_TOKEN 1 /* 1 Extra Bits 0+1 */ +#define TWO_TOKEN 2 /* 2 Extra Bits 0+1 */ +#define THREE_TOKEN 3 /* 3 Extra Bits 0+1 */ +#define FOUR_TOKEN 4 /* 4 Extra Bits 0+1 */ +#define DCT_VAL_CATEGORY1 5 /* 5-6 Extra Bits 1+1 */ +#define DCT_VAL_CATEGORY2 6 /* 7-10 Extra Bits 2+1 */ +#define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ +#define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ +#define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ +#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 14+1 */ +#define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ +#define MAX_ENTROPY_TOKENS 12 +#define ENTROPY_NODES 11 +#define EOSB_TOKEN 127 /* Not signalled, encoder only */ + +#define INTER_MODE_CONTEXTS 7 + +extern const vp9_tree_index vp9_coef_tree[]; + +#define DCT_EOB_MODEL_TOKEN 3 /* EOB Extra Bits 0+0 */ +extern const vp9_tree_index vp9_coefmodel_tree[]; + +extern struct vp9_token vp9_coef_encodings[MAX_ENTROPY_TOKENS]; + +typedef struct { + vp9_tree_p tree; + const vp9_prob *prob; + int len; + int base_val; +} vp9_extra_bit; + +extern vp9_extra_bit vp9_extra_bits[12]; /* indexed by token value */ + +#define PROB_UPDATE_BASELINE_COST 7 + +#define MAX_PROB 255 +#define DCT_MAX_VALUE 16384 + +/* Coefficients are predicted via a 3-dimensional probability table. */ + +/* Outside dimension. 0 = Y with DC, 1 = UV */ +#define BLOCK_TYPES 2 +#define REF_TYPES 2 // intra=0, inter=1 + +/* Middle dimension reflects the coefficient position within the transform. */ +#define COEF_BANDS 6 + +/* Inside dimension is measure of nearby complexity, that reflects the energy + of nearby coefficients are nonzero. 
For the first coefficient (DC, unless + block type is 0), we look at the (already encoded) blocks above and to the + left of the current block. The context index is then the number (0,1,or 2) + of these blocks having nonzero coefficients. + After decoding a coefficient, the measure is determined by the size of the + most recently decoded coefficient. + Note that the intuitive meaning of this measure changes as coefficients + are decoded, e.g., prior to the first token, a zero means that my neighbors + are empty while, after the first token, because of the use of end-of-block, + a zero means we just decoded a zero and hence guarantees that a non-zero + coefficient will appear later in this block. However, this shift + in meaning is perfectly OK because our context depends also on the + coefficient band (and since zigzag positions 0, 1, and 2 are in + distinct bands). */ + +/*# define DC_TOKEN_CONTEXTS 3*/ /* 00, 0!0, !0!0 */ +#define PREV_COEF_CONTEXTS 6 + +// #define ENTROPY_STATS + +typedef unsigned int vp9_coeff_count[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [MAX_ENTROPY_TOKENS]; +typedef unsigned int vp9_coeff_stats[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [ENTROPY_NODES][2]; +typedef vp9_prob vp9_coeff_probs[REF_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] + [ENTROPY_NODES]; + +#define SUBEXP_PARAM 4 /* Subexponential code parameter */ +#define MODULUS_PARAM 13 /* Modulus parameter */ + +struct VP9Common; +void vp9_default_coef_probs(struct VP9Common *); +extern DECLARE_ALIGNED(16, const int, vp9_default_scan_4x4[16]); + +extern DECLARE_ALIGNED(16, const int, vp9_col_scan_4x4[16]); +extern DECLARE_ALIGNED(16, const int, vp9_row_scan_4x4[16]); + +extern DECLARE_ALIGNED(64, const int, vp9_default_scan_8x8[64]); + +extern DECLARE_ALIGNED(16, const int, vp9_col_scan_8x8[64]); +extern DECLARE_ALIGNED(16, const int, vp9_row_scan_8x8[64]); + +extern DECLARE_ALIGNED(16, const int, vp9_default_scan_16x16[256]); + +extern DECLARE_ALIGNED(16, const int, 
vp9_col_scan_16x16[256]); +extern DECLARE_ALIGNED(16, const int, vp9_row_scan_16x16[256]); + +extern DECLARE_ALIGNED(16, const int, vp9_default_scan_32x32[1024]); + +void vp9_coef_tree_initialize(void); +void vp9_adapt_coef_probs(struct VP9Common *); + +static INLINE void vp9_reset_sb_tokens_context(MACROBLOCKD* const xd, + BLOCK_SIZE_TYPE bsize) { + /* Clear entropy contexts */ + const int bw = 1 << b_width_log2(bsize); + const int bh = 1 << b_height_log2(bsize); + int i; + for (i = 0; i < MAX_MB_PLANE; i++) { + vpx_memset(xd->plane[i].above_context, 0, + sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[i].subsampling_x); + vpx_memset(xd->plane[i].left_context, 0, + sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[i].subsampling_y); + } +} + +// This is the index in the scan order beyond which all coefficients for +// 8x8 transform and above are in the top band. +// For 4x4 blocks the index is less but to keep things common the lookup +// table for 4x4 is padded out to this index. +#define MAXBAND_INDEX 21 + +extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]; +extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]; + + +static int get_coef_band(const uint8_t * band_translate, int coef_index) { + return (coef_index > MAXBAND_INDEX) + ? 
(COEF_BANDS-1) : band_translate[coef_index]; +} + +extern int vp9_get_coef_context(const int *scan, const int *neighbors, + int nb_pad, uint8_t *token_cache, int c, int l); +const int *vp9_get_coef_neighbors_handle(const int *scan, int *pad); + + +// 128 lists of probabilities are stored for the following ONE node probs: +// 1, 3, 5, 7, ..., 253, 255 +// In between probabilities are interpolated linearly + +#define COEFPROB_MODELS 128 + +#define UNCONSTRAINED_NODES 3 +#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES) + +#define PIVOT_NODE 2 // which node is pivot + +typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS] + [PREV_COEF_CONTEXTS] + [UNCONSTRAINED_NODES]; + +typedef unsigned int vp9_coeff_count_model[REF_TYPES][COEF_BANDS] + [PREV_COEF_CONTEXTS] + [UNCONSTRAINED_NODES + 1]; +typedef unsigned int vp9_coeff_stats_model[REF_TYPES][COEF_BANDS] + [PREV_COEF_CONTEXTS] + [UNCONSTRAINED_NODES][2]; +extern void vp9_full_to_model_count(unsigned int *model_count, + unsigned int *full_count); +extern void vp9_full_to_model_counts( + vp9_coeff_count_model *model_count, vp9_coeff_count *full_count); + +void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full); + +void vp9_model_to_full_probs_sb( + vp9_prob model[COEF_BANDS][PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES], + vp9_prob full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]); + +extern const vp9_prob vp9_modelcoefprobs[COEFPROB_MODELS][ENTROPY_NODES - 1]; + +static INLINE const int* get_scan_4x4(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return vp9_row_scan_4x4; + case DCT_ADST: + return vp9_col_scan_4x4; + default: + return vp9_default_scan_4x4; + } +} + +static INLINE const int* get_scan_8x8(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return vp9_row_scan_8x8; + case DCT_ADST: + return vp9_col_scan_8x8; + default: + return vp9_default_scan_8x8; + } +} + +static INLINE const int* get_scan_16x16(TX_TYPE tx_type) { + switch (tx_type) { + case ADST_DCT: + return 
vp9_row_scan_16x16; + case DCT_ADST: + return vp9_col_scan_16x16; + default: + return vp9_default_scan_16x16; + } +} + +enum { VP9_COEF_UPDATE_PROB = 252 }; + +#endif // VP9_COMMON_VP9_ENTROPY_H_ diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c new file mode 100644 index 000000000..33028146a --- /dev/null +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -0,0 +1,535 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_modecont.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_alloccommon.h" +#include "vpx_mem/vpx_mem.h" + +static const vp9_prob default_kf_uv_probs[VP9_INTRA_MODES] + [VP9_INTRA_MODES - 1] = { + { 144, 11, 54, 157, 195, 130, 46, 58, 108 } /* y = dc */, + { 118, 15, 123, 148, 131, 101, 44, 93, 131 } /* y = v */, + { 113, 12, 23, 188, 226, 142, 26, 32, 125 } /* y = h */, + { 120, 11, 50, 123, 163, 135, 64, 77, 103 } /* y = d45 */, + { 113, 9, 36, 155, 111, 157, 32, 44, 161 } /* y = d135 */, + { 116, 9, 55, 176, 76, 96, 37, 61, 149 } /* y = d117 */, + { 115, 9, 28, 141, 161, 167, 21, 25, 193 } /* y = d153 */, + { 120, 12, 32, 145, 195, 142, 32, 38, 86 } /* y = d27 */, + { 116, 12, 64, 120, 140, 125, 49, 115, 121 } /* y = d63 */, + { 102, 19, 66, 162, 182, 122, 35, 59, 128 } /* y = tm */ +}; + +static const vp9_prob default_if_y_probs[BLOCK_SIZE_GROUPS] + [VP9_INTRA_MODES - 1] = { + { 65, 32, 18, 144, 162, 194, 41, 51, 98 } /* block_size < 8x8 */, + { 132, 68, 18, 165, 217, 196, 45, 40, 78 } /* block_size < 16x16 */, + { 173, 80, 19, 176, 240, 193, 64, 35, 46 } /* block_size < 
32x32 */, + { 221, 135, 38, 194, 248, 121, 96, 85, 29 } /* block_size >= 32x32 */ +}; + +static const vp9_prob default_if_uv_probs[VP9_INTRA_MODES] + [VP9_INTRA_MODES - 1] = { + { 120, 7, 76, 176, 208, 126, 28, 54, 103 } /* y = dc */, + { 48, 12, 154, 155, 139, 90, 34, 117, 119 } /* y = v */, + { 67, 6, 25, 204, 243, 158, 13, 21, 96 } /* y = h */, + { 97, 5, 44, 131, 176, 139, 48, 68, 97 } /* y = d45 */, + { 83, 5, 42, 156, 111, 152, 26, 49, 152 } /* y = d135 */, + { 80, 5, 58, 178, 74, 83, 33, 62, 145 } /* y = d117 */, + { 86, 5, 32, 154, 192, 168, 14, 22, 163 } /* y = d153 */, + { 85, 5, 32, 156, 216, 148, 19, 29, 73 } /* y = d27 */, + { 77, 7, 64, 116, 132, 122, 37, 126, 120 } /* y = d63 */, + { 101, 21, 107, 181, 192, 103, 19, 67, 125 } /* y = tm */ +}; + +const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] + [PARTITION_TYPES - 1] = { + { /* frame_type = keyframe */ + /* 8x8 -> 4x4 */ + { 158, 97, 94 } /* a/l both not split */, + { 93, 24, 99 } /* a split, l not split */, + { 85, 119, 44 } /* l split, a not split */, + { 62, 59, 67 } /* a/l both split */, + /* 16x16 -> 8x8 */ + { 149, 53, 53 } /* a/l both not split */, + { 94, 20, 48 } /* a split, l not split */, + { 83, 53, 24 } /* l split, a not split */, + { 52, 18, 18 } /* a/l both split */, + /* 32x32 -> 16x16 */ + { 150, 40, 39 } /* a/l both not split */, + { 78, 12, 26 } /* a split, l not split */, + { 67, 33, 11 } /* l split, a not split */, + { 24, 7, 5 } /* a/l both split */, + /* 64x64 -> 32x32 */ + { 174, 35, 49 } /* a/l both not split */, + { 68, 11, 27 } /* a split, l not split */, + { 57, 15, 9 } /* l split, a not split */, + { 12, 3, 3 } /* a/l both split */ + }, { /* frame_type = interframe */ + /* 8x8 -> 4x4 */ + { 199, 122, 141 } /* a/l both not split */, + { 147, 63, 159 } /* a split, l not split */, + { 148, 133, 118 } /* l split, a not split */, + { 121, 104, 114 } /* a/l both split */, + /* 16x16 -> 8x8 */ + { 174, 73, 87 } /* a/l both not split */, + { 92, 41, 83 
} /* a split, l not split */, + { 82, 99, 50 } /* l split, a not split */, + { 53, 39, 39 } /* a/l both split */, + /* 32x32 -> 16x16 */ + { 177, 58, 59 } /* a/l both not split */, + { 68, 26, 63 } /* a split, l not split */, + { 52, 79, 25 } /* l split, a not split */, + { 17, 14, 12 } /* a/l both split */, + /* 64x64 -> 32x32 */ + { 222, 34, 30 } /* a/l both not split */, + { 72, 16, 44 } /* a split, l not split */, + { 58, 32, 12 } /* l split, a not split */, + { 10, 7, 6 } /* a/l both split */ + } +}; + +/* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */ +const vp9_tree_index vp9_intra_mode_tree[VP9_INTRA_MODES * 2 - 2] = { + -DC_PRED, 2, /* 0 = DC_NODE */ + -TM_PRED, 4, /* 1 = TM_NODE */ + -V_PRED, 6, /* 2 = V_NODE */ + 8, 12, /* 3 = COM_NODE */ + -H_PRED, 10, /* 4 = H_NODE */ + -D135_PRED, -D117_PRED, /* 5 = D135_NODE */ + -D45_PRED, 14, /* 6 = D45_NODE */ + -D63_PRED, 16, /* 7 = D63_NODE */ + -D153_PRED, -D27_PRED /* 8 = D153_NODE */ +}; + +const vp9_tree_index vp9_sb_mv_ref_tree[6] = { + -ZEROMV, 2, + -NEARESTMV, 4, + -NEARMV, -NEWMV +}; + +const vp9_tree_index vp9_partition_tree[6] = { + -PARTITION_NONE, 2, + -PARTITION_HORZ, 4, + -PARTITION_VERT, -PARTITION_SPLIT +}; + +struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES]; + +struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES]; + +struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; + +static const vp9_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = { + 9, 102, 187, 225 +}; + +static const vp9_prob default_comp_inter_p[COMP_INTER_CONTEXTS] = { + 239, 183, 119, 96, 41 +}; + +static const vp9_prob default_comp_ref_p[REF_CONTEXTS] = { + 50, 126, 123, 221, 226 +}; + +static const vp9_prob default_single_ref_p[REF_CONTEXTS][2] = { + { 33, 16 }, + { 77, 74 }, + { 142, 142 }, + { 172, 170 }, + { 238, 247 } +}; + +const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS] + [TX_SIZE_MAX_SB - 1] = { + { 3, 136, 37, }, + { 5, 52, 13, }, +}; +const vp9_prob 
vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS] + [TX_SIZE_MAX_SB - 2] = { + { 20, 152, }, + { 15, 101, }, +}; +const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS] + [TX_SIZE_MAX_SB - 3] = { + { 100, }, + { 66, }, +}; + +void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, + unsigned int (*ct_32x32p)[2]) { + ct_32x32p[0][0] = tx_count_32x32p[TX_4X4]; + ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] + + tx_count_32x32p[TX_16X16] + + tx_count_32x32p[TX_32X32]; + ct_32x32p[1][0] = tx_count_32x32p[TX_8X8]; + ct_32x32p[1][1] = tx_count_32x32p[TX_16X16] + + tx_count_32x32p[TX_32X32]; + ct_32x32p[2][0] = tx_count_32x32p[TX_16X16]; + ct_32x32p[2][1] = tx_count_32x32p[TX_32X32]; +} + +void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, + unsigned int (*ct_16x16p)[2]) { + ct_16x16p[0][0] = tx_count_16x16p[TX_4X4]; + ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + + tx_count_16x16p[TX_16X16]; + ct_16x16p[1][0] = tx_count_16x16p[TX_8X8]; + ct_16x16p[1][1] = tx_count_16x16p[TX_16X16]; +} + +void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, + unsigned int (*ct_8x8p)[2]) { + ct_8x8p[0][0] = tx_count_8x8p[TX_4X4]; + ct_8x8p[0][1] = tx_count_8x8p[TX_8X8]; +} + +const vp9_prob vp9_default_mbskip_probs[MBSKIP_CONTEXTS] = { + 192, 128, 64 +}; + +void vp9_init_mbmode_probs(VP9_COMMON *x) { + vpx_memcpy(x->fc.uv_mode_prob, default_if_uv_probs, + sizeof(default_if_uv_probs)); + vpx_memcpy(x->kf_uv_mode_prob, default_kf_uv_probs, + sizeof(default_kf_uv_probs)); + vpx_memcpy(x->fc.y_mode_prob, default_if_y_probs, + sizeof(default_if_y_probs)); + + vpx_memcpy(x->fc.switchable_interp_prob, vp9_switchable_interp_prob, + sizeof(vp9_switchable_interp_prob)); + + vpx_memcpy(x->fc.partition_prob, vp9_partition_probs, + sizeof(vp9_partition_probs)); + + vpx_memcpy(x->fc.intra_inter_prob, default_intra_inter_p, + sizeof(default_intra_inter_p)); + vpx_memcpy(x->fc.comp_inter_prob, default_comp_inter_p, + sizeof(default_comp_inter_p)); + 
vpx_memcpy(x->fc.comp_ref_prob, default_comp_ref_p, + sizeof(default_comp_ref_p)); + vpx_memcpy(x->fc.single_ref_prob, default_single_ref_p, + sizeof(default_single_ref_p)); + vpx_memcpy(x->fc.tx_probs_32x32p, vp9_default_tx_probs_32x32p, + sizeof(vp9_default_tx_probs_32x32p)); + vpx_memcpy(x->fc.tx_probs_16x16p, vp9_default_tx_probs_16x16p, + sizeof(vp9_default_tx_probs_16x16p)); + vpx_memcpy(x->fc.tx_probs_8x8p, vp9_default_tx_probs_8x8p, + sizeof(vp9_default_tx_probs_8x8p)); + vpx_memcpy(x->fc.mbskip_probs, vp9_default_mbskip_probs, + sizeof(vp9_default_mbskip_probs)); +} + +const vp9_tree_index vp9_switchable_interp_tree[VP9_SWITCHABLE_FILTERS*2-2] = { + -0, 2, + -1, -2 +}; +struct vp9_token vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; +const INTERPOLATIONFILTERTYPE vp9_switchable_interp[VP9_SWITCHABLE_FILTERS] = { + EIGHTTAP, EIGHTTAP_SMOOTH, EIGHTTAP_SHARP}; +const int vp9_switchable_interp_map[SWITCHABLE+1] = {1, 0, 2, -1, -1}; +const vp9_prob vp9_switchable_interp_prob [VP9_SWITCHABLE_FILTERS+1] + [VP9_SWITCHABLE_FILTERS-1] = { + { 235, 162, }, + { 36, 255, }, + { 34, 3, }, + { 149, 144, }, +}; + +// Indicates if the filter is interpolating or non-interpolating +const int vp9_is_interpolating_filter[SWITCHABLE + 1] = {1, 1, 1, 1, -1}; + +void vp9_entropy_mode_init() { + vp9_tokens_from_tree(vp9_intra_mode_encodings, vp9_intra_mode_tree); + vp9_tokens_from_tree(vp9_switchable_interp_encodings, + vp9_switchable_interp_tree); + vp9_tokens_from_tree(vp9_partition_encodings, vp9_partition_tree); + + vp9_tokens_from_tree_offset(vp9_sb_mv_ref_encoding_array, + vp9_sb_mv_ref_tree, NEARESTMV); +} + +void vp9_init_mode_contexts(VP9_COMMON *pc) { + vpx_memset(pc->fc.inter_mode_counts, 0, sizeof(pc->fc.inter_mode_counts)); + vpx_memcpy(pc->fc.inter_mode_probs, + vp9_default_inter_mode_probs, + sizeof(vp9_default_inter_mode_probs)); +} + +void vp9_accum_mv_refs(VP9_COMMON *pc, + MB_PREDICTION_MODE m, + const int context) { + unsigned int 
(*inter_mode_counts)[VP9_INTER_MODES - 1][2] = + pc->fc.inter_mode_counts; + + if (m == ZEROMV) { + ++inter_mode_counts[context][0][0]; + } else { + ++inter_mode_counts[context][0][1]; + if (m == NEARESTMV) { + ++inter_mode_counts[context][1][0]; + } else { + ++inter_mode_counts[context][1][1]; + if (m == NEARMV) { + ++inter_mode_counts[context][2][0]; + } else { + ++inter_mode_counts[context][2][1]; + } + } + } +} + +#define MVREF_COUNT_SAT 20 +#define MVREF_MAX_UPDATE_FACTOR 128 +void vp9_adapt_mode_context(VP9_COMMON *pc) { + int i, j; + unsigned int (*inter_mode_counts)[VP9_INTER_MODES - 1][2] = + pc->fc.inter_mode_counts; + vp9_prob (*mode_context)[VP9_INTER_MODES - 1] = pc->fc.inter_mode_probs; + + for (j = 0; j < INTER_MODE_CONTEXTS; j++) { + for (i = 0; i < VP9_INTER_MODES - 1; i++) { + int count = inter_mode_counts[j][i][0] + inter_mode_counts[j][i][1]; + int factor; + count = count > MVREF_COUNT_SAT ? MVREF_COUNT_SAT : count; + factor = (MVREF_MAX_UPDATE_FACTOR * count / MVREF_COUNT_SAT); + mode_context[j][i] = weighted_prob( + pc->fc.pre_inter_mode_probs[j][i], + get_binary_prob(inter_mode_counts[j][i][0], + inter_mode_counts[j][i][1]), + factor); + } + } +} + +#define MODE_COUNT_SAT 20 +#define MODE_MAX_UPDATE_FACTOR 128 +static int update_mode_ct(vp9_prob pre_prob, vp9_prob prob, + unsigned int branch_ct[2]) { + int factor, count = branch_ct[0] + branch_ct[1]; + count = count > MODE_COUNT_SAT ? 
MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + return weighted_prob(pre_prob, prob, factor); +} + +static void update_mode_probs(int n_modes, + const vp9_tree_index *tree, unsigned int *cnt, + vp9_prob *pre_probs, vp9_prob *dst_probs, + unsigned int tok0_offset) { +#define MAX_PROBS 32 + vp9_prob probs[MAX_PROBS]; + unsigned int branch_ct[MAX_PROBS][2]; + int t; + + assert(n_modes - 1 < MAX_PROBS); + vp9_tree_probs_from_distribution(tree, probs, branch_ct, cnt, tok0_offset); + for (t = 0; t < n_modes - 1; ++t) + dst_probs[t] = update_mode_ct(pre_probs[t], probs[t], branch_ct[t]); +} + +static int update_mode_ct2(vp9_prob pre_prob, unsigned int branch_ct[2]) { + return update_mode_ct(pre_prob, get_binary_prob(branch_ct[0], + branch_ct[1]), branch_ct); +} + +// #define MODE_COUNT_TESTING +void vp9_adapt_mode_probs(VP9_COMMON *cm) { + int i, j; + FRAME_CONTEXT *fc = &cm->fc; +#ifdef MODE_COUNT_TESTING + int t; + + printf("static const unsigned int\nymode_counts" + "[VP9_INTRA_MODES] = {\n"); + for (t = 0; t < VP9_INTRA_MODES; ++t) + printf("%d, ", fc->ymode_counts[t]); + printf("};\n"); + printf("static const unsigned int\nuv_mode_counts" + "[VP9_INTRA_MODES] [VP9_INTRA_MODES] = {\n"); + for (i = 0; i < VP9_INTRA_MODES; ++i) { + printf(" {"); + for (t = 0; t < VP9_INTRA_MODES; ++t) + printf("%d, ", fc->uv_mode_counts[i][t]); + printf("},\n"); + } + printf("};\n"); + printf("static const unsigned int\nbmode_counts" + "[VP9_NKF_BINTRAMODES] = {\n"); + for (t = 0; t < VP9_NKF_BINTRAMODES; ++t) + printf("%d, ", fc->bmode_counts[t]); + printf("};\n"); + printf("static const unsigned int\ni8x8_mode_counts" + "[VP9_I8X8_MODES] = {\n"); + for (t = 0; t < VP9_I8X8_MODES; ++t) + printf("%d, ", fc->i8x8_mode_counts[t]); + printf("};\n"); + printf("static const unsigned int\nmbsplit_counts" + "[VP9_NUMMBSPLITS] = {\n"); + for (t = 0; t < VP9_NUMMBSPLITS; ++t) + printf("%d, ", fc->mbsplit_counts[t]); + printf("};\n"); +#endif + + for (i = 
0; i < INTRA_INTER_CONTEXTS; i++) + fc->intra_inter_prob[i] = update_mode_ct2(fc->pre_intra_inter_prob[i], + fc->intra_inter_count[i]); + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + fc->comp_inter_prob[i] = update_mode_ct2(fc->pre_comp_inter_prob[i], + fc->comp_inter_count[i]); + for (i = 0; i < REF_CONTEXTS; i++) + fc->comp_ref_prob[i] = update_mode_ct2(fc->pre_comp_ref_prob[i], + fc->comp_ref_count[i]); + for (i = 0; i < REF_CONTEXTS; i++) + for (j = 0; j < 2; j++) + fc->single_ref_prob[i][j] = update_mode_ct2(fc->pre_single_ref_prob[i][j], + fc->single_ref_count[i][j]); + + for (i = 0; i < BLOCK_SIZE_GROUPS; i++) + update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, + fc->y_mode_counts[i], fc->pre_y_mode_prob[i], + fc->y_mode_prob[i], 0); + + for (i = 0; i < VP9_INTRA_MODES; ++i) + update_mode_probs(VP9_INTRA_MODES, vp9_intra_mode_tree, + fc->uv_mode_counts[i], fc->pre_uv_mode_prob[i], + fc->uv_mode_prob[i], 0); + + for (i = 0; i < NUM_PARTITION_CONTEXTS; i++) + update_mode_probs(PARTITION_TYPES, vp9_partition_tree, + fc->partition_counts[i], fc->pre_partition_prob[i], + fc->partition_prob[INTER_FRAME][i], 0); + + if (cm->mcomp_filter_type == SWITCHABLE) { + for (i = 0; i <= VP9_SWITCHABLE_FILTERS; i++) { + update_mode_probs(VP9_SWITCHABLE_FILTERS, vp9_switchable_interp_tree, + fc->switchable_interp_count[i], + fc->pre_switchable_interp_prob[i], + fc->switchable_interp_prob[i], 0); + } + } + if (cm->txfm_mode == TX_MODE_SELECT) { + int j; + unsigned int branch_ct_8x8p[TX_SIZE_MAX_SB - 3][2]; + unsigned int branch_ct_16x16p[TX_SIZE_MAX_SB - 2][2]; + unsigned int branch_ct_32x32p[TX_SIZE_MAX_SB - 1][2]; + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + tx_counts_to_branch_counts_8x8(cm->fc.tx_count_8x8p[i], + branch_ct_8x8p); + for (j = 0; j < TX_SIZE_MAX_SB - 3; ++j) { + int factor; + int count = branch_ct_8x8p[j][0] + branch_ct_8x8p[j][1]; + vp9_prob prob = get_binary_prob(branch_ct_8x8p[j][0], + branch_ct_8x8p[j][1]); + count = count > MODE_COUNT_SAT ? 
MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + cm->fc.tx_probs_8x8p[i][j] = weighted_prob( + cm->fc.pre_tx_probs_8x8p[i][j], prob, factor); + } + } + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + tx_counts_to_branch_counts_16x16(cm->fc.tx_count_16x16p[i], + branch_ct_16x16p); + for (j = 0; j < TX_SIZE_MAX_SB - 2; ++j) { + int factor; + int count = branch_ct_16x16p[j][0] + branch_ct_16x16p[j][1]; + vp9_prob prob = get_binary_prob(branch_ct_16x16p[j][0], + branch_ct_16x16p[j][1]); + count = count > MODE_COUNT_SAT ? MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + cm->fc.tx_probs_16x16p[i][j] = weighted_prob( + cm->fc.pre_tx_probs_16x16p[i][j], prob, factor); + } + } + for (i = 0; i < TX_SIZE_CONTEXTS; ++i) { + tx_counts_to_branch_counts_32x32(cm->fc.tx_count_32x32p[i], + branch_ct_32x32p); + for (j = 0; j < TX_SIZE_MAX_SB - 1; ++j) { + int factor; + int count = branch_ct_32x32p[j][0] + branch_ct_32x32p[j][1]; + vp9_prob prob = get_binary_prob(branch_ct_32x32p[j][0], + branch_ct_32x32p[j][1]); + count = count > MODE_COUNT_SAT ? 
MODE_COUNT_SAT : count; + factor = (MODE_MAX_UPDATE_FACTOR * count / MODE_COUNT_SAT); + cm->fc.tx_probs_32x32p[i][j] = weighted_prob( + cm->fc.pre_tx_probs_32x32p[i][j], prob, factor); + } + } + } + for (i = 0; i < MBSKIP_CONTEXTS; ++i) + fc->mbskip_probs[i] = update_mode_ct2(fc->pre_mbskip_probs[i], + fc->mbskip_count[i]); +} + +static void set_default_lf_deltas(MACROBLOCKD *xd) { + xd->mode_ref_lf_delta_enabled = 1; + xd->mode_ref_lf_delta_update = 1; + + xd->ref_lf_deltas[INTRA_FRAME] = 1; + xd->ref_lf_deltas[LAST_FRAME] = 0; + xd->ref_lf_deltas[GOLDEN_FRAME] = -1; + xd->ref_lf_deltas[ALTREF_FRAME] = -1; + + xd->mode_lf_deltas[0] = 0; // Zero + xd->mode_lf_deltas[1] = 0; // New mv +} + +void vp9_setup_past_independence(VP9_COMMON *cm, MACROBLOCKD *xd) { + // Reset the segment feature data to the default stats: + // Features disabled, 0, with delta coding (Default state). + int i; + vp9_clearall_segfeatures(xd); + xd->mb_segment_abs_delta = SEGMENT_DELTADATA; + if (cm->last_frame_seg_map) + vpx_memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); + + // Reset the mode ref deltas for loop filter + vpx_memset(xd->last_ref_lf_deltas, 0, sizeof(xd->last_ref_lf_deltas)); + vpx_memset(xd->last_mode_lf_deltas, 0, sizeof(xd->last_mode_lf_deltas)); + set_default_lf_deltas(xd); + + vp9_default_coef_probs(cm); + vp9_init_mbmode_probs(cm); + vpx_memcpy(cm->kf_y_mode_prob, vp9_kf_default_bmode_probs, + sizeof(vp9_kf_default_bmode_probs)); + vp9_init_mv_probs(cm); + + // To force update of the sharpness + cm->last_sharpness_level = -1; + + vp9_init_mode_contexts(cm); + + if ((cm->frame_type == KEY_FRAME) || + cm->error_resilient_mode || (cm->reset_frame_context == 3)) { + // Reset all frame contexts. + for (i = 0; i < NUM_FRAME_CONTEXTS; ++i) + vpx_memcpy(&cm->frame_contexts[i], &cm->fc, sizeof(cm->fc)); + } else if (cm->reset_frame_context == 2) { + // Reset only the frame context specified in the frame header. 
+ vpx_memcpy(&cm->frame_contexts[cm->frame_context_idx], &cm->fc, + sizeof(cm->fc)); + } + + vpx_memset(cm->prev_mip, 0, + cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + vpx_memset(cm->mip, 0, + cm->mode_info_stride * (cm->mi_rows + 1) * sizeof(MODE_INFO)); + + vp9_update_mode_info_border(cm, cm->mip); + vp9_update_mode_info_in_image(cm, cm->mi); + + vp9_update_mode_info_border(cm, cm->prev_mip); + vp9_update_mode_info_in_image(cm, cm->prev_mi); + + vpx_memset(cm->ref_frame_sign_bias, 0, sizeof(cm->ref_frame_sign_bias)); + + cm->frame_context_idx = 0; +} diff --git a/libvpx/vp9/common/vp9_entropymode.h b/libvpx/vp9/common/vp9_entropymode.h new file mode 100644 index 000000000..aa8aec7d2 --- /dev/null +++ b/libvpx/vp9/common/vp9_entropymode.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_ENTROPYMODE_H_ +#define VP9_COMMON_VP9_ENTROPYMODE_H_ + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_treecoder.h" + +#define SUBMVREF_COUNT 5 +#define TX_SIZE_CONTEXTS 2 + +#define VP9_MODE_UPDATE_PROB 252 + +// #define MODE_STATS + +extern int vp9_mv_cont(const int_mv *l, const int_mv *a); + + +extern const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES] + [VP9_INTRA_MODES] + [VP9_INTRA_MODES - 1]; + +extern const vp9_tree_index vp9_intra_mode_tree[]; +extern const vp9_tree_index vp9_sb_mv_ref_tree[]; + +extern struct vp9_token vp9_intra_mode_encodings[VP9_INTRA_MODES]; + +/* Inter mode values do not start at zero */ + +extern struct vp9_token vp9_sb_mv_ref_encoding_array[VP9_INTER_MODES]; + +// probability models for partition information +extern const vp9_tree_index vp9_partition_tree[]; +extern struct vp9_token vp9_partition_encodings[PARTITION_TYPES]; +extern const vp9_prob vp9_partition_probs[NUM_FRAME_TYPES] + [NUM_PARTITION_CONTEXTS] + [PARTITION_TYPES - 1]; + +void vp9_entropy_mode_init(void); + +struct VP9Common; + +/* sets up common features to forget past dependence */ +void vp9_setup_past_independence(struct VP9Common *cm, MACROBLOCKD *xd); + +void vp9_init_mbmode_probs(struct VP9Common *x); + +extern void vp9_init_mode_contexts(struct VP9Common *pc); + +extern void vp9_adapt_mode_context(struct VP9Common *pc); + +extern void vp9_accum_mv_refs(struct VP9Common *pc, + MB_PREDICTION_MODE m, + const int context); + +void vp9_adapt_mode_probs(struct VP9Common *); + +#define VP9_SWITCHABLE_FILTERS 3 /* number of switchable filters */ + +extern const INTERPOLATIONFILTERTYPE vp9_switchable_interp + [VP9_SWITCHABLE_FILTERS]; + +extern const int vp9_switchable_interp_map[SWITCHABLE + 1]; + +extern const int vp9_is_interpolating_filter[SWITCHABLE + 1]; + +extern const vp9_tree_index vp9_switchable_interp_tree + [2 * (VP9_SWITCHABLE_FILTERS - 1)]; + +extern struct vp9_token 
vp9_switchable_interp_encodings[VP9_SWITCHABLE_FILTERS]; + +extern const vp9_prob vp9_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] + [VP9_SWITCHABLE_FILTERS - 1]; + +extern const vp9_prob vp9_default_tx_probs_32x32p[TX_SIZE_CONTEXTS] + [TX_SIZE_MAX_SB - 1]; +extern const vp9_prob vp9_default_tx_probs_16x16p[TX_SIZE_CONTEXTS] + [TX_SIZE_MAX_SB - 2]; +extern const vp9_prob vp9_default_tx_probs_8x8p[TX_SIZE_CONTEXTS] + [TX_SIZE_MAX_SB - 3]; + +extern void tx_counts_to_branch_counts_32x32(unsigned int *tx_count_32x32p, + unsigned int (*ct_32x32p)[2]); +extern void tx_counts_to_branch_counts_16x16(unsigned int *tx_count_16x16p, + unsigned int (*ct_16x16p)[2]); +extern void tx_counts_to_branch_counts_8x8(unsigned int *tx_count_8x8p, + unsigned int (*ct_8x8p)[2]); +#endif // VP9_COMMON_VP9_ENTROPYMODE_H_ diff --git a/libvpx/vp9/common/vp9_entropymv.c b/libvpx/vp9/common/vp9_entropymv.c new file mode 100644 index 000000000..e07e43c8b --- /dev/null +++ b/libvpx/vp9/common/vp9_entropymv.c @@ -0,0 +1,452 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_entropymv.h" + +//#define MV_COUNT_TESTING + +#define MV_COUNT_SAT 20 +#define MV_MAX_UPDATE_FACTOR 128 + +/* Integer pel reference mv threshold for use of high-precision 1/8 mv */ +#define COMPANDED_MVREF_THRESH 8 + +/* Smooth or bias the mv-counts before prob computation */ +/* #define SMOOTH_MV_COUNTS */ + +const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2] = { + -MV_JOINT_ZERO, 2, + -MV_JOINT_HNZVZ, 4, + -MV_JOINT_HZVNZ, -MV_JOINT_HNZVNZ +}; +struct vp9_token vp9_mv_joint_encodings[MV_JOINTS]; + +const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = { + -MV_CLASS_0, 2, + -MV_CLASS_1, 4, + 6, 8, + -MV_CLASS_2, -MV_CLASS_3, + 10, 12, + -MV_CLASS_4, -MV_CLASS_5, + -MV_CLASS_6, 14, + 16, 18, + -MV_CLASS_7, -MV_CLASS_8, + -MV_CLASS_9, -MV_CLASS_10, +}; +struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; + +const vp9_tree_index vp9_mv_class0_tree [2 * CLASS0_SIZE - 2] = { + -0, -1, +}; +struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; + +const vp9_tree_index vp9_mv_fp_tree [2 * 4 - 2] = { + -0, 2, + -1, 4, + -2, -3 +}; +struct vp9_token vp9_mv_fp_encodings[4]; + +const nmv_context vp9_default_nmv_context = { + {32, 64, 96}, + { + { /* vert component */ + 128, /* sign */ + {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, /* class */ + {216}, /* class0 */ + {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */ + {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ + {64, 96, 64}, /* fp */ + 160, /* class0_hp bit */ + 128, /* hp */ + }, + { /* hor component */ + 128, /* sign */ + {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, /* class */ + {208}, /* class0 */ + {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */ + {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ + {64, 96, 64}, /* fp */ + 160, /* class0_hp bit */ + 128, /* hp */ + } + }, +}; + +MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv) { + if (mv->row == 0 && mv->col == 0) + return MV_JOINT_ZERO; + 
else if (mv->row == 0 && mv->col != 0) + return MV_JOINT_HNZVZ; + else if (mv->row != 0 && mv->col == 0) + return MV_JOINT_HZVNZ; + else + return MV_JOINT_HNZVNZ; +} + +#define mv_class_base(c) ((c) ? (CLASS0_SIZE << (c + 2)) : 0) + +MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { + MV_CLASS_TYPE c; + if (z < CLASS0_SIZE * 8) c = MV_CLASS_0; + else if (z < CLASS0_SIZE * 16) c = MV_CLASS_1; + else if (z < CLASS0_SIZE * 32) c = MV_CLASS_2; + else if (z < CLASS0_SIZE * 64) c = MV_CLASS_3; + else if (z < CLASS0_SIZE * 128) c = MV_CLASS_4; + else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5; + else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6; + else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7; + else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8; + else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9; + else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10; + else assert(0); + if (offset) + *offset = z - mv_class_base(c); + return c; +} + +int vp9_use_nmv_hp(const MV *ref) { + return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && + (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH; +} + +int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { + return mv_class_base(c) + offset; +} + +static void increment_nmv_component_count(int v, + nmv_component_counts *mvcomp, + int incr, + int usehp) { + assert (v != 0); /* should not be zero */ + mvcomp->mvcount[MV_MAX + v] += incr; +} + +static void increment_nmv_component(int v, + nmv_component_counts *mvcomp, + int incr, + int usehp) { + int s, z, c, o, d, e, f; + if (!incr) + return; + assert (v != 0); /* should not be zero */ + s = v < 0; + mvcomp->sign[s] += incr; + z = (s ? 
-v : v) - 1; /* magnitude - 1 */ + + c = vp9_get_mv_class(z, &o); + mvcomp->classes[c] += incr; + + d = (o >> 3); /* int mv data */ + f = (o >> 1) & 3; /* fractional pel mv data */ + e = (o & 1); /* high precision mv data */ + if (c == MV_CLASS_0) { + mvcomp->class0[d] += incr; + } else { + int i; + int b = c + CLASS0_BITS - 1; // number of bits + for (i = 0; i < b; ++i) + mvcomp->bits[i][((d >> i) & 1)] += incr; + } + + /* Code the fractional pel bits */ + if (c == MV_CLASS_0) { + mvcomp->class0_fp[d][f] += incr; + } else { + mvcomp->fp[f] += incr; + } + + /* Code the high precision bit */ + if (usehp) { + if (c == MV_CLASS_0) { + mvcomp->class0_hp[e] += incr; + } else { + mvcomp->hp[e] += incr; + } + } +} + +#ifdef SMOOTH_MV_COUNTS +static void smooth_counts(nmv_component_counts *mvcomp) { + static const int flen = 3; // (filter_length + 1) / 2 + static const int fval[] = {8, 3, 1}; + static const int fvalbits = 4; + int i; + unsigned int smvcount[MV_VALS]; + vpx_memcpy(smvcount, mvcomp->mvcount, sizeof(smvcount)); + smvcount[MV_MAX] = (smvcount[MV_MAX - 1] + smvcount[MV_MAX + 1]) >> 1; + for (i = flen - 1; i <= MV_VALS - flen; ++i) { + int j, s = smvcount[i] * fval[0]; + for (j = 1; j < flen; ++j) + s += (smvcount[i - j] + smvcount[i + j]) * fval[j]; + mvcomp->mvcount[i] = (s + (1 << (fvalbits - 1))) >> fvalbits; + } +} +#endif + +static void counts_to_context(nmv_component_counts *mvcomp, int usehp) { + int v; + vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount)); + for (v = 1; v <= MV_MAX; v++) { + increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp); + increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp); + } +} + +void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, + int usehp) { + const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); + mvctx->joints[j]++; + usehp = usehp && vp9_use_nmv_hp(ref); + if (mv_joint_vertical(j)) + increment_nmv_component_count(mv->row, 
&mvctx->comps[0], 1, usehp); + + if (mv_joint_horizontal(j)) + increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp); +} + +static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) { + const int count = MIN(ct[0] + ct[1], MV_COUNT_SAT); + if (count) { + const vp9_prob newp = get_binary_prob(ct[0], ct[1]); + const int factor = MV_MAX_UPDATE_FACTOR * count / MV_COUNT_SAT; + *dest = weighted_prob(prep, newp, factor); + } else { + *dest = prep; + } +} + +void vp9_counts_process(nmv_context_counts *nmv_count, int usehp) { + counts_to_context(&nmv_count->comps[0], usehp); + counts_to_context(&nmv_count->comps[1], usehp); +} + +void vp9_counts_to_nmv_context( + nmv_context_counts *nmv_count, + nmv_context *prob, + int usehp, + unsigned int (*branch_ct_joint)[2], + unsigned int (*branch_ct_sign)[2], + unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], + unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], + unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], + unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], + unsigned int (*branch_ct_fp)[4 - 1][2], + unsigned int (*branch_ct_class0_hp)[2], + unsigned int (*branch_ct_hp)[2]) { + int i, j, k; + vp9_counts_process(nmv_count, usehp); + vp9_tree_probs_from_distribution(vp9_mv_joint_tree, + prob->joints, + branch_ct_joint, + nmv_count->joints, 0); + for (i = 0; i < 2; ++i) { + const uint32_t s0 = nmv_count->comps[i].sign[0]; + const uint32_t s1 = nmv_count->comps[i].sign[1]; + + prob->comps[i].sign = get_binary_prob(s0, s1); + branch_ct_sign[i][0] = s0; + branch_ct_sign[i][1] = s1; + vp9_tree_probs_from_distribution(vp9_mv_class_tree, + prob->comps[i].classes, + branch_ct_classes[i], + nmv_count->comps[i].classes, 0); + vp9_tree_probs_from_distribution(vp9_mv_class0_tree, + prob->comps[i].class0, + branch_ct_class0[i], + nmv_count->comps[i].class0, 0); + for (j = 0; j < MV_OFFSET_BITS; ++j) { + const uint32_t b0 = nmv_count->comps[i].bits[j][0]; + const uint32_t b1 = 
nmv_count->comps[i].bits[j][1]; + + prob->comps[i].bits[j] = get_binary_prob(b0, b1); + branch_ct_bits[i][j][0] = b0; + branch_ct_bits[i][j][1] = b1; + } + } + for (i = 0; i < 2; ++i) { + for (k = 0; k < CLASS0_SIZE; ++k) { + vp9_tree_probs_from_distribution(vp9_mv_fp_tree, + prob->comps[i].class0_fp[k], + branch_ct_class0_fp[i][k], + nmv_count->comps[i].class0_fp[k], 0); + } + vp9_tree_probs_from_distribution(vp9_mv_fp_tree, + prob->comps[i].fp, + branch_ct_fp[i], + nmv_count->comps[i].fp, 0); + } + if (usehp) { + for (i = 0; i < 2; ++i) { + const uint32_t c0_hp0 = nmv_count->comps[i].class0_hp[0]; + const uint32_t c0_hp1 = nmv_count->comps[i].class0_hp[1]; + const uint32_t hp0 = nmv_count->comps[i].hp[0]; + const uint32_t hp1 = nmv_count->comps[i].hp[1]; + + prob->comps[i].class0_hp = get_binary_prob(c0_hp0, c0_hp1); + branch_ct_class0_hp[i][0] = c0_hp0; + branch_ct_class0_hp[i][1] = c0_hp1; + + prob->comps[i].hp = get_binary_prob(hp0, hp1); + branch_ct_hp[i][0] = hp0; + branch_ct_hp[i][1] = hp1; + } + } +} + +static unsigned int adapt_probs(unsigned int i, + vp9_tree tree, + vp9_prob this_probs[], + const vp9_prob last_probs[], + const unsigned int num_events[]) { + vp9_prob this_prob; + + const uint32_t left = tree[i] <= 0 + ? num_events[-tree[i]] + : adapt_probs(tree[i], tree, this_probs, last_probs, num_events); + + const uint32_t right = tree[i + 1] <= 0 + ? num_events[-tree[i + 1]] + : adapt_probs(tree[i + 1], tree, this_probs, last_probs, num_events); + + uint32_t weight = left + right; + if (weight) { + this_prob = get_binary_prob(left, right); + weight = weight > MV_COUNT_SAT ? 
MV_COUNT_SAT : weight; + this_prob = weighted_prob(last_probs[i >> 1], this_prob, + MV_MAX_UPDATE_FACTOR * weight / MV_COUNT_SAT); + } else { + this_prob = last_probs[i >> 1]; + } + this_probs[i >> 1] = this_prob; + return left + right; +} + + +void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) { + int i, j; +#ifdef MV_COUNT_TESTING + printf("joints count: "); + for (j = 0; j < MV_JOINTS; ++j) printf("%d ", cm->fc.NMVcount.joints[j]); + printf("\n"); fflush(stdout); + printf("signs count:\n"); + for (i = 0; i < 2; ++i) + printf("%d/%d ", cm->fc.NMVcount.comps[i].sign[0], cm->fc.NMVcount.comps[i].sign[1]); + printf("\n"); fflush(stdout); + printf("classes count:\n"); + for (i = 0; i < 2; ++i) { + for (j = 0; j < MV_CLASSES; ++j) + printf("%d ", cm->fc.NMVcount.comps[i].classes[j]); + printf("\n"); fflush(stdout); + } + printf("class0 count:\n"); + for (i = 0; i < 2; ++i) { + for (j = 0; j < CLASS0_SIZE; ++j) + printf("%d ", cm->fc.NMVcount.comps[i].class0[j]); + printf("\n"); fflush(stdout); + } + printf("bits count:\n"); + for (i = 0; i < 2; ++i) { + for (j = 0; j < MV_OFFSET_BITS; ++j) + printf("%d/%d ", cm->fc.NMVcount.comps[i].bits[j][0], + cm->fc.NMVcount.comps[i].bits[j][1]); + printf("\n"); fflush(stdout); + } + printf("class0_fp count:\n"); + for (i = 0; i < 2; ++i) { + for (j = 0; j < CLASS0_SIZE; ++j) { + printf("{"); + for (k = 0; k < 4; ++k) + printf("%d ", cm->fc.NMVcount.comps[i].class0_fp[j][k]); + printf("}, "); + } + printf("\n"); fflush(stdout); + } + printf("fp count:\n"); + for (i = 0; i < 2; ++i) { + for (j = 0; j < 4; ++j) + printf("%d ", cm->fc.NMVcount.comps[i].fp[j]); + printf("\n"); fflush(stdout); + } + if (usehp) { + printf("class0_hp count:\n"); + for (i = 0; i < 2; ++i) + printf("%d/%d ", cm->fc.NMVcount.comps[i].class0_hp[0], + cm->fc.NMVcount.comps[i].class0_hp[1]); + printf("\n"); fflush(stdout); + printf("hp count:\n"); + for (i = 0; i < 2; ++i) + printf("%d/%d ", cm->fc.NMVcount.comps[i].hp[0], + cm->fc.NMVcount.comps[i].hp[1]); + 
printf("\n"); fflush(stdout); + } +#endif +#ifdef SMOOTH_MV_COUNTS + smooth_counts(&cm->fc.NMVcount.comps[0]); + smooth_counts(&cm->fc.NMVcount.comps[1]); +#endif + vp9_counts_process(&cm->fc.NMVcount, usehp); + + adapt_probs(0, vp9_mv_joint_tree, + cm->fc.nmvc.joints, cm->fc.pre_nmvc.joints, + cm->fc.NMVcount.joints); + + for (i = 0; i < 2; ++i) { + adapt_prob(&cm->fc.nmvc.comps[i].sign, + cm->fc.pre_nmvc.comps[i].sign, + cm->fc.NMVcount.comps[i].sign); + adapt_probs(0, vp9_mv_class_tree, + cm->fc.nmvc.comps[i].classes, cm->fc.pre_nmvc.comps[i].classes, + cm->fc.NMVcount.comps[i].classes); + adapt_probs(0, vp9_mv_class0_tree, + cm->fc.nmvc.comps[i].class0, cm->fc.pre_nmvc.comps[i].class0, + cm->fc.NMVcount.comps[i].class0); + for (j = 0; j < MV_OFFSET_BITS; ++j) { + adapt_prob(&cm->fc.nmvc.comps[i].bits[j], + cm->fc.pre_nmvc.comps[i].bits[j], + cm->fc.NMVcount.comps[i].bits[j]); + } + } + for (i = 0; i < 2; ++i) { + for (j = 0; j < CLASS0_SIZE; ++j) { + adapt_probs(0, vp9_mv_fp_tree, + cm->fc.nmvc.comps[i].class0_fp[j], + cm->fc.pre_nmvc.comps[i].class0_fp[j], + cm->fc.NMVcount.comps[i].class0_fp[j]); + } + adapt_probs(0, vp9_mv_fp_tree, + cm->fc.nmvc.comps[i].fp, + cm->fc.pre_nmvc.comps[i].fp, + cm->fc.NMVcount.comps[i].fp); + } + if (usehp) { + for (i = 0; i < 2; ++i) { + adapt_prob(&cm->fc.nmvc.comps[i].class0_hp, + cm->fc.pre_nmvc.comps[i].class0_hp, + cm->fc.NMVcount.comps[i].class0_hp); + adapt_prob(&cm->fc.nmvc.comps[i].hp, + cm->fc.pre_nmvc.comps[i].hp, + cm->fc.NMVcount.comps[i].hp); + } + } +} + +void vp9_entropy_mv_init() { + vp9_tokens_from_tree(vp9_mv_joint_encodings, vp9_mv_joint_tree); + vp9_tokens_from_tree(vp9_mv_class_encodings, vp9_mv_class_tree); + vp9_tokens_from_tree(vp9_mv_class0_encodings, vp9_mv_class0_tree); + vp9_tokens_from_tree(vp9_mv_fp_encodings, vp9_mv_fp_tree); +} + +void vp9_init_mv_probs(VP9_COMMON *cm) { + vpx_memcpy(&cm->fc.nmvc, &vp9_default_nmv_context, sizeof(nmv_context)); +} diff --git a/libvpx/vp9/common/vp9_entropymv.h 
b/libvpx/vp9/common/vp9_entropymv.h new file mode 100644 index 000000000..15994a6ae --- /dev/null +++ b/libvpx/vp9/common/vp9_entropymv.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_COMMON_VP9_ENTROPYMV_H_ +#define VP9_COMMON_VP9_ENTROPYMV_H_ + +#include "vp9/common/vp9_treecoder.h" +#include "vpx_config.h" +#include "vp9/common/vp9_blockd.h" + +struct VP9Common; + +void vp9_entropy_mv_init(); +void vp9_init_mv_probs(struct VP9Common *cm); + +void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp); +int vp9_use_nmv_hp(const MV *ref); + +#define VP9_NMV_UPDATE_PROB 252 + +//#define MV_GROUP_UPDATE + +#define LOW_PRECISION_MV_UPDATE /* Use 7 bit forward update */ + +/* Symbols for coding which components are zero jointly */ +#define MV_JOINTS 4 +typedef enum { + MV_JOINT_ZERO = 0, /* Zero vector */ + MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */ + MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */ + MV_JOINT_HNZVNZ = 3, /* Both components nonzero */ +} MV_JOINT_TYPE; + +static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) { + return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ; +} + +static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) { + return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ; +} + +extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2]; +extern struct vp9_token vp9_mv_joint_encodings[MV_JOINTS]; + +/* Symbols for coding magnitude class of nonzero components */ +#define MV_CLASSES 11 +typedef enum { + MV_CLASS_0 = 0, /* (0, 2] integer pel */ + MV_CLASS_1 = 1, /* (2, 4] integer pel */ + MV_CLASS_2 = 2, /* (4, 8] integer pel */ 
+ MV_CLASS_3 = 3, /* (8, 16] integer pel */ + MV_CLASS_4 = 4, /* (16, 32] integer pel */ + MV_CLASS_5 = 5, /* (32, 64] integer pel */ + MV_CLASS_6 = 6, /* (64, 128] integer pel */ + MV_CLASS_7 = 7, /* (128, 256] integer pel */ + MV_CLASS_8 = 8, /* (256, 512] integer pel */ + MV_CLASS_9 = 9, /* (512, 1024] integer pel */ + MV_CLASS_10 = 10, /* (1024,2048] integer pel */ +} MV_CLASS_TYPE; + +extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2]; +extern struct vp9_token vp9_mv_class_encodings[MV_CLASSES]; + +#define CLASS0_BITS 1 /* bits at integer precision for class 0 */ +#define CLASS0_SIZE (1 << CLASS0_BITS) +#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2) + +#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2) +#define MV_MAX ((1 << MV_MAX_BITS) - 1) +#define MV_VALS ((MV_MAX << 1) + 1) + +extern const vp9_tree_index vp9_mv_class0_tree[2 * CLASS0_SIZE - 2]; +extern struct vp9_token vp9_mv_class0_encodings[CLASS0_SIZE]; + +extern const vp9_tree_index vp9_mv_fp_tree[2 * 4 - 2]; +extern struct vp9_token vp9_mv_fp_encodings[4]; + +typedef struct { + vp9_prob sign; + vp9_prob classes[MV_CLASSES - 1]; + vp9_prob class0[CLASS0_SIZE - 1]; + vp9_prob bits[MV_OFFSET_BITS]; + vp9_prob class0_fp[CLASS0_SIZE][4 - 1]; + vp9_prob fp[4 - 1]; + vp9_prob class0_hp; + vp9_prob hp; +} nmv_component; + +typedef struct { + vp9_prob joints[MV_JOINTS - 1]; + nmv_component comps[2]; +} nmv_context; + +MV_JOINT_TYPE vp9_get_mv_joint(const MV *mv); +MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset); +int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset); + + +typedef struct { + unsigned int mvcount[MV_VALS]; + unsigned int sign[2]; + unsigned int classes[MV_CLASSES]; + unsigned int class0[CLASS0_SIZE]; + unsigned int bits[MV_OFFSET_BITS][2]; + unsigned int class0_fp[CLASS0_SIZE][4]; + unsigned int fp[4]; + unsigned int class0_hp[2]; + unsigned int hp[2]; +} nmv_component_counts; + +typedef struct { + unsigned int joints[MV_JOINTS]; + nmv_component_counts comps[2]; +} 
nmv_context_counts; + +void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, + int usehp); +extern const nmv_context vp9_default_nmv_context; +void vp9_counts_to_nmv_context( + nmv_context_counts *NMVcount, + nmv_context *prob, + int usehp, + unsigned int (*branch_ct_joint)[2], + unsigned int (*branch_ct_sign)[2], + unsigned int (*branch_ct_classes)[MV_CLASSES - 1][2], + unsigned int (*branch_ct_class0)[CLASS0_SIZE - 1][2], + unsigned int (*branch_ct_bits)[MV_OFFSET_BITS][2], + unsigned int (*branch_ct_class0_fp)[CLASS0_SIZE][4 - 1][2], + unsigned int (*branch_ct_fp)[4 - 1][2], + unsigned int (*branch_ct_class0_hp)[2], + unsigned int (*branch_ct_hp)[2]); +void vp9_counts_process(nmv_context_counts *NMVcount, int usehp); + +#endif // VP9_COMMON_VP9_ENTROPYMV_H_ diff --git a/libvpx/vp9/common/vp9_enums.h b/libvpx/vp9/common/vp9_enums.h new file mode 100644 index 000000000..e18d353d3 --- /dev/null +++ b/libvpx/vp9/common/vp9_enums.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_ENUMS_H_ +#define VP9_COMMON_VP9_ENUMS_H_ + +#include "./vpx_config.h" + +#define LOG2_MI_SIZE 3 + +#define MI_SIZE (1 << LOG2_MI_SIZE) +#define MI_MASK ((64 >> LOG2_MI_SIZE) - 1) + +typedef enum BLOCK_SIZE_TYPE { + BLOCK_SIZE_AB4X4, + BLOCK_SIZE_SB4X8, + BLOCK_SIZE_SB8X4, + BLOCK_SIZE_SB8X8, + BLOCK_SIZE_SB8X16, + BLOCK_SIZE_SB16X8, + BLOCK_SIZE_MB16X16, + BLOCK_SIZE_SB16X32, + BLOCK_SIZE_SB32X16, + BLOCK_SIZE_SB32X32, + BLOCK_SIZE_SB32X64, + BLOCK_SIZE_SB64X32, + BLOCK_SIZE_SB64X64, + BLOCK_SIZE_TYPES +} BLOCK_SIZE_TYPE; + +typedef enum PARTITION_TYPE { + PARTITION_NONE, + PARTITION_HORZ, + PARTITION_VERT, + PARTITION_SPLIT, + PARTITION_TYPES +} PARTITION_TYPE; + +#define PARTITION_PLOFFSET 4 // number of probability models per block size +#define NUM_PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET) + +#endif // VP9_COMMON_VP9_ENUMS_H_ diff --git a/libvpx/vp9/common/vp9_extend.c b/libvpx/vp9/common/vp9_extend.c new file mode 100644 index 000000000..95ec59061 --- /dev/null +++ b/libvpx/vp9/common/vp9_extend.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9/common/vp9_extend.h" +#include "vpx_mem/vpx_mem.h" + +static void copy_and_extend_plane(const uint8_t *src, int src_pitch, + uint8_t *dst, int dst_pitch, + int w, int h, + int extend_top, int extend_left, + int extend_bottom, int extend_right) { + int i, linesize; + + // copy the left and right most columns out + const uint8_t *src_ptr1 = src; + const uint8_t *src_ptr2 = src + w - 1; + uint8_t *dst_ptr1 = dst - extend_left; + uint8_t *dst_ptr2 = dst + w; + + for (i = 0; i < h; i++) { + vpx_memset(dst_ptr1, src_ptr1[0], extend_left); + vpx_memcpy(dst_ptr1 + extend_left, src_ptr1, w); + vpx_memset(dst_ptr2, src_ptr2[0], extend_right); + src_ptr1 += src_pitch; + src_ptr2 += src_pitch; + dst_ptr1 += dst_pitch; + dst_ptr2 += dst_pitch; + } + + // Now copy the top and bottom lines into each line of the respective + // borders + src_ptr1 = dst - extend_left; + src_ptr2 = dst + dst_pitch * (h - 1) - extend_left; + dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left; + dst_ptr2 = dst + dst_pitch * (h) - extend_left; + linesize = extend_left + extend_right + w; + + for (i = 0; i < extend_top; i++) { + vpx_memcpy(dst_ptr1, src_ptr1, linesize); + dst_ptr1 += dst_pitch; + } + + for (i = 0; i < extend_bottom; i++) { + vpx_memcpy(dst_ptr2, src_ptr2, linesize); + dst_ptr2 += dst_pitch; + } +} + +void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { + const int et_y = dst->border; + const int el_y = dst->border; + const int eb_y = dst->border + dst->y_height - src->y_height; + const int er_y = dst->border + dst->y_width - src->y_width; + + const int et_uv = dst->border >> (dst->uv_height != dst->y_height); + const int el_uv = dst->border >> (dst->uv_width != dst->y_width); + const int eb_uv = et_uv + dst->uv_height - src->uv_height; + const int er_uv = el_uv + dst->uv_width - src->uv_width; + +#if CONFIG_ALPHA + const int et_a = dst->border >> (dst->alpha_height != dst->y_height); + const int el_a = dst->border >> 
(dst->alpha_width != dst->y_width); + const int eb_a = et_a + dst->alpha_height - src->alpha_height; + const int er_a = el_a + dst->alpha_width - src->alpha_width; + + copy_and_extend_plane(src->alpha_buffer, src->alpha_stride, + dst->alpha_buffer, dst->alpha_stride, + src->alpha_width, src->alpha_height, + et_a, el_a, eb_a, er_a); +#endif + + copy_and_extend_plane(src->y_buffer, src->y_stride, + dst->y_buffer, dst->y_stride, + src->y_width, src->y_height, + et_y, el_y, eb_y, er_y); + + copy_and_extend_plane(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, + src->uv_width, src->uv_height, + et_uv, el_uv, eb_uv, er_uv); + + copy_and_extend_plane(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, + src->uv_width, src->uv_height, + et_uv, el_uv, eb_uv, er_uv); +} + +void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + int srcy, int srcx, + int srch, int srcw) { + // If the side is not touching the bounder then don't extend. + const int et_y = srcy ? 0 : dst->border; + const int el_y = srcx ? 0 : dst->border; + const int eb_y = srcy + srch != src->y_height ? 0 : + dst->border + dst->y_height - src->y_height; + const int er_y = srcx + srcw != src->y_width ? 
0 : + dst->border + dst->y_width - src->y_width; + const int src_y_offset = srcy * src->y_stride + srcx; + const int dst_y_offset = srcy * dst->y_stride + srcx; + + const int et_uv = (et_y + 1) >> 1; + const int el_uv = (el_y + 1) >> 1; + const int eb_uv = (eb_y + 1) >> 1; + const int er_uv = (er_y + 1) >> 1; + const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1); + const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1); + const int srch_uv = (srch + 1) >> 1; + const int srcw_uv = (srcw + 1) >> 1; + + copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride, + dst->y_buffer + dst_y_offset, dst->y_stride, + srcw, srch, + et_y, el_y, eb_y, er_y); + + copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride, + dst->u_buffer + dst_uv_offset, dst->uv_stride, + srcw_uv, srch_uv, + et_uv, el_uv, eb_uv, er_uv); + + copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride, + dst->v_buffer + dst_uv_offset, dst->uv_stride, + srcw_uv, srch_uv, + et_uv, el_uv, eb_uv, er_uv); +} diff --git a/libvpx/vp9/common/vp9_extend.h b/libvpx/vp9/common/vp9_extend.h new file mode 100644 index 000000000..7ff79b7b6 --- /dev/null +++ b/libvpx/vp9/common/vp9_extend.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_EXTEND_H_ +#define VP9_COMMON_VP9_EXTEND_H_ + +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" + + +void vp9_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); + +void vp9_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + int srcy, int srcx, + int srch, int srcw); +#endif // VP9_COMMON_VP9_EXTEND_H_ diff --git a/libvpx/vp9/common/vp9_filter.c b/libvpx/vp9/common/vp9_filter.c new file mode 100644 index 000000000..e5503cdd9 --- /dev/null +++ b/libvpx/vp9/common/vp9_filter.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include <stdlib.h> +#include "vp9/common/vp9_filter.h" +#include "vpx_ports/mem.h" +#include "vp9_rtcd.h" +#include "vp9/common/vp9_common.h" + +DECLARE_ALIGNED(256, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = { + { 0, 0, 0, 128, 0, 0, 0, 0 }, + { 0, 0, 0, 120, 8, 0, 0, 0 }, + { 0, 0, 0, 112, 16, 0, 0, 0 }, + { 0, 0, 0, 104, 24, 0, 0, 0 }, + { 0, 0, 0, 96, 32, 0, 0, 0 }, + { 0, 0, 0, 88, 40, 0, 0, 0 }, + { 0, 0, 0, 80, 48, 0, 0, 0 }, + { 0, 0, 0, 72, 56, 0, 0, 0 }, + { 0, 0, 0, 64, 64, 0, 0, 0 }, + { 0, 0, 0, 56, 72, 0, 0, 0 }, + { 0, 0, 0, 48, 80, 0, 0, 0 }, + { 0, 0, 0, 40, 88, 0, 0, 0 }, + { 0, 0, 0, 32, 96, 0, 0, 0 }, + { 0, 0, 0, 24, 104, 0, 0, 0 }, + { 0, 0, 0, 16, 112, 0, 0, 0 }, + { 0, 0, 0, 8, 120, 0, 0, 0 } +}; + +DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]) = { + /* Lagrangian interpolation filter */ + { 0, 0, 0, 128, 0, 0, 0, 0}, + { 0, 1, -5, 126, 8, -3, 1, 0}, + { -1, 3, -10, 122, 18, -6, 2, 0}, + { -1, 4, -13, 118, 27, -9, 3, -1}, + { -1, 4, -16, 112, 37, -11, 4, -1}, + { -1, 5, -18, 105, 48, -14, 4, -1}, + { -1, 5, -19, 97, 58, -16, 5, -1}, + { -1, 6, -19, 88, 68, -18, 5, -1}, + { -1, 6, -19, 78, 78, -19, 6, -1}, + { -1, 5, -18, 68, 88, -19, 6, -1}, + { -1, 5, -16, 58, 97, -19, 5, -1}, + { -1, 4, -14, 48, 105, -18, 5, -1}, + { -1, 4, -11, 37, 112, -16, 4, -1}, + { -1, 3, -9, 27, 118, -13, 4, -1}, + { 0, 2, -6, 18, 122, -10, 3, -1}, + { 0, 1, -3, 8, 126, -5, 1, 0} +}; + +DECLARE_ALIGNED(256, const int16_t, vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]) + = { + /* dct based filter */ + {0, 0, 0, 128, 0, 0, 0, 0}, + {-1, 3, -7, 127, 8, -3, 1, 0}, + {-2, 5, -13, 125, 17, -6, 3, -1}, + {-3, 7, -17, 121, 27, -10, 5, -2}, + {-4, 9, -20, 115, 37, -13, 6, -2}, + {-4, 10, -23, 108, 48, -16, 8, -3}, + {-4, 10, -24, 100, 59, -19, 9, -3}, + {-4, 11, -24, 90, 70, -21, 10, -4}, + {-4, 11, -23, 80, 80, -23, 11, -4}, + {-4, 10, -21, 70, 90, -24, 11, -4}, + {-3, 9, -19, 59, 100, -24, 10, -4}, + {-3, 8, -16, 48, 108, 
-23, 10, -4}, + {-2, 6, -13, 37, 115, -20, 9, -4}, + {-2, 5, -10, 27, 121, -17, 7, -3}, + {-1, 3, -6, 17, 125, -13, 5, -2}, + {0, 1, -3, 8, 127, -7, 3, -1} +}; + +DECLARE_ALIGNED(256, const int16_t, + vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]) = { + /* freqmultiplier = 0.5 */ + { 0, 0, 0, 128, 0, 0, 0, 0}, + {-3, -1, 32, 64, 38, 1, -3, 0}, + {-2, -2, 29, 63, 41, 2, -3, 0}, + {-2, -2, 26, 63, 43, 4, -4, 0}, + {-2, -3, 24, 62, 46, 5, -4, 0}, + {-2, -3, 21, 60, 49, 7, -4, 0}, + {-1, -4, 18, 59, 51, 9, -4, 0}, + {-1, -4, 16, 57, 53, 12, -4, -1}, + {-1, -4, 14, 55, 55, 14, -4, -1}, + {-1, -4, 12, 53, 57, 16, -4, -1}, + { 0, -4, 9, 51, 59, 18, -4, -1}, + { 0, -4, 7, 49, 60, 21, -3, -2}, + { 0, -4, 5, 46, 62, 24, -3, -2}, + { 0, -4, 4, 43, 63, 26, -2, -2}, + { 0, -3, 2, 41, 63, 29, -2, -2}, + { 0, -3, 1, 38, 64, 32, -1, -3} +}; diff --git a/libvpx/vp9/common/vp9_filter.h b/libvpx/vp9/common/vp9_filter.h new file mode 100644 index 000000000..1ccfdaac2 --- /dev/null +++ b/libvpx/vp9/common/vp9_filter.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_FILTER_H_ +#define VP9_COMMON_VP9_FILTER_H_ + +#include "vpx_config.h" +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" + +#define BLOCK_HEIGHT_WIDTH 4 +#define VP9_FILTER_WEIGHT 128 +#define VP9_FILTER_SHIFT 7 + +#define SUBPEL_SHIFTS 16 + +extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8]; +extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]; +extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8]; +extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8]; +extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8]; + +// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear +// filter kernel as a 2 tap filter. +#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \ + sizeof(vp9_bilinear_filters[0][0])) +#define BF_OFFSET (BF_LENGTH / 2 - 1) +#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET) + +#endif // VP9_COMMON_VP9_FILTER_H_ diff --git a/libvpx/vp9/common/vp9_findnearmv.c b/libvpx/vp9/common/vp9_findnearmv.c new file mode 100644 index 000000000..a6922715e --- /dev/null +++ b/libvpx/vp9/common/vp9_findnearmv.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <limits.h> + +#include "vp9/common/vp9_findnearmv.h" +#include "vp9/common/vp9_mvref_common.h" +#include "vp9/common/vp9_sadmxn.h" + +static void lower_mv_precision(int_mv *mv, int usehp) { + if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) { + if (mv->as_mv.row & 1) + mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1); + if (mv->as_mv.col & 1) + mv->as_mv.col += (mv->as_mv.col > 0 ? 
-1 : 1); + } +} + + +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, + int_mv *mvlist, + int_mv *nearest, + int_mv *near) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv); + clamp_mv2(&mvlist[i], xd); + } + *nearest = mvlist[0]; + *near = mvlist[1]; +} + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, + int_mv *dst_nearest, + int_mv *dst_near, + int block_idx, int ref_idx) { + int_mv dst_list[MAX_MV_REF_CANDIDATES]; + int_mv mv_list[MAX_MV_REF_CANDIDATES]; + MODE_INFO *mi = xd->mode_info_context; + MB_MODE_INFO *const mbmi = &mi->mbmi; + + assert(ref_idx == 0 || ref_idx == 1); + assert(MAX_MV_REF_CANDIDATES == 2); // makes code here slightly easier + + vp9_find_mv_refs_idx(cm, xd, xd->mode_info_context, + xd->prev_mode_info_context, + mbmi->ref_frame[ref_idx], + mv_list, cm->ref_frame_sign_bias, block_idx); + + dst_list[1].as_int = 0; + if (block_idx == 0) { + memcpy(dst_list, mv_list, MAX_MV_REF_CANDIDATES * sizeof(int_mv)); + } else if (block_idx == 1 || block_idx == 2) { + int dst = 0, n; + union b_mode_info *bmi = mi->bmi; + + dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int; + for (n = 0; dst < MAX_MV_REF_CANDIDATES && + n < MAX_MV_REF_CANDIDATES; n++) + if (mv_list[n].as_int != dst_list[0].as_int) + dst_list[dst++].as_int = mv_list[n].as_int; + } else { + int dst = 0, n; + union b_mode_info *bmi = mi->bmi; + + assert(block_idx == 3); + dst_list[dst++].as_int = bmi[2].as_mv[ref_idx].as_int; + if (dst_list[0].as_int != bmi[1].as_mv[ref_idx].as_int) + dst_list[dst++].as_int = bmi[1].as_mv[ref_idx].as_int; + if (dst < MAX_MV_REF_CANDIDATES && + dst_list[0].as_int != bmi[0].as_mv[ref_idx].as_int) + dst_list[dst++].as_int = bmi[0].as_mv[ref_idx].as_int; + for (n = 0; dst < MAX_MV_REF_CANDIDATES && + n < MAX_MV_REF_CANDIDATES; n++) + if (mv_list[n].as_int != dst_list[0].as_int) + dst_list[dst++].as_int = 
mv_list[n].as_int; + } + + dst_nearest->as_int = dst_list[0].as_int; + dst_near->as_int = dst_list[1].as_int; +} diff --git a/libvpx/vp9/common/vp9_findnearmv.h b/libvpx/vp9/common/vp9_findnearmv.h new file mode 100644 index 000000000..d4ae2102d --- /dev/null +++ b/libvpx/vp9/common/vp9_findnearmv.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_COMMON_VP9_FINDNEARMV_H_ +#define VP9_COMMON_VP9_FINDNEARMV_H_ + +#include "vp9/common/vp9_mv.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_treecoder.h" +#include "vp9/common/vp9_onyxc_int.h" + +#define LEFT_TOP_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3) + +// check a list of motion vectors by sad score using a number rows of pixels +// above and a number cols of pixels in the left to select the one with best +// score to use as ref motion vector +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, + int_mv *mvlist, + int_mv *nearest, + int_mv *near); + +static void mv_bias(int refmb_ref_frame_sign_bias, int refframe, + int_mv *mvp, const int *ref_frame_sign_bias) { + MV xmv = mvp->as_mv; + + if (refmb_ref_frame_sign_bias != ref_frame_sign_bias[refframe]) { + xmv.row *= -1; + xmv.col *= -1; + } + + mvp->as_mv = xmv; +} + +// TODO(jingning): this mv clamping function should be block size dependent. 
+static void clamp_mv(int_mv *mv, + int mb_to_left_edge, + int mb_to_right_edge, + int mb_to_top_edge, + int mb_to_bottom_edge) { + mv->as_mv.col = clamp(mv->as_mv.col, mb_to_left_edge, mb_to_right_edge); + mv->as_mv.row = clamp(mv->as_mv.row, mb_to_top_edge, mb_to_bottom_edge); +} + +static int clamp_mv2(int_mv *mv, const MACROBLOCKD *xd) { + int_mv tmp_mv; + tmp_mv.as_int = mv->as_int; + clamp_mv(mv, + xd->mb_to_left_edge - LEFT_TOP_MARGIN, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); + return tmp_mv.as_int != mv->as_int; +} + +static int check_mv_bounds(int_mv *mv, + int mb_to_left_edge, int mb_to_right_edge, + int mb_to_top_edge, int mb_to_bottom_edge) { + return mv->as_mv.col < mb_to_left_edge || + mv->as_mv.col > mb_to_right_edge || + mv->as_mv.row < mb_to_top_edge || + mv->as_mv.row > mb_to_bottom_edge; +} + +void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *pc, + MACROBLOCKD *xd, + int_mv *dst_nearest, + int_mv *dst_near, + int block_idx, int ref_idx); + +static MB_PREDICTION_MODE left_block_mode(const MODE_INFO *cur_mb, int b) { + // FIXME(rbultje, jingning): temporary hack because jenkins doesn't + // understand this condition. This will go away soon. 
+ if (b == 0 || b == 2) { + /* On L edge, get from MB to left of us */ + --cur_mb; + + if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) { + return DC_PRED; + } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { + return ((cur_mb->bmi + 1 + b)->as_mode.first); + } else { + return cur_mb->mbmi.mode; + } + } + assert(b == 1 || b == 3); + return (cur_mb->bmi + b - 1)->as_mode.first; +} + +static MB_PREDICTION_MODE above_block_mode(const MODE_INFO *cur_mb, + int b, int mi_stride) { + if (!(b >> 1)) { + /* On top edge, get from MB above us */ + cur_mb -= mi_stride; + + if (cur_mb->mbmi.ref_frame[0] != INTRA_FRAME) { + return DC_PRED; + } else if (cur_mb->mbmi.sb_type < BLOCK_SIZE_SB8X8) { + return ((cur_mb->bmi + 2 + b)->as_mode.first); + } else { + return cur_mb->mbmi.mode; + } + } + + return (cur_mb->bmi + b - 2)->as_mode.first; +} + +#endif // VP9_COMMON_VP9_FINDNEARMV_H_ diff --git a/libvpx/vp9/common/vp9_idct.c b/libvpx/vp9/common/vp9_idct.c new file mode 100644 index 000000000..dcc7f0330 --- /dev/null +++ b/libvpx/vp9/common/vp9_idct.c @@ -0,0 +1,1310 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <math.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_systemdependent.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_idct.h" + +void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { +/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, + 0.5 shifts per pixel. 
*/ + int i; + int16_t output[16]; + int a1, b1, c1, d1, e1; + int16_t *ip = input; + int16_t *op = output; + + for (i = 0; i < 4; i++) { + a1 = ip[0] >> WHT_UPSCALE_FACTOR; + c1 = ip[1] >> WHT_UPSCALE_FACTOR; + d1 = ip[2] >> WHT_UPSCALE_FACTOR; + b1 = ip[3] >> WHT_UPSCALE_FACTOR; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + op[0] = a1; + op[1] = b1; + op[2] = c1; + op[3] = d1; + ip += 4; + op += 4; + } + + ip = output; + for (i = 0; i < 4; i++) { + a1 = ip[4 * 0]; + c1 = ip[4 * 1]; + d1 = ip[4 * 2]; + b1 = ip[4 * 3]; + a1 += c1; + d1 -= b1; + e1 = (a1 - d1) >> 1; + b1 = e1 - b1; + c1 = e1 - c1; + a1 -= b1; + d1 += c1; + dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); + dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1); + dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1); + dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1); + + ip++; + dest++; + } +} + +void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) { + int i; + int a1, e1; + int16_t tmp[4]; + int16_t *ip = in; + int16_t *op = tmp; + + a1 = ip[0] >> WHT_UPSCALE_FACTOR; + e1 = a1 >> 1; + a1 -= e1; + op[0] = a1; + op[1] = op[2] = op[3] = e1; + + ip = tmp; + for (i = 0; i < 4; i++) { + e1 = ip[0] >> 1; + a1 = ip[0] - e1; + dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); + dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); + dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); + dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); + ip++; + dest++; + } +} + +void vp9_idct4_1d_c(int16_t *input, int16_t *output) { + int16_t step[4]; + int temp1, temp2; + // stage 1 + temp1 = (input[0] + input[2]) * cospi_16_64; + temp2 = (input[0] - input[2]) * cospi_16_64; + step[0] = dct_const_round_shift(temp1); + step[1] = dct_const_round_shift(temp2); + temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; + temp2 = input[1] * cospi_8_64 
+ input[3] * cospi_24_64; + step[2] = dct_const_round_shift(temp1); + step[3] = dct_const_round_shift(temp2); + + // stage 2 + output[0] = step[0] + step[3]; + output[1] = step[1] + step[2]; + output[2] = step[1] - step[2]; + output[3] = step[0] - step[3]; +} + +void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) { + int16_t out[4 * 4]; + int16_t *outptr = out; + int i, j; + int16_t temp_in[4], temp_out[4]; + + // Rows + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = input[j]; + vp9_idct4_1d(temp_in, outptr); + input += 4; + outptr += 4; + } + + // Columns + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + vp9_idct4_1d(temp_in, temp_out); + for (j = 0; j < 4; ++j) + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * dest_stride + i]); + } +} + +void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) { + int i; + int a1; + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (i = 0; i < 4; i++) { + dest[0] = clip_pixel(dest[0] + a1); + dest[1] = clip_pixel(dest[1] + a1); + dest[2] = clip_pixel(dest[2] + a1); + dest[3] = clip_pixel(dest[3] + a1); + dest += dest_stride; + } +} + +void vp9_dc_only_idct_add_c(int input_dc, uint8_t *pred_ptr, + uint8_t *dst_ptr, int pitch, int stride) { + int a1; + int r, c; + int16_t out = dct_const_round_shift(input_dc * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + for (r = 0; r < 4; r++) { + for (c = 0; c < 4; c++) + dst_ptr[c] = clip_pixel(a1 + pred_ptr[c]); + + dst_ptr += stride; + pred_ptr += pitch; + } +} + +static void idct8_1d(int16_t *input, int16_t *output) { + int16_t step1[8], step2[8]; + int temp1, temp2; + // stage 1 + step1[0] = input[0]; + step1[2] = input[4]; + step1[1] = input[2]; + step1[3] = input[6]; + temp1 = input[1] * 
cospi_28_64 - input[7] * cospi_4_64; + temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[4] = dct_const_round_shift(temp1); + step1[7] = dct_const_round_shift(temp2); + temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; + temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + + // stage 2 & stage 3 - even half + vp9_idct4_1d(step1, step1); + + // stage 2 - odd half + step2[4] = step1[4] + step1[5]; + step2[5] = step1[4] - step1[5]; + step2[6] = -step1[6] + step1[7]; + step2[7] = step1[6] + step1[7]; + + // stage 3 -odd half + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + step1[7] = step2[7]; + + // stage 4 + output[0] = step1[0] + step1[7]; + output[1] = step1[1] + step1[6]; + output[2] = step1[2] + step1[5]; + output[3] = step1[3] + step1[4]; + output[4] = step1[3] - step1[4]; + output[5] = step1[2] - step1[5]; + output[6] = step1[1] - step1[6]; + output[7] = step1[0] - step1[7]; +} + +void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) { + int16_t out[8 * 8]; + int16_t *outptr = out; + int i, j; + int16_t temp_in[8], temp_out[8]; + + // First transform rows + for (i = 0; i < 8; ++i) { + idct8_1d(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + idct8_1d(temp_in, temp_out); + for (j = 0; j < 8; ++j) + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); + } +} + +static void iadst4_1d(int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + + int x0 = input[0]; + int x1 = input[1]; + int x2 = input[2]; + int x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + output[0] = output[1] = output[2] = 
output[3] = 0; + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = x0 - x2 + x3; + + x0 = s0 + s3 + s5; + x1 = s1 - s4 - s6; + x2 = sinpi_3_9 * s7; + x3 = s2; + + s0 = x0 + x3; + s1 = x1 + x3; + s2 = x2; + s3 = x0 + x1 - x3; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = dct_const_round_shift(s0); + output[1] = dct_const_round_shift(s1); + output[2] = dct_const_round_shift(s2); + output[3] = dct_const_round_shift(s3); +} + +void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride, + int tx_type) { + const transform_2d IHT_4[] = { + { vp9_idct4_1d, vp9_idct4_1d }, // DCT_DCT = 0 + { iadst4_1d, vp9_idct4_1d }, // ADST_DCT = 1 + { vp9_idct4_1d, iadst4_1d }, // DCT_ADST = 2 + { iadst4_1d, iadst4_1d } // ADST_ADST = 3 + }; + + int i, j; + int16_t out[4 * 4]; + int16_t *outptr = out; + int16_t temp_in[4], temp_out[4]; + + // inverse transform row vectors + for (i = 0; i < 4; ++i) { + IHT_4[tx_type].rows(input, outptr); + input += 4; + outptr += 4; + } + + // inverse transform column vectors + for (i = 0; i < 4; ++i) { + for (j = 0; j < 4; ++j) + temp_in[j] = out[j * 4 + i]; + IHT_4[tx_type].cols(temp_in, temp_out); + for (j = 0; j < 4; ++j) + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * dest_stride + i]); + } +} +static void iadst8_1d(int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + + int x0 = input[7]; + int x1 = input[0]; + int x2 = input[5]; + int x3 = input[2]; + int x4 = input[3]; + int x5 = input[4]; + int x6 = input[1]; + int x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + output[0] = output[1] = output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = 0; 
+ return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = dct_const_round_shift(s0 + s4); + x1 = dct_const_round_shift(s1 + s5); + x2 = dct_const_round_shift(s2 + s6); + x3 = dct_const_round_shift(s3 + s7); + x4 = dct_const_round_shift(s0 - s4); + x5 = dct_const_round_shift(s1 - s5); + x6 = dct_const_round_shift(s2 - s6); + x7 = dct_const_round_shift(s3 - s7); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + + output[0] = x0; + output[1] = -x4; + output[2] = x6; + output[3] = -x2; + output[4] = x3; + output[5] = -x7; + output[6] = x5; + output[7] = -x1; +} + +static const transform_2d IHT_8[] = { + { idct8_1d, idct8_1d }, // DCT_DCT = 0 + { iadst8_1d, idct8_1d }, // ADST_DCT = 1 + { idct8_1d, iadst8_1d }, // DCT_ADST = 2 + { iadst8_1d, iadst8_1d } // ADST_ADST = 3 +}; + +void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride, + int tx_type) { + int i, j; + int16_t out[8 * 8]; + int16_t *outptr = out; + int16_t temp_in[8], temp_out[8]; + const 
transform_2d ht = IHT_8[tx_type]; + + // inverse transform row vectors + for (i = 0; i < 8; ++i) { + ht.rows(input, outptr); + input += 8; + outptr += 8; + } + + // inverse transform column vectors + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + ht.cols(temp_in, temp_out); + for (j = 0; j < 8; ++j) + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); } +} + +void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t out[8 * 8]; + int16_t *outptr = out; + int i, j; + int16_t temp_in[8], temp_out[8]; + + vpx_memset(out, 0, sizeof(out)); + // First transform rows + // only first 4 row has non-zero coefs + for (i = 0; i < 4; ++i) { + idct8_1d(input, outptr); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) + temp_in[j] = out[j * 8 + i]; + idct8_1d(temp_in, temp_out); + for (j = 0; j < 8; ++j) + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * dest_stride + i]); + } +} + +void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) { + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + output[0] = ROUND_POWER_OF_TWO(out, 5); +} + +static void idct16_1d(int16_t *input, int16_t *output) { + int16_t step1[16], step2[16]; + int temp1, temp2; + + // stage 1 + step1[0] = input[0/2]; + step1[1] = input[16/2]; + step1[2] = input[8/2]; + step1[3] = input[24/2]; + step1[4] = input[4/2]; + step1[5] = input[20/2]; + step1[6] = input[12/2]; + step1[7] = input[28/2]; + step1[8] = input[2/2]; + step1[9] = input[18/2]; + step1[10] = input[10/2]; + step1[11] = input[26/2]; + step1[12] = input[6/2]; + step1[13] = input[22/2]; + step1[14] = input[14/2]; + step1[15] = input[30/2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = 
step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = dct_const_round_shift(temp1); + step2[15] = dct_const_round_shift(temp2); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = dct_const_round_shift(temp1); + step2[14] = dct_const_round_shift(temp2); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = dct_const_round_shift(temp1); + step2[12] = dct_const_round_shift(temp2); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = dct_const_round_shift(temp1); + step1[7] = dct_const_round_shift(temp2); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + + step1[8] = step2[8] + step2[9]; + step1[9] = step2[8] - step2[9]; + step1[10] = -step2[10] + step2[11]; + step1[11] = step2[10] + step2[11]; + step1[12] = step2[12] + step2[13]; + step1[13] = step2[12] - step2[13]; + step1[14] = -step2[14] + step2[15]; + step1[15] = step2[14] + step2[15]; + + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = dct_const_round_shift(temp1); + step2[1] = dct_const_round_shift(temp2); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + 
step1[3] * cospi_24_64; + step2[2] = dct_const_round_shift(temp1); + step2[3] = dct_const_round_shift(temp2); + step2[4] = step1[4] + step1[5]; + step2[5] = step1[4] - step1[5]; + step2[6] = -step1[6] + step1[7]; + step2[7] = step1[6] + step1[7]; + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = dct_const_round_shift(temp1); + step2[14] = dct_const_round_shift(temp2); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[0] = step2[0] + step2[3]; + step1[1] = step2[1] + step2[2]; + step1[2] = step2[1] - step2[2]; + step1[3] = step2[0] - step2[3]; + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + step1[7] = step2[7]; + + step1[8] = step2[8] + step2[11]; + step1[9] = step2[9] + step2[10]; + step1[10] = step2[9] - step2[10]; + step1[11] = step2[8] - step2[11]; + step1[12] = -step2[12] + step2[15]; + step1[13] = -step2[13] + step2[14]; + step1[14] = step2[13] + step2[14]; + step1[15] = step2[12] + step2[15]; + + // stage 6 + step2[0] = step1[0] + step1[7]; + step2[1] = step1[1] + step1[6]; + step2[2] = step1[2] + step1[5]; + step2[3] = step1[3] + step1[4]; + step2[4] = step1[3] - step1[4]; + step2[5] = step1[2] - step1[5]; + step2[6] = step1[1] - step1[6]; + step2[7] = step1[0] - step1[7]; + step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + temp1 = (-step1[11] + step1[12]) * 
cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = dct_const_round_shift(temp1); + step2[12] = dct_const_round_shift(temp2); + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + output[0] = step2[0] + step2[15]; + output[1] = step2[1] + step2[14]; + output[2] = step2[2] + step2[13]; + output[3] = step2[3] + step2[12]; + output[4] = step2[4] + step2[11]; + output[5] = step2[5] + step2[10]; + output[6] = step2[6] + step2[9]; + output[7] = step2[7] + step2[8]; + output[8] = step2[7] - step2[8]; + output[9] = step2[6] - step2[9]; + output[10] = step2[5] - step2[10]; + output[11] = step2[4] - step2[11]; + output[12] = step2[3] - step2[12]; + output[13] = step2[2] - step2[13]; + output[14] = step2[1] - step2[14]; + output[15] = step2[0] - step2[15]; +} + +void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) { + int16_t out[16 * 16]; + int16_t *outptr = out; + int i, j; + int16_t temp_in[16], temp_out[16]; + + // First transform rows + for (i = 0; i < 16; ++i) { + idct16_1d(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + idct16_1d(temp_in, temp_out); + for (j = 0; j < 16; ++j) + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * dest_stride + i]); + } +} + +void iadst16_1d(int16_t *input, int16_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; + + int x0 = input[15]; + int x1 = input[0]; + int x2 = input[13]; + int x3 = input[2]; + int x4 = input[11]; + int x5 = input[4]; + int x6 = input[9]; + int x7 = input[6]; + int x8 = input[7]; + int x9 = input[8]; + int x10 = input[5]; + int x11 = input[10]; + int x12 = input[3]; + int x13 = input[12]; + int x14 = input[1]; + int x15 = input[14]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 + | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { + output[0] = output[1] = 
output[2] = output[3] = output[4] + = output[5] = output[6] = output[7] = output[8] + = output[9] = output[10] = output[11] = output[12] + = output[13] = output[14] = output[15] = 0; + return; + } + + // stage 1 + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + + x0 = dct_const_round_shift(s0 + s8); + x1 = dct_const_round_shift(s1 + s9); + x2 = dct_const_round_shift(s2 + s10); + x3 = dct_const_round_shift(s3 + s11); + x4 = dct_const_round_shift(s4 + s12); + x5 = dct_const_round_shift(s5 + s13); + x6 = dct_const_round_shift(s6 + s14); + x7 = dct_const_round_shift(s7 + s15); + x8 = dct_const_round_shift(s0 - s8); + x9 = dct_const_round_shift(s1 - s9); + x10 = dct_const_round_shift(s2 - s10); + x11 = dct_const_round_shift(s3 - s11); + x12 = dct_const_round_shift(s4 - s12); + x13 = dct_const_round_shift(s5 - s13); + x14 = dct_const_round_shift(s6 - s14); + x15 = dct_const_round_shift(s7 - s15); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; + s15 
= x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = s0 + s4; + x1 = s1 + s5; + x2 = s2 + s6; + x3 = s3 + s7; + x4 = s0 - s4; + x5 = s1 - s5; + x6 = s2 - s6; + x7 = s3 - s7; + x8 = dct_const_round_shift(s8 + s12); + x9 = dct_const_round_shift(s9 + s13); + x10 = dct_const_round_shift(s10 + s14); + x11 = dct_const_round_shift(s11 + s15); + x12 = dct_const_round_shift(s8 - s12); + x13 = dct_const_round_shift(s9 - s13); + x14 = dct_const_round_shift(s10 - s14); + x15 = dct_const_round_shift(s11 - s15); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + + x0 = s0 + s2; + x1 = s1 + s3; + x2 = s0 - s2; + x3 = s1 - s3; + x4 = dct_const_round_shift(s4 + s6); + x5 = dct_const_round_shift(s5 + s7); + x6 = dct_const_round_shift(s4 - s6); + x7 = dct_const_round_shift(s5 - s7); + x8 = s8 + s10; + x9 = s9 + s11; + x10 = s8 - s10; + x11 = s9 - s11; + x12 = dct_const_round_shift(s12 + s14); + x13 = dct_const_round_shift(s13 + s15); + x14 = dct_const_round_shift(s12 - s14); + x15 = dct_const_round_shift(s13 - s15); + + // stage 4 + s2 = (- cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (- x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (- x10 + x11); + s14 = (- cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); + + x2 = dct_const_round_shift(s2); + x3 = dct_const_round_shift(s3); + x6 = dct_const_round_shift(s6); + x7 = dct_const_round_shift(s7); + x10 = dct_const_round_shift(s10); + x11 = dct_const_round_shift(s11); + x14 = dct_const_round_shift(s14); + x15 = 
dct_const_round_shift(s15); + + output[0] = x0; + output[1] = -x8; + output[2] = x12; + output[3] = -x4; + output[4] = x6; + output[5] = x14; + output[6] = x10; + output[7] = x2; + output[8] = x3; + output[9] = x11; + output[10] = x15; + output[11] = x7; + output[12] = x5; + output[13] = -x13; + output[14] = x9; + output[15] = -x1; +} + +static const transform_2d IHT_16[] = { + { idct16_1d, idct16_1d }, // DCT_DCT = 0 + { iadst16_1d, idct16_1d }, // ADST_DCT = 1 + { idct16_1d, iadst16_1d }, // DCT_ADST = 2 + { iadst16_1d, iadst16_1d } // ADST_ADST = 3 +}; + +void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride, + int tx_type) { + int i, j; + int16_t out[16 * 16]; + int16_t *outptr = out; + int16_t temp_in[16], temp_out[16]; + const transform_2d ht = IHT_16[tx_type]; + + // Rows + for (i = 0; i < 16; ++i) { + ht.rows(input, outptr); + input += 16; + outptr += 16; + } + + // Columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j * 16 + i]; + ht.cols(temp_in, temp_out); + for (j = 0; j < 16; ++j) + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * dest_stride + i]); } +} + +void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t out[16 * 16]; + int16_t *outptr = out; + int i, j; + int16_t temp_in[16], temp_out[16]; + + /* First transform rows. Since all non-zero dct coefficients are in + * upper-left 4x4 area, we only need to calculate first 4 rows here. 
+ */ + vpx_memset(out, 0, sizeof(out)); + for (i = 0; i < 4; ++i) { + idct16_1d(input, outptr); + input += 16; + outptr += 16; + } + + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) + temp_in[j] = out[j*16 + i]; + idct16_1d(temp_in, temp_out); + for (j = 0; j < 16; ++j) + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * dest_stride + i]); + } +} + +void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) { + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + output[0] = ROUND_POWER_OF_TWO(out, 6); +} + +static void idct32_1d(int16_t *input, int16_t *output) { + int16_t step1[32], step2[32]; + int temp1, temp2; + + // stage 1 + step1[0] = input[0]; + step1[1] = input[16]; + step1[2] = input[8]; + step1[3] = input[24]; + step1[4] = input[4]; + step1[5] = input[20]; + step1[6] = input[12]; + step1[7] = input[28]; + step1[8] = input[2]; + step1[9] = input[18]; + step1[10] = input[10]; + step1[11] = input[26]; + step1[12] = input[6]; + step1[13] = input[22]; + step1[14] = input[14]; + step1[15] = input[30]; + + temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; + temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[16] = dct_const_round_shift(temp1); + step1[31] = dct_const_round_shift(temp2); + + temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; + temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + step1[17] = dct_const_round_shift(temp1); + step1[30] = dct_const_round_shift(temp2); + + temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; + temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + step1[18] = dct_const_round_shift(temp1); + step1[29] = dct_const_round_shift(temp2); + + temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; + temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + step1[19] = dct_const_round_shift(temp1); + step1[28] = dct_const_round_shift(temp2); + + 
temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; + temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + step1[20] = dct_const_round_shift(temp1); + step1[27] = dct_const_round_shift(temp2); + + temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; + temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + step1[21] = dct_const_round_shift(temp1); + step1[26] = dct_const_round_shift(temp2); + + temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; + temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + step1[22] = dct_const_round_shift(temp1); + step1[25] = dct_const_round_shift(temp2); + + temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; + temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + step1[23] = dct_const_round_shift(temp1); + step1[24] = dct_const_round_shift(temp2); + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + + temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; + temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + step2[8] = dct_const_round_shift(temp1); + step2[15] = dct_const_round_shift(temp2); + + temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; + temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + step2[9] = dct_const_round_shift(temp1); + step2[14] = dct_const_round_shift(temp2); + + temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; + temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + step2[10] = dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + + temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; + temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + step2[11] = dct_const_round_shift(temp1); + step2[12] = dct_const_round_shift(temp2); + + step2[16] = step1[16] + step1[17]; + step2[17] = step1[16] - step1[17]; + step2[18] = -step1[18] + step1[19]; + step2[19] 
= step1[18] + step1[19]; + step2[20] = step1[20] + step1[21]; + step2[21] = step1[20] - step1[21]; + step2[22] = -step1[22] + step1[23]; + step2[23] = step1[22] + step1[23]; + step2[24] = step1[24] + step1[25]; + step2[25] = step1[24] - step1[25]; + step2[26] = -step1[26] + step1[27]; + step2[27] = step1[26] + step1[27]; + step2[28] = step1[28] + step1[29]; + step2[29] = step1[28] - step1[29]; + step2[30] = -step1[30] + step1[31]; + step2[31] = step1[30] + step1[31]; + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + + temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; + temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + step1[4] = dct_const_round_shift(temp1); + step1[7] = dct_const_round_shift(temp2); + temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; + temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + + step1[8] = step2[8] + step2[9]; + step1[9] = step2[8] - step2[9]; + step1[10] = -step2[10] + step2[11]; + step1[11] = step2[10] + step2[11]; + step1[12] = step2[12] + step2[13]; + step1[13] = step2[12] - step2[13]; + step1[14] = -step2[14] + step2[15]; + step1[15] = step2[14] + step2[15]; + + step1[16] = step2[16]; + step1[31] = step2[31]; + temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; + temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + step1[17] = dct_const_round_shift(temp1); + step1[30] = dct_const_round_shift(temp2); + temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; + temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + step1[18] = dct_const_round_shift(temp1); + step1[29] = dct_const_round_shift(temp2); + step1[19] = step2[19]; + step1[20] = step2[20]; + temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; + temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + step1[21] = dct_const_round_shift(temp1); + step1[26] = 
dct_const_round_shift(temp2); + temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; + temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + step1[22] = dct_const_round_shift(temp1); + step1[25] = dct_const_round_shift(temp2); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + // stage 4 + temp1 = (step1[0] + step1[1]) * cospi_16_64; + temp2 = (step1[0] - step1[1]) * cospi_16_64; + step2[0] = dct_const_round_shift(temp1); + step2[1] = dct_const_round_shift(temp2); + temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; + temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + step2[2] = dct_const_round_shift(temp1); + step2[3] = dct_const_round_shift(temp2); + step2[4] = step1[4] + step1[5]; + step2[5] = step1[4] - step1[5]; + step2[6] = -step1[6] + step1[7]; + step2[7] = step1[6] + step1[7]; + + step2[8] = step1[8]; + step2[15] = step1[15]; + temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; + temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + step2[9] = dct_const_round_shift(temp1); + step2[14] = dct_const_round_shift(temp2); + temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; + temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + step2[10] = dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + step2[11] = step1[11]; + step2[12] = step1[12]; + + step2[16] = step1[16] + step1[19]; + step2[17] = step1[17] + step1[18]; + step2[18] = step1[17] - step1[18]; + step2[19] = step1[16] - step1[19]; + step2[20] = -step1[20] + step1[23]; + step2[21] = -step1[21] + step1[22]; + step2[22] = step1[21] + step1[22]; + step2[23] = step1[20] + step1[23]; + + step2[24] = step1[24] + step1[27]; + step2[25] = step1[25] + step1[26]; + step2[26] = step1[25] - step1[26]; + step2[27] = step1[24] - step1[27]; + step2[28] = -step1[28] + step1[31]; + step2[29] = -step1[29] + step1[30]; + step2[30] = step1[29] + step1[30]; + step2[31] = step1[28] + step1[31]; + + // 
stage 5 + step1[0] = step2[0] + step2[3]; + step1[1] = step2[1] + step2[2]; + step1[2] = step2[1] - step2[2]; + step1[3] = step2[0] - step2[3]; + step1[4] = step2[4]; + temp1 = (step2[6] - step2[5]) * cospi_16_64; + temp2 = (step2[5] + step2[6]) * cospi_16_64; + step1[5] = dct_const_round_shift(temp1); + step1[6] = dct_const_round_shift(temp2); + step1[7] = step2[7]; + + step1[8] = step2[8] + step2[11]; + step1[9] = step2[9] + step2[10]; + step1[10] = step2[9] - step2[10]; + step1[11] = step2[8] - step2[11]; + step1[12] = -step2[12] + step2[15]; + step1[13] = -step2[13] + step2[14]; + step1[14] = step2[13] + step2[14]; + step1[15] = step2[12] + step2[15]; + + step1[16] = step2[16]; + step1[17] = step2[17]; + temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; + temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + step1[18] = dct_const_round_shift(temp1); + step1[29] = dct_const_round_shift(temp2); + temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; + temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + step1[19] = dct_const_round_shift(temp1); + step1[28] = dct_const_round_shift(temp2); + temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; + temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + step1[20] = dct_const_round_shift(temp1); + step1[27] = dct_const_round_shift(temp2); + temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; + temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + step1[21] = dct_const_round_shift(temp1); + step1[26] = dct_const_round_shift(temp2); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[0] = step1[0] + step1[7]; + step2[1] = step1[1] + step1[6]; + step2[2] = step1[2] + step1[5]; + step2[3] = step1[3] + step1[4]; + step2[4] = step1[3] - step1[4]; + step2[5] = step1[2] - step1[5]; + step2[6] = step1[1] - step1[6]; + step2[7] = step1[0] - step1[7]; + 
step2[8] = step1[8]; + step2[9] = step1[9]; + temp1 = (-step1[10] + step1[13]) * cospi_16_64; + temp2 = (step1[10] + step1[13]) * cospi_16_64; + step2[10] = dct_const_round_shift(temp1); + step2[13] = dct_const_round_shift(temp2); + temp1 = (-step1[11] + step1[12]) * cospi_16_64; + temp2 = (step1[11] + step1[12]) * cospi_16_64; + step2[11] = dct_const_round_shift(temp1); + step2[12] = dct_const_round_shift(temp2); + step2[14] = step1[14]; + step2[15] = step1[15]; + + step2[16] = step1[16] + step1[23]; + step2[17] = step1[17] + step1[22]; + step2[18] = step1[18] + step1[21]; + step2[19] = step1[19] + step1[20]; + step2[20] = step1[19] - step1[20]; + step2[21] = step1[18] - step1[21]; + step2[22] = step1[17] - step1[22]; + step2[23] = step1[16] - step1[23]; + + step2[24] = -step1[24] + step1[31]; + step2[25] = -step1[25] + step1[30]; + step2[26] = -step1[26] + step1[29]; + step2[27] = -step1[27] + step1[28]; + step2[28] = step1[27] + step1[28]; + step2[29] = step1[26] + step1[29]; + step2[30] = step1[25] + step1[30]; + step2[31] = step1[24] + step1[31]; + + // stage 7 + step1[0] = step2[0] + step2[15]; + step1[1] = step2[1] + step2[14]; + step1[2] = step2[2] + step2[13]; + step1[3] = step2[3] + step2[12]; + step1[4] = step2[4] + step2[11]; + step1[5] = step2[5] + step2[10]; + step1[6] = step2[6] + step2[9]; + step1[7] = step2[7] + step2[8]; + step1[8] = step2[7] - step2[8]; + step1[9] = step2[6] - step2[9]; + step1[10] = step2[5] - step2[10]; + step1[11] = step2[4] - step2[11]; + step1[12] = step2[3] - step2[12]; + step1[13] = step2[2] - step2[13]; + step1[14] = step2[1] - step2[14]; + step1[15] = step2[0] - step2[15]; + + step1[16] = step2[16]; + step1[17] = step2[17]; + step1[18] = step2[18]; + step1[19] = step2[19]; + temp1 = (-step2[20] + step2[27]) * cospi_16_64; + temp2 = (step2[20] + step2[27]) * cospi_16_64; + step1[20] = dct_const_round_shift(temp1); + step1[27] = dct_const_round_shift(temp2); + temp1 = (-step2[21] + step2[26]) * cospi_16_64; + temp2 = 
(step2[21] + step2[26]) * cospi_16_64; + step1[21] = dct_const_round_shift(temp1); + step1[26] = dct_const_round_shift(temp2); + temp1 = (-step2[22] + step2[25]) * cospi_16_64; + temp2 = (step2[22] + step2[25]) * cospi_16_64; + step1[22] = dct_const_round_shift(temp1); + step1[25] = dct_const_round_shift(temp2); + temp1 = (-step2[23] + step2[24]) * cospi_16_64; + temp2 = (step2[23] + step2[24]) * cospi_16_64; + step1[23] = dct_const_round_shift(temp1); + step1[24] = dct_const_round_shift(temp2); + step1[28] = step2[28]; + step1[29] = step2[29]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // final stage + output[0] = step1[0] + step1[31]; + output[1] = step1[1] + step1[30]; + output[2] = step1[2] + step1[29]; + output[3] = step1[3] + step1[28]; + output[4] = step1[4] + step1[27]; + output[5] = step1[5] + step1[26]; + output[6] = step1[6] + step1[25]; + output[7] = step1[7] + step1[24]; + output[8] = step1[8] + step1[23]; + output[9] = step1[9] + step1[22]; + output[10] = step1[10] + step1[21]; + output[11] = step1[11] + step1[20]; + output[12] = step1[12] + step1[19]; + output[13] = step1[13] + step1[18]; + output[14] = step1[14] + step1[17]; + output[15] = step1[15] + step1[16]; + output[16] = step1[15] - step1[16]; + output[17] = step1[14] - step1[17]; + output[18] = step1[13] - step1[18]; + output[19] = step1[12] - step1[19]; + output[20] = step1[11] - step1[20]; + output[21] = step1[10] - step1[21]; + output[22] = step1[9] - step1[22]; + output[23] = step1[8] - step1[23]; + output[24] = step1[7] - step1[24]; + output[25] = step1[6] - step1[25]; + output[26] = step1[5] - step1[26]; + output[27] = step1[4] - step1[27]; + output[28] = step1[3] - step1[28]; + output[29] = step1[2] - step1[29]; + output[30] = step1[1] - step1[30]; + output[31] = step1[0] - step1[31]; +} + +void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) { + int16_t out[32 * 32]; + int16_t *outptr = out; + int i, j; + int16_t temp_in[32], temp_out[32]; + + // 
Rows + for (i = 0; i < 32; ++i) { + idct32_1d(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_1d(temp_in, temp_out); + for (j = 0; j < 32; ++j) + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * dest_stride + i]); + } +} + +void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) { + int16_t out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + output[0] = ROUND_POWER_OF_TWO(out, 6); +} + +void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest, + int dest_stride) { + int16_t out[32 * 32]; + int16_t *outptr = out; + int i, j; + int16_t temp_in[32], temp_out[32]; + + /* First transform rows. Since all non-zero dct coefficients are in + * upper-left 4x4 area, we only need to calculate first 4 rows here. + */ + vpx_memset(out, 0, sizeof(out)); + for (i = 0; i < 4; ++i) { + idct32_1d(input, outptr); + input += 32; + outptr += 32; + } + + // Columns + for (i = 0; i < 32; ++i) { + for (j = 0; j < 32; ++j) + temp_in[j] = out[j * 32 + i]; + idct32_1d(temp_in, temp_out); + for (j = 0; j < 32; ++j) + dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + + dest[j * dest_stride + i]); + } +} diff --git a/libvpx/vp9/common/vp9_idct.h b/libvpx/vp9/common/vp9_idct.h new file mode 100644 index 000000000..64f14c993 --- /dev/null +++ b/libvpx/vp9/common/vp9_idct.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_IDCT_H_ +#define VP9_COMMON_VP9_IDCT_H_ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" + + +// Constants and Macros used by all idct/dct functions +#define DCT_CONST_BITS 14 +#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)) + +#define pair_set_epi16(a, b) \ + _mm_set1_epi32(((uint16_t)(a)) + (((uint16_t)(b)) << 16)) + +// Constants are round(16384 * cos(k*Pi/64)) where k = 1 to 31. +// Note: sin(k*Pi/64) = cos((32-k)*Pi/64) +static const int cospi_1_64 = 16364; +static const int cospi_2_64 = 16305; +static const int cospi_3_64 = 16207; +static const int cospi_4_64 = 16069; +static const int cospi_5_64 = 15893; +static const int cospi_6_64 = 15679; +static const int cospi_7_64 = 15426; +static const int cospi_8_64 = 15137; +static const int cospi_9_64 = 14811; +static const int cospi_10_64 = 14449; +static const int cospi_11_64 = 14053; +static const int cospi_12_64 = 13623; +static const int cospi_13_64 = 13160; +static const int cospi_14_64 = 12665; +static const int cospi_15_64 = 12140; +static const int cospi_16_64 = 11585; +static const int cospi_17_64 = 11003; +static const int cospi_18_64 = 10394; +static const int cospi_19_64 = 9760; +static const int cospi_20_64 = 9102; +static const int cospi_21_64 = 8423; +static const int cospi_22_64 = 7723; +static const int cospi_23_64 = 7005; +static const int cospi_24_64 = 6270; +static const int cospi_25_64 = 5520; +static const int cospi_26_64 = 4756; +static const int cospi_27_64 = 3981; +static const int cospi_28_64 = 3196; +static const int cospi_29_64 = 2404; +static const int cospi_30_64 = 1606; +static const int cospi_31_64 = 804; + +// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 +static const int sinpi_1_9 = 5283; +static const int sinpi_2_9 = 9929; +static const int sinpi_3_9 = 13377; +static const int sinpi_4_9 = 15212; + +static INLINE int dct_const_round_shift(int input) { + int rv = 
ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); + assert(INT16_MIN <= rv && rv <= INT16_MAX); + return rv; +} + +typedef void (*transform_1d)(int16_t*, int16_t*); + +typedef struct { + transform_1d cols, rows; // vertical and horizontal +} transform_2d; + +#endif // VP9_COMMON_VP9_IDCT_H_ diff --git a/libvpx/vp9/common/vp9_implicit_segmentation.c b/libvpx/vp9/common/vp9_implicit_segmentation.c new file mode 100644 index 000000000..2a1d35f9a --- /dev/null +++ b/libvpx/vp9/common/vp9_implicit_segmentation.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_onyxc_int.h" + +#define MAX_REGIONS 24000 +#ifndef NULL +#define NULL 0 +#endif + +#define min_mbs_in_region 3 + +// this linked list structure holds equivalences for connected +// component labeling +struct list_el { + int label; + int seg_value; + int count; + struct list_el *next; +}; +typedef struct list_el item; + +// connected colorsegments +typedef struct { + int min_x; + int min_y; + int max_x; + int max_y; + int64_t sum_x; + int64_t sum_y; + int pixels; + int seg_value; + int label; +} segment_info; + + +typedef enum { + SEGMENT_MODE, + SEGMENT_MV, + SEGMENT_REFFRAME, + SEGMENT_SKIPPED +} SEGMENT_TYPE; + + +// this merges the two equivalence lists and +// then makes sure that every label points to the same +// equivalence list +void merge(item *labels, int u, int v) { + item *a = labels[u].next; + item *b = labels[v].next; + item c; + item *it = &c; + int count; + + // check if they are already merged + if (u == v || a == b) + return; + + count = a->count + b->count; + + // merge 2 sorted linked lists. 
+ while (a != NULL && b != NULL) { + if (a->label < b->label) { + it->next = a; + a = a->next; + } else { + it->next = b; + b = b->next; + } + + it = it->next; + } + + if (a == NULL) + it->next = b; + else + it->next = a; + + it = c.next; + + // make sure every equivalence in the linked list points to this new ll + while (it != NULL) { + labels[it->label].next = c.next; + it = it->next; + } + c.next->count = count; + +} + +void segment_via_mode_info(VP9_COMMON *oci, int how) { + MODE_INFO *mi = oci->mi; + int i, j; + int mb_index = 0; + + int label = 1; + int pitch = oci->mb_cols; + + // holds linked list equivalences + // the max should probably be allocated at a higher level in oci + item equivalences[MAX_REGIONS]; + int eq_ptr = 0; + item labels[MAX_REGIONS]; + segment_info segments[MAX_REGIONS]; + int label_count = 1; + int labeling[400 * 300]; + int *lp = labeling; + + label_count = 1; + memset(labels, 0, sizeof(labels)); + memset(segments, 0, sizeof(segments)); + + /* Go through each macroblock first pass labelling */ + for (i = 0; i < oci->mb_rows; i++, lp += pitch) { + for (j = 0; j < oci->mb_cols; j++) { + // int above seg_value, left seg_value, this seg_value... + int a = -1, l = -1, n = -1; + + // above label, left label + int al = -1, ll = -1; + if (i) { + al = lp[j - pitch]; + a = labels[al].next->seg_value; + } + if (j) { + ll = lp[j - 1]; + l = labels[ll].next->seg_value; + } + + // what setting are we going to do the implicit segmentation on + switch (how) { + case SEGMENT_MODE: + n = mi[mb_index].mbmi.mode; + break; + case SEGMENT_MV: + n = mi[mb_index].mbmi.mv[0].as_int; + if (mi[mb_index].mbmi.ref_frame[0] == INTRA_FRAME) + n = -9999999; + break; + case SEGMENT_REFFRAME: + n = mi[mb_index].mbmi.ref_frame[0]; + break; + case SEGMENT_SKIPPED: + n = mi[mb_index].mbmi.mb_skip_coeff; + break; + } + + // above and left both have the same seg_value + if (n == a && n == l) { + // pick the lowest label + lp[j] = (al < ll ? 
al : ll); + labels[lp[j]].next->count++; + + // merge the above and left equivalencies + merge(labels, al, ll); + } + // this matches above seg_value + else if (n == a) { + // give it the same label as above + lp[j] = al; + labels[al].next->count++; + } + // this matches left seg_value + else if (n == l) { + // give it the same label as left + lp[j] = ll; + labels[ll].next->count++; + } else { + // new label doesn't match either + item *e = &labels[label]; + item *nl = &equivalences[eq_ptr++]; + lp[j] = label; + nl->label = label; + nl->next = 0; + nl->seg_value = n; + nl->count = 1; + e->next = nl; + label++; + } + mb_index++; + } + mb_index++; + } + lp = labeling; + + // give new labels to regions + for (i = 1; i < label; i++) + if (labels[i].next->count > min_mbs_in_region && + labels[labels[i].next->label].label == 0) { + segment_info *cs = &segments[label_count]; + cs->label = label_count; + labels[labels[i].next->label].label = label_count++; + labels[labels[i].next->label].seg_value = labels[i].next->seg_value; + cs->seg_value = labels[labels[i].next->label].seg_value; + cs->min_x = oci->mb_cols; + cs->min_y = oci->mb_rows; + cs->max_x = 0; + cs->max_y = 0; + cs->sum_x = 0; + cs->sum_y = 0; + cs->pixels = 0; + } + + lp = labeling; + + // this is just to gather stats... 
+ for (i = 0; i < oci->mb_rows; i++, lp += pitch) { + for (j = 0; j < oci->mb_cols; j++) { + const int old_lab = labels[lp[j]].next->label; + const int lab = labels[old_lab].label; + segment_info *cs = &segments[lab]; + + cs->min_x = MIN(cs->min_x, j); + cs->max_x = MAX(cs->max_x, j); + cs->min_y = MIN(cs->min_y, i); + cs->max_y = MAX(cs->max_y, i); + cs->sum_x += j; + cs->sum_y += i; + cs->pixels++; + + lp[j] = lab; + mb_index++; + } + mb_index++; + } + + { + lp = labeling; + printf("labelling \n"); + mb_index = 0; + for (i = 0; i < oci->mb_rows; i++, lp += pitch) { + for (j = 0; j < oci->mb_cols; j++) { + printf("%4d", lp[j]); + } + printf(" "); + for (j = 0; j < oci->mb_cols; j++, mb_index++) { + // printf("%3d",mi[mb_index].mbmi.mode ); + printf("%4d:%4d", mi[mb_index].mbmi.mv[0].as_mv.row, + mi[mb_index].mbmi.mv[0].as_mv.col); + } + printf("\n"); + ++mb_index; + } + printf("\n"); + } +} + diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c new file mode 100644 index 000000000..7b3f0be24 --- /dev/null +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -0,0 +1,407 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include "vpx_config.h"
+#include "vp9/common/vp9_loopfilter.h"
+#include "vp9/common/vp9_onyxc_int.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_seg_common.h"
+
+static void lf_init_lut(loop_filter_info_n *lfi) {
+  lfi->mode_lf_lut[DC_PRED] = 0;
+  lfi->mode_lf_lut[D45_PRED] = 0;
+  lfi->mode_lf_lut[D135_PRED] = 0;
+  lfi->mode_lf_lut[D117_PRED] = 0;
+  lfi->mode_lf_lut[D153_PRED] = 0;
+  lfi->mode_lf_lut[D27_PRED] = 0;
+  lfi->mode_lf_lut[D63_PRED] = 0;
+  lfi->mode_lf_lut[V_PRED] = 0;
+  lfi->mode_lf_lut[H_PRED] = 0;
+  lfi->mode_lf_lut[TM_PRED] = 0;
+  lfi->mode_lf_lut[ZEROMV] = 0;
+  lfi->mode_lf_lut[NEARESTMV] = 1;
+  lfi->mode_lf_lut[NEARMV] = 1;
+  lfi->mode_lf_lut[NEWMV] = 1;
+}
+
+void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi,
+                                      int sharpness_lvl) {
+  int i;
+
+  /* For each possible value for the loop filter fill out limits */
+  for (i = 0; i <= MAX_LOOP_FILTER; i++) {
+    int filt_lvl = i;
+    int block_inside_limit = 0;
+
+    /* Set loop filter parameters that control sharpness. 
*/
+    block_inside_limit = filt_lvl >> (sharpness_lvl > 0);
+    block_inside_limit = block_inside_limit >> (sharpness_lvl > 4);
+
+    if (sharpness_lvl > 0) {
+      if (block_inside_limit > (9 - sharpness_lvl))
+        block_inside_limit = (9 - sharpness_lvl);
+    }
+
+    if (block_inside_limit < 1)
+      block_inside_limit = 1;
+
+    vpx_memset(lfi->lim[i], block_inside_limit, SIMD_WIDTH);
+    vpx_memset(lfi->blim[i], (2 * filt_lvl + block_inside_limit),
+               SIMD_WIDTH);
+    vpx_memset(lfi->mblim[i], (2 * (filt_lvl + 2) + block_inside_limit),
+               SIMD_WIDTH);
+  }
+}
+
+void vp9_loop_filter_init(VP9_COMMON *cm) {
+  loop_filter_info_n *lfi = &cm->lf_info;
+  int i;
+
+  // init limits for given sharpness
+  vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+  cm->last_sharpness_level = cm->sharpness_level;
+
+  // init LUT for lvl and hev thr picking
+  lf_init_lut(lfi);
+
+  // init hev threshold const vectors
+  for (i = 0; i < 4; i++)
+    vpx_memset(lfi->hev_thr[i], i, SIMD_WIDTH);
+}
+
+void vp9_loop_filter_frame_init(VP9_COMMON *cm,
+                                MACROBLOCKD *xd,
+                                int default_filt_lvl) {
+  int seg,    // segment number
+      ref,    // index in ref_lf_deltas
+      mode;   // index in mode_lf_deltas
+  // n_shift is a multiplier for lf_deltas
+  // the multiplier is 1 for when filter_lvl is between 0 and 31;
+  // 2 when filter_lvl is between 32 and 63
+  int n_shift = default_filt_lvl >> 5;
+
+  loop_filter_info_n *lfi = &cm->lf_info;
+
+  /* update limits if sharpness has changed */
+  // printf("vp9_loop_filter_frame_init %d\n", default_filt_lvl);
+  // printf("sharpness level: %d [%d]\n",
+  //        cm->sharpness_level, cm->last_sharpness_level);
+  if (cm->last_sharpness_level != cm->sharpness_level) {
+    vp9_loop_filter_update_sharpness(lfi, cm->sharpness_level);
+    cm->last_sharpness_level = cm->sharpness_level;
+  }
+
+  for (seg = 0; seg < MAX_MB_SEGMENTS; seg++) {
+    int lvl_seg = default_filt_lvl;
+    int lvl_ref, lvl_mode;
+
+
+    // Set the baseline filter values for each segment
+    if (vp9_segfeature_active(xd, seg, 
SEG_LVL_ALT_LF)) { + /* Abs value */ + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { + lvl_seg = vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF); + } else { /* Delta Value */ + lvl_seg += vp9_get_segdata(xd, seg, SEG_LVL_ALT_LF); + lvl_seg = clamp(lvl_seg, 0, 63); + } + } + + if (!xd->mode_ref_lf_delta_enabled) { + /* we could get rid of this if we assume that deltas are set to + * zero when not in use; encoder always uses deltas + */ + vpx_memset(lfi->lvl[seg][0], lvl_seg, 4 * 4); + continue; + } + + lvl_ref = lvl_seg; + + /* INTRA_FRAME */ + ref = INTRA_FRAME; + + /* Apply delta for reference frame */ + lvl_ref += xd->ref_lf_deltas[ref] << n_shift; + + mode = 0; /* all the rest of Intra modes */ + lvl_mode = lvl_ref; + lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63); + + /* LAST, GOLDEN, ALT */ + for (ref = 1; ref < MAX_REF_FRAMES; ref++) { + int lvl_ref = lvl_seg; + + /* Apply delta for reference frame */ + lvl_ref += xd->ref_lf_deltas[ref] << n_shift; + + /* Apply delta for Inter modes */ + for (mode = 0; mode < MAX_MODE_LF_DELTAS; mode++) { + lvl_mode = lvl_ref + (xd->mode_lf_deltas[mode] << n_shift); + lfi->lvl[seg][ref][mode] = clamp(lvl_mode, 0, 63); + } + } + } +} + +static int build_lfi(const VP9_COMMON *cm, const MB_MODE_INFO *mbmi, + struct loop_filter_info *lfi) { + const loop_filter_info_n *lfi_n = &cm->lf_info; + int mode = mbmi->mode; + int mode_index = lfi_n->mode_lf_lut[mode]; + int seg = mbmi->segment_id; + int ref_frame = mbmi->ref_frame[0]; + int filter_level = lfi_n->lvl[seg][ref_frame][mode_index]; + + if (filter_level) { + const int hev_index = filter_level >> 4; + lfi->mblim = lfi_n->mblim[filter_level]; + lfi->blim = lfi_n->blim[filter_level]; + lfi->lim = lfi_n->lim[filter_level]; + lfi->hev_thr = lfi_n->hev_thr[hev_index]; + return 1; + } + return 0; +} + +static void filter_selectively_vert(uint8_t *s, int pitch, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + const struct 
loop_filter_info *lfi) { + unsigned int mask; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) { + if (mask & 1) { + if (mask_16x16 & 1) { + vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + assert(!(mask_8x8 & 1)); + assert(!(mask_4x4 & 1)); + assert(!(mask_4x4_int & 1)); + } else if (mask_8x8 & 1) { + vp9_mbloop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + assert(!(mask_16x16 & 1)); + assert(!(mask_4x4 & 1)); + } else if (mask_4x4 & 1) { + vp9_loop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + assert(!(mask_16x16 & 1)); + assert(!(mask_8x8 & 1)); + } else { + assert(0); + } + + if (mask_4x4_int & 1) + vp9_loop_filter_vertical_edge(s + 4, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + } + s += 8; + lfi++; + mask_16x16 >>= 1; + mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } +} + +static void filter_selectively_horiz(uint8_t *s, int pitch, + unsigned int mask_16x16, + unsigned int mask_8x8, + unsigned int mask_4x4, + unsigned int mask_4x4_int, + int only_4x4_1, + const struct loop_filter_info *lfi) { + unsigned int mask; + + for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= 1) { + if (mask & 1) { + if (!only_4x4_1) { + if (mask_16x16 & 1) { + vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr); + assert(!(mask_8x8 & 1)); + assert(!(mask_4x4 & 1)); + assert(!(mask_4x4_int & 1)); + } else if (mask_8x8 & 1) { + vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + assert(!(mask_16x16 & 1)); + assert(!(mask_4x4 & 1)); + } else if (mask_4x4 & 1) { + vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + assert(!(mask_16x16 & 1)); + assert(!(mask_8x8 & 1)); + } else { + assert(0); + } + } + + if (mask_4x4_int & 1) + vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + } + s += 8; + lfi++; + mask_16x16 >>= 1; 
+ mask_8x8 >>= 1; + mask_4x4 >>= 1; + mask_4x4_int >>= 1; + } +} + +static void filter_block_plane(VP9_COMMON *cm, MACROBLOCKD *xd, + int plane, int mi_row, int mi_col) { + const int ss_x = xd->plane[plane].subsampling_x; + const int ss_y = xd->plane[plane].subsampling_y; + const int row_step = 1 << xd->plane[plane].subsampling_y; + const int col_step = 1 << xd->plane[plane].subsampling_x; + struct buf_2d * const dst = &xd->plane[plane].dst; + uint8_t* const dst0 = dst->buf; + MODE_INFO* const mi0 = xd->mode_info_context; + unsigned int mask_16x16[64 / MI_SIZE] = {0}; + unsigned int mask_8x8[64 / MI_SIZE] = {0}; + unsigned int mask_4x4[64 / MI_SIZE] = {0}; + unsigned int mask_4x4_int[64 / MI_SIZE] = {0}; + struct loop_filter_info lfi[64 / MI_SIZE][64 / MI_SIZE]; + int r, c; + + for (r = 0; r < 64 / MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + unsigned int mask_16x16_c = 0; + unsigned int mask_8x8_c = 0; + unsigned int mask_4x4_c = 0; + unsigned int border_mask; + + // Determine the vertical edges that need filtering + for (c = 0; c < 64 / MI_SIZE && mi_col + c < cm->mi_cols; c += col_step) { + const MODE_INFO * const mi = xd->mode_info_context; + const int skip_this = mi[c].mbmi.mb_skip_coeff + && mi[c].mbmi.ref_frame[0] != INTRA_FRAME; + // left edge of current unit is block/partition edge -> no skip + const int block_edge_left = b_width_log2(mi[c].mbmi.sb_type) ? + !(c & ((1 << (b_width_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1; + const int skip_this_c = skip_this && !block_edge_left; + // top edge of current unit is block/partition edge -> no skip + const int block_edge_above = b_height_log2(mi[c].mbmi.sb_type) ? + !(r & ((1 << (b_height_log2(mi[c].mbmi.sb_type)-1)) - 1)) : 1; + const int skip_this_r = skip_this && !block_edge_above; + const TX_SIZE tx_size = plane ? 
get_uv_tx_size(&mi[c].mbmi) + : mi[c].mbmi.txfm_size; + const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1; + const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; + + // Filter level can vary per MI + if (!build_lfi(cm, &mi[c].mbmi, + lfi[r] + (c >> xd->plane[plane].subsampling_x))) + continue; + + // Build masks based on the transform size of each block + if (tx_size == TX_32X32) { + if (!skip_this_c && ((c >> ss_x) & 3) == 0) { + if (!skip_border_4x4_c) + mask_16x16_c |= 1 << (c >> ss_x); + else + mask_8x8_c |= 1 << (c >> ss_x); + } + if (!skip_this_r && ((r >> ss_y) & 3) == 0) { + if (!skip_border_4x4_r) + mask_16x16[r] |= 1 << (c >> ss_x); + else + mask_8x8[r] |= 1 << (c >> ss_x); + } + } else if (tx_size == TX_16X16) { + if (!skip_this_c && ((c >> ss_x) & 1) == 0) { + if (!skip_border_4x4_c) + mask_16x16_c |= 1 << (c >> ss_x); + else + mask_8x8_c |= 1 << (c >> ss_x); + } + if (!skip_this_r && ((r >> ss_y) & 1) == 0) { + if (!skip_border_4x4_r) + mask_16x16[r] |= 1 << (c >> ss_x); + else + mask_8x8[r] |= 1 << (c >> ss_x); + } + } else { + // force 8x8 filtering on 32x32 boundaries + if (!skip_this_c) { + if (tx_size == TX_8X8 || ((c >> ss_x) & 3) == 0) + mask_8x8_c |= 1 << (c >> ss_x); + else + mask_4x4_c |= 1 << (c >> ss_x); + } + + if (!skip_this_r) { + if (tx_size == TX_8X8 || ((r >> ss_y) & 3) == 0) + mask_8x8[r] |= 1 << (c >> ss_x); + else + mask_4x4[r] |= 1 << (c >> ss_x); + } + + if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c) + mask_4x4_int[r] |= 1 << (c >> ss_x); + } + } + + // Disable filtering on the leftmost column + border_mask = ~(mi_col == 0); + filter_selectively_vert(dst->buf, dst->stride, + mask_16x16_c & border_mask, + mask_8x8_c & border_mask, + mask_4x4_c & border_mask, + mask_4x4_int[r], lfi[r]); + dst->buf += 8 * dst->stride; + xd->mode_info_context += cm->mode_info_stride * row_step; + } + + // Now do horizontal pass + dst->buf = dst0; + xd->mode_info_context = mi0; + for (r = 0; r < 64 / 
MI_SIZE && mi_row + r < cm->mi_rows; r += row_step) { + const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; + const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r]; + + filter_selectively_horiz(dst->buf, dst->stride, + mask_16x16[r], + mask_8x8[r], + mask_4x4[r], + mask_4x4_int_r, mi_row + r == 0, lfi[r]); + dst->buf += 8 * dst->stride; + xd->mode_info_context += cm->mode_info_stride * row_step; + } +} + +void vp9_loop_filter_frame(VP9_COMMON *cm, + MACROBLOCKD *xd, + int frame_filter_level, + int y_only) { + int mi_row, mi_col; + + // Initialize the loop filter for this frame. + vp9_loop_filter_frame_init(cm, xd, frame_filter_level); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 64 / MI_SIZE) { + MODE_INFO* const mi = cm->mi + mi_row * cm->mode_info_stride; + + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 64 / MI_SIZE) { + int plane; + + setup_dst_planes(xd, cm->frame_to_show, mi_row, mi_col); + for (plane = 0; plane < (y_only ? 1 : MAX_MB_PLANE); plane++) { + xd->mode_info_context = mi + mi_col; + filter_block_plane(cm, xd, plane, mi_row, mi_col); + } + } + } +} diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h new file mode 100644 index 000000000..ce954c0c3 --- /dev/null +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_LOOPFILTER_H_ +#define VP9_COMMON_VP9_LOOPFILTER_H_ + +#include "vpx_ports/mem.h" +#include "vpx_config.h" +#include "vp9/common/vp9_blockd.h" + +#define MAX_LOOP_FILTER 63 +#define SIMD_WIDTH 16 + +/* Need to align this structure so when it is declared and + * passed it can be loaded into vector registers. + */ +typedef struct { + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + mblim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + blim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + lim[MAX_LOOP_FILTER + 1][SIMD_WIDTH]); + DECLARE_ALIGNED(SIMD_WIDTH, unsigned char, + hev_thr[4][SIMD_WIDTH]); + unsigned char lvl[MAX_MB_SEGMENTS][4][4]; + unsigned char mode_lf_lut[MB_MODE_COUNT]; +} loop_filter_info_n; + +struct loop_filter_info { + const unsigned char *mblim; + const unsigned char *blim; + const unsigned char *lim; + const unsigned char *hev_thr; +}; + +#define prototype_loopfilter(sym) \ + void sym(uint8_t *src, int pitch, const unsigned char *blimit, \ + const unsigned char *limit, const unsigned char *thresh, int count) + +#define prototype_loopfilter_block(sym) \ + void sym(uint8_t *y, uint8_t *u, uint8_t *v, \ + int ystride, int uv_stride, struct loop_filter_info *lfi) + +#if ARCH_X86 || ARCH_X86_64 +#include "x86/vp9_loopfilter_x86.h" +#endif + +typedef void loop_filter_uvfunction(uint8_t *u, /* source pointer */ + int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + uint8_t *v); + +/* assorted loopfilter functions which get used elsewhere */ +struct VP9Common; +struct macroblockd; + +void vp9_loop_filter_init(struct VP9Common *cm); + +void vp9_loop_filter_frame_init(struct VP9Common *cm, + struct macroblockd *mbd, + int default_filt_lvl); + +void vp9_loop_filter_frame(struct VP9Common *cm, + struct macroblockd *mbd, + int filter_level, + int y_only); + +void vp9_loop_filter_partial_frame(struct VP9Common 
*cm, + struct macroblockd *mbd, + int default_filt_lvl); + +void vp9_loop_filter_update_sharpness(loop_filter_info_n *lfi, + int sharpness_lvl); + +#endif // VP9_COMMON_VP9_LOOPFILTER_H_ diff --git a/libvpx/vp9/common/vp9_loopfilter_filters.c b/libvpx/vp9/common/vp9_loopfilter_filters.c new file mode 100644 index 000000000..0efbcafe0 --- /dev/null +++ b/libvpx/vp9/common/vp9_loopfilter_filters.c @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_config.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_onyxc_int.h" + +static INLINE int8_t signed_char_clamp(int t) { + return (int8_t)clamp(t, -128, 127); +} + +// should we apply any filter at all: 11111111 yes, 00000000 no +static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, + uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { + int8_t mask = 0; + mask |= (abs(p3 - p2) > limit) * -1; + mask |= (abs(p2 - p1) > limit) * -1; + mask |= (abs(p1 - p0) > limit) * -1; + mask |= (abs(q1 - q0) > limit) * -1; + mask |= (abs(q2 - q1) > limit) * -1; + mask |= (abs(q3 - q2) > limit) * -1; + mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + return ~mask; +} + +// is there high edge variance internal edge: 11111111 yes, 00000000 no +static INLINE int8_t hevmask(uint8_t thresh, uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1) { + int8_t hev = 0; + hev |= (abs(p1 - p0) > thresh) * -1; + hev |= (abs(q1 - q0) > thresh) * -1; + return hev; +} + +static INLINE void filter(int8_t mask, uint8_t hev, uint8_t *op1, + 
uint8_t *op0, uint8_t *oq0, uint8_t *oq1) { + int8_t filter1, filter2; + + const int8_t ps1 = (int8_t) *op1 ^ 0x80; + const int8_t ps0 = (int8_t) *op0 ^ 0x80; + const int8_t qs0 = (int8_t) *oq0 ^ 0x80; + const int8_t qs1 = (int8_t) *oq1 ^ 0x80; + + // add outer taps if we have high edge variance + int8_t filter = signed_char_clamp(ps1 - qs1) & hev; + + // inner taps + filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; + + // save bottom 3 bits so that we round one side +4 and the other +3 + // if it equals 4 we'll set to adjust by -1 to account for the fact + // we'd round 3 the other way + filter1 = signed_char_clamp(filter + 4) >> 3; + filter2 = signed_char_clamp(filter + 3) >> 3; + + *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80; + *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80; + + // outer tap adjustments + filter = ((filter1 + 1) >> 1) & ~hev; + + *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80; + *op1 = signed_char_clamp(ps1 + filter) ^ 0x80; +} + +void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8 * count; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); + filter(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } +} + +void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
+ for (i = 0; i < 8 * count; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); + filter(mask, hev, s - 2, s - 1, s, s + 1); + s += pitch; + } +} + +static INLINE int8_t flatmask4(uint8_t thresh, + uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, + uint8_t q2, uint8_t q3) { + int8_t flat = 0; + flat |= (abs(p1 - p0) > thresh) * -1; + flat |= (abs(q1 - q0) > thresh) * -1; + flat |= (abs(p0 - p2) > thresh) * -1; + flat |= (abs(q0 - q2) > thresh) * -1; + flat |= (abs(p3 - p0) > thresh) * -1; + flat |= (abs(q3 - q0) > thresh) * -1; + return ~flat; +} +static INLINE signed char flatmask5(uint8_t thresh, + uint8_t p4, uint8_t p3, uint8_t p2, + uint8_t p1, uint8_t p0, + uint8_t q0, uint8_t q1, uint8_t q2, + uint8_t q3, uint8_t q4) { + int8_t flat = 0; + flat |= (abs(p4 - p0) > thresh) * -1; + flat |= (abs(q4 - q0) > thresh) * -1; + flat = ~flat; + return flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3); +} + + +static INLINE void mbfilter(int8_t mask, uint8_t hev, uint8_t flat, + uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, + uint8_t *oq0, uint8_t *oq1, + uint8_t *oq2, uint8_t *oq3) { + // use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line + if (flat && mask) { + const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0; + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3; + + *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3); + *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3); + *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3); + *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3); + *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3); + *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3); + } 
else { + filter(mask, hev, op1, op0, oq0, oq1); + } +} + +void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. + for (i = 0; i < 8 * count; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); + const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + mbfilter(mask, hev, flat, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p); + ++s; + } +} + +void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, + int count) { + int i; + + for (i = 0; i < 8 * count; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t hev = hevmask(thresh[0], p1, p0, q0, q1); + const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + mbfilter(mask, hev, flat, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3); + s += pitch; + } +} + +static INLINE void wide_mbfilter(int8_t mask, uint8_t hev, + uint8_t flat, uint8_t flat2, + uint8_t *op7, uint8_t *op6, uint8_t *op5, + uint8_t *op4, uint8_t *op3, uint8_t *op2, + uint8_t *op1, uint8_t *op0, uint8_t *oq0, + uint8_t *oq1, uint8_t *oq2, uint8_t *oq3, + uint8_t *oq4, uint8_t *oq5, uint8_t *oq6, + uint8_t *oq7) { + // use a 15 tap filter [1,1,1,1,1,1,1,2,1,1,1,1,1,1,1] for flat line + if (flat2 && flat && mask) { + const uint8_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4, + p3 = 
*op3, p2 = *op2, p1 = *op1, p0 = *op0; + + const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, + q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7; + + *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + + q0, 4); + *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 + p2 + p1 + p0 + + q0 + q1, 4); + *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 + p1 + p0 + + q0 + q1 + q2, 4); + *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 + p1 + p0 + + q0 + q1 + q2 + q3, 4); + *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 + p0 + + q0 + q1 + q2 + q3 + q4, 4); + *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 + + q0 + q1 + q2 + q3 + q4 + q5, 4); + *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 + + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4); + *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + + q0 * 2 + q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); + *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + + q0 + q1 * 2 + q2 + q3 + q4 + q5 + q6 + q7 * 2, 4); + *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + + q0 + q1 + q2 * 2 + q3 + q4 + q5 + q6 + q7 * 3, 4); + *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + + q0 + q1 + q2 + q3 * 2 + q4 + q5 + q6 + q7 * 4, 4); + *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + + q0 + q1 + q2 + q3 + q4 * 2 + q5 + q6 + q7 * 5, 4); + *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + + q0 + q1 + q2 + q3 + q4 + q5 * 2 + q6 + q7 * 6, 4); + *oq6 = ROUND_POWER_OF_TWO(p0 + + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 + q7 * 7, 4); + } else { + mbfilter(mask, hev, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3); + } +} + +void vp9_mb_lpf_horizontal_edge_w_c(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + int i; + + // loop filter designed to work using chars so that we can make maximum use + // of 8 bit simd instructions. 
+ for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); + const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flatmask5(1, + s[-8 * p], s[-7 * p], s[-6 * p], s[-5 * p], p0, + q0, s[4 * p], s[5 * p], s[6 * p], s[7 * p]); + + wide_mbfilter(mask, hev, flat, flat2, + s - 8 * p, s - 7 * p, s - 6 * p, s - 5 * p, + s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p, + s + 4 * p, s + 5 * p, s + 6 * p, s + 7 * p); + + ++s; + } +} + +void vp9_mb_lpf_vertical_edge_w_c(uint8_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh) { + int i; + + for (i = 0; i < 8; ++i) { + const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; + const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t hev = hevmask(*thresh, p1, p0, q0, q1); + const int8_t flat = flatmask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t flat2 = flatmask5(1, s[-8], s[-7], s[-6], s[-5], p0, + q0, s[4], s[5], s[6], s[7]); + + wide_mbfilter(mask, hev, flat, flat2, + s - 8, s - 7, s - 6, s - 5, s - 4, s - 3, s - 2, s - 1, + s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6, s + 7); + s += p; + } +} diff --git a/libvpx/vp9/common/vp9_maskingmv.c b/libvpx/vp9/common/vp9_maskingmv.c new file mode 100644 index 000000000..326201bbe --- /dev/null +++ b/libvpx/vp9/common/vp9_maskingmv.c @@ -0,0 +1,803 @@ +/* + ============================================================================ + Name : vp9_maskingmv.c + Author : jimbankoski + Version : + Copyright : Your copyright notice + Description : Hello World in C, Ansi-style + 
============================================================================ + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +unsigned int vp9_sad16x16_sse3( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + int max_err); + +int vp8_growmaskmb_sse3( + unsigned char *om, + unsigned char *nm); + +void vp8_makemask_sse3( + unsigned char *y, + unsigned char *u, + unsigned char *v, + unsigned char *ym, + int yp, + int uvp, + int ys, + int us, + int vs, + int yt, + int ut, + int vt); + +unsigned int vp9_sad16x16_unmasked_wmt( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + unsigned char *mask); + +unsigned int vp9_sad16x16_masked_wmt( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + unsigned char *mask); + +unsigned int vp8_masked_predictor_wmt( + unsigned char *masked, + unsigned char *unmasked, + int src_stride, + unsigned char *dst_ptr, + int dst_stride, + unsigned char *mask); +unsigned int vp8_masked_predictor_uv_wmt( + unsigned char *masked, + unsigned char *unmasked, + int src_stride, + unsigned char *dst_ptr, + int dst_stride, + unsigned char *mask); +unsigned int vp8_uv_from_y_mask( + unsigned char *ymask, + unsigned char *uvmask); +int yp = 16; +unsigned char sxy[] = { + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 
90, 90, + 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 80, 120, 120, 90, 90, 90, 90, 90, 80, 120, 120, 90, 90, 90, 90, 90 +}; + +unsigned char sts[] = { + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, +}; +unsigned char str[] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; + +unsigned char y[] = { + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, + 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, + 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, + 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, + 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, + 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, + 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, + 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, 40, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 
40, 40, 60, 60, 60, 60, 40, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40, + 40, 40, 40, 60, 60, 60, 60, 40, 40, 40, 40, 60, 60, 60, 60, 40 +}; +int uvp = 8; +unsigned char u[] = { + 90, 80, 70, 70, 90, 90, 90, 17, + 90, 80, 70, 70, 90, 90, 90, 17, + 84, 70, 70, 90, 90, 90, 17, 17, + 84, 70, 70, 90, 90, 90, 17, 17, + 80, 70, 70, 90, 90, 90, 17, 17, + 90, 80, 70, 70, 90, 90, 90, 17, + 90, 80, 70, 70, 90, 90, 90, 17, + 90, 80, 70, 70, 90, 90, 90, 17 +}; + +unsigned char v[] = { + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80 +}; + +unsigned char ym[256]; +unsigned char uvm[64]; +typedef struct { + unsigned char y; + unsigned char yt; + unsigned char u; + unsigned char ut; + unsigned char v; + unsigned char vt; + unsigned char use; +} COLOR_SEG_ELEMENT; + +/* +COLOR_SEG_ELEMENT segmentation[]= +{ + { 60,4,80,17,80,10, 1}, + { 40,4,15,10,80,10, 1}, +}; +*/ + +COLOR_SEG_ELEMENT segmentation[] = { + { 79, 44, 92, 44, 237, 60, 1}, +}; + +unsigned char pixel_mask(unsigned char y, unsigned char u, unsigned char v, + COLOR_SEG_ELEMENT sgm[], + int c) { + COLOR_SEG_ELEMENT *s = sgm; + unsigned char m = 0; + int i; + for (i = 0; i < c; i++, s++) + m |= (abs(y - s->y) < s->yt && + abs(u - s->u) < s->ut && + abs(v - s->v) < s->vt ? 
255 : 0); + + return m; +} +int neighbors[256][8]; +int makeneighbors(void) { + int i, j; + for (i = 0; i < 256; i++) { + int r = (i >> 4), c = (i & 15); + int ni = 0; + for (j = 0; j < 8; j++) + neighbors[i][j] = i; + for (j = 0; j < 256; j++) { + int nr = (j >> 4), nc = (j & 15); + if (abs(nr - r) < 2 && abs(nc - c) < 2) + neighbors[i][ni++] = j; + } + } + return 0; +} +void grow_ymask(unsigned char *ym) { + unsigned char nym[256]; + int i, j; + + for (i = 0; i < 256; i++) { + nym[i] = ym[i]; + for (j = 0; j < 8; j++) { + nym[i] |= ym[neighbors[i][j]]; + } + } + for (i = 0; i < 256; i++) + ym[i] = nym[i]; +} + +void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v, + unsigned char *ym, unsigned char *uvm, + int yp, int uvp, + COLOR_SEG_ELEMENT sgm[], + int count) { + int r, c; + unsigned char *oym = ym; + + memset(ym, 20, 256); + for (r = 0; r < 8; r++, uvm += 8, u += uvp, v += uvp, y += (yp + yp), ym += 32) + for (c = 0; c < 8; c++) { + int y1 = y[c << 1]; + int u1 = u[c]; + int v1 = v[c]; + int m = pixel_mask(y1, u1, v1, sgm, count); + uvm[c] = m; + ym[c << 1] = uvm[c]; // = pixel_mask(y[c<<1],u[c],v[c],sgm,count); + ym[(c << 1) + 1] = pixel_mask(y[1 + (c << 1)], u[c], v[c], sgm, count); + ym[(c << 1) + 16] = pixel_mask(y[yp + (c << 1)], u[c], v[c], sgm, count); + ym[(c << 1) + 17] = pixel_mask(y[1 + yp + (c << 1)], u[c], v[c], sgm, count); + } + grow_ymask(oym); +} + +int masked_sad(unsigned char *src, int p, unsigned char *dst, int dp, + unsigned char *ym) { + int i, j; + unsigned sad = 0; + for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16) + for (j = 0; j < 16; j++) + if (ym[j]) + sad += abs(src[j] - dst[j]); + + return sad; +} + +int compare_masks(unsigned char *sym, unsigned char *ym) { + int i, j; + unsigned sad = 0; + for (i = 0; i < 16; i++, sym += 16, ym += 16) + for (j = 0; j < 16; j++) + sad += (sym[j] != ym[j] ? 
1 : 0); + + return sad; +} + +int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp, + unsigned char *ym) { + int i, j; + unsigned sad = 0; + for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16) + for (j = 0; j < 16; j++) + if (!ym[j]) + sad += abs(src[j] - dst[j]); + + return sad; +} + +int masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v, + int yp, int uvp, + unsigned char *dy, unsigned char *du, unsigned char *dv, + int dyp, int duvp, + COLOR_SEG_ELEMENT sgm[], + int count, + int *mi, + int *mj, + int *ui, + int *uj, + int *wm) { + int i, j; + + unsigned char ym[256]; + unsigned char uvm[64]; + unsigned char dym[256]; + unsigned char duvm[64]; + unsigned int e = 0; + int beste = 256; + int bmi = -32, bmj = -32; + int bui = -32, buj = -32; + int beste1 = 256; + int bmi1 = -32, bmj1 = -32; + int bui1 = -32, buj1 = -32; + int obeste; + + // first try finding best mask and then unmasked + beste = 0xffffffff; + + // find best unmasked mv + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + unsigned char *duz = i / 2 * duvp + du; + unsigned char *dvz = i / 2 * duvp + dv; + for (j = -32; j < 32; j++) { + // 0,0 masked destination + make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count); + + e = unmasked_sad(y, yp, dyz + j, dyp, dym); + + if (e < beste) { + bui = i; + buj = j; + beste = e; + } + } + } + // bui=0;buj=0; + // best mv masked destination + make_mb_mask(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2, + dym, duvm, dyp, duvp, sgm, count); + + obeste = beste; + beste = 0xffffffff; + + // find best masked + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + for (j = -32; j < 32; j++) { + e = masked_sad(y, yp, dyz + j, dyp, dym); + + if (e < beste) { + bmi = i; + bmj = j; + beste = e; + } + } + } + beste1 = beste + obeste; + bmi1 = bmi; + bmj1 = bmj; + bui1 = bui; + buj1 = buj; + + beste = 0xffffffff; + // source mask + 
make_mb_mask(y, u, v, ym, uvm, yp, uvp, sgm, count); + + // find best mask + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + unsigned char *duz = i / 2 * duvp + du; + unsigned char *dvz = i / 2 * duvp + dv; + for (j = -32; j < 32; j++) { + // 0,0 masked destination + make_mb_mask(dyz + j, duz + j / 2, dvz + j / 2, dym, duvm, dyp, duvp, sgm, count); + + e = compare_masks(ym, dym); + + if (e < beste) { + bmi = i; + bmj = j; + beste = e; + } + } + } + + + // best mv masked destination + make_mb_mask(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2, + dym, duvm, dyp, duvp, sgm, count); + + obeste = masked_sad(y, yp, dy + bmi * dyp + bmj, dyp, dym); + + beste = 0xffffffff; + + // find best unmasked mv + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + for (j = -32; j < 32; j++) { + e = unmasked_sad(y, yp, dyz + j, dyp, dym); + + if (e < beste) { + bui = i; + buj = j; + beste = e; + } + } + } + beste += obeste; + + + if (beste < beste1) { + *mi = bmi; + *mj = bmj; + *ui = bui; + *uj = buj; + *wm = 1; + } else { + *mi = bmi1; + *mj = bmj1; + *ui = bui1; + *uj = buj1; + *wm = 0; + + } + return 0; +} + +int predict(unsigned char *src, int p, unsigned char *dst, int dp, + unsigned char *ym, unsigned char *prd) { + int i, j; + for (i = 0; i < 16; i++, src += p, dst += dp, ym += 16, prd += 16) + for (j = 0; j < 16; j++) + prd[j] = (ym[j] ? 
src[j] : dst[j]); + return 0; +} + +int fast_masked_motion_search(unsigned char *y, unsigned char *u, unsigned char *v, + int yp, int uvp, + unsigned char *dy, unsigned char *du, unsigned char *dv, + int dyp, int duvp, + COLOR_SEG_ELEMENT sgm[], + int count, + int *mi, + int *mj, + int *ui, + int *uj, + int *wm) { + int i, j; + + unsigned char ym[256]; + unsigned char ym2[256]; + unsigned char uvm[64]; + unsigned char dym2[256]; + unsigned char dym[256]; + unsigned char duvm[64]; + unsigned int e = 0; + int beste = 256; + int bmi = -32, bmj = -32; + int bui = -32, buj = -32; + int beste1 = 256; + int bmi1 = -32, bmj1 = -32; + int bui1 = -32, buj1 = -32; + int obeste; + + // first try finding best mask and then unmasked + beste = 0xffffffff; + +#if 0 + for (i = 0; i < 16; i++) { + unsigned char *dy = i * yp + y; + for (j = 0; j < 16; j++) + printf("%2x", dy[j]); + printf("\n"); + } + printf("\n"); + + for (i = -32; i < 48; i++) { + unsigned char *dyz = i * dyp + dy; + for (j = -32; j < 48; j++) + printf("%2x", dyz[j]); + printf("\n"); + } +#endif + + // find best unmasked mv + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + unsigned char *duz = i / 2 * duvp + du; + unsigned char *dvz = i / 2 * duvp + dv; + for (j = -32; j < 32; j++) { + // 0,0 masked destination + vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(dym, dym2); + + e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2); + + if (e < beste) { + bui = i; + buj = j; + beste = e; + } + } + } + // bui=0;buj=0; + // best mv masked destination + + vp8_makemask_sse3(dy + bui * dyp + buj, du + bui / 2 * duvp + buj / 2, dv + bui / 2 * duvp + buj / 2, + dym, dyp, duvp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(dym, dym2); + + obeste = beste; + beste = 0xffffffff; + + // find best masked + for (i = -32; i < 32; i++) { + unsigned char 
*dyz = i * dyp + dy; + for (j = -32; j < 32; j++) { + e = vp9_sad16x16_masked_wmt(y, yp, dyz + j, dyp, dym2); + if (e < beste) { + bmi = i; + bmj = j; + beste = e; + } + } + } + beste1 = beste + obeste; + bmi1 = bmi; + bmj1 = bmj; + bui1 = bui; + buj1 = buj; + + // source mask + vp8_makemask_sse3(y, u, v, + ym, yp, uvp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(ym, ym2); + + // find best mask + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + unsigned char *duz = i / 2 * duvp + du; + unsigned char *dvz = i / 2 * duvp + dv; + for (j = -32; j < 32; j++) { + // 0,0 masked destination + vp8_makemask_sse3(dyz + j, duz + j / 2, dvz + j / 2, dym, dyp, duvp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(dym, dym2); + + e = compare_masks(ym2, dym2); + + if (e < beste) { + bmi = i; + bmj = j; + beste = e; + } + } + } + + vp8_makemask_sse3(dy + bmi * dyp + bmj, du + bmi / 2 * duvp + bmj / 2, dv + bmi / 2 * duvp + bmj / 2, + dym, dyp, duvp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(dym, dym2); + + obeste = vp9_sad16x16_masked_wmt(y, yp, dy + bmi * dyp + bmj, dyp, dym2); + + beste = 0xffffffff; + + // find best unmasked mv + for (i = -32; i < 32; i++) { + unsigned char *dyz = i * dyp + dy; + for (j = -32; j < 32; j++) { + e = vp9_sad16x16_unmasked_wmt(y, yp, dyz + j, dyp, dym2); + + if (e < beste) { + bui = i; + buj = j; + beste = e; + } + } + } + beste += obeste; + + if (beste < beste1) { + *mi = bmi; + *mj = bmj; + *ui = bui; + *uj = buj; + *wm = 1; + } else { + *mi = bmi1; + *mj = bmj1; + *ui = bui1; + *uj = buj1; + *wm = 0; + beste = beste1; + + } + return beste; +} + +int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm, + int ymp, int uvmp, + unsigned char *yp, unsigned char *up, unsigned char *vp, + int ypp, int uvpp, + COLOR_SEG_ELEMENT sgm[], + int count, + int mi, + int mj, + int ui, + 
int uj, + int wm) { + int i, j; + unsigned char dym[256]; + unsigned char dym2[256]; + unsigned char duvm[64]; + unsigned char *yu = ym, *uu = um, *vu = vm; + + unsigned char *dym3 = dym2; + + ym += mi * ymp + mj; + um += mi / 2 * uvmp + mj / 2; + vm += mi / 2 * uvmp + mj / 2; + + yu += ui * ymp + uj; + uu += ui / 2 * uvmp + uj / 2; + vu += ui / 2 * uvmp + uj / 2; + + // best mv masked destination + if (wm) + vp8_makemask_sse3(ym, um, vm, dym, ymp, uvmp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + else + vp8_makemask_sse3(yu, uu, vu, dym, ymp, uvmp, + sgm[0].y, sgm[0].u, sgm[0].v, + sgm[0].yt, sgm[0].ut, sgm[0].vt); + + vp8_growmaskmb_sse3(dym, dym2); + vp8_masked_predictor_wmt(ym, yu, ymp, yp, ypp, dym3); + vp8_uv_from_y_mask(dym3, duvm); + vp8_masked_predictor_uv_wmt(um, uu, uvmp, up, uvpp, duvm); + vp8_masked_predictor_uv_wmt(vm, vu, uvmp, vp, uvpp, duvm); + + return 0; +} + +unsigned char f0p[1280 * 720 * 3 / 2]; +unsigned char f1p[1280 * 720 * 3 / 2]; +unsigned char prd[1280 * 720 * 3 / 2]; +unsigned char msk[1280 * 720 * 3 / 2]; + + +int mainz(int argc, char *argv[]) { + + FILE *f = fopen(argv[1], "rb"); + FILE *g = fopen(argv[2], "wb"); + int w = atoi(argv[3]), h = atoi(argv[4]); + int y_stride = w, uv_stride = w / 2; + int r, c; + unsigned char *f0 = f0p, *f1 = f1p, *t; + unsigned char ym[256], uvm[64]; + unsigned char ym2[256], uvm2[64]; + unsigned char ym3[256], uvm3[64]; + int a, b; + + COLOR_SEG_ELEMENT last = { 20, 20, 20, 20, 230, 20, 1}, best; +#if 0 + makeneighbors(); + COLOR_SEG_ELEMENT segmentation[] = { + { 60, 4, 80, 17, 80, 10, 1}, + { 40, 4, 15, 10, 80, 10, 1}, + }; + make_mb_mask(y, u, v, ym2, uvm2, 16, 8, segmentation, 1); + + vp8_makemask_sse3(y, u, v, ym, (int) 16, (int) 8, + (int) segmentation[0].y, (int) segmentation[0].u, (int) segmentation[0].v, + segmentation[0].yt, segmentation[0].ut, segmentation[0].vt); + + vp8_growmaskmb_sse3(ym, ym3); + + a = vp9_sad16x16_masked_wmt(str, 16, sts, 16, ym3); + b = 
vp9_sad16x16_unmasked_wmt(str, 16, sts, 16, ym3); + + vp8_masked_predictor_wmt(str, sts, 16, ym, 16, ym3); + + vp8_uv_from_y_mask(ym3, uvm3); + + return 4; +#endif + makeneighbors(); + + + memset(prd, 128, w * h * 3 / 2); + + fread(f0, w * h * 3 / 2, 1, f); + + while (!feof(f)) { + unsigned char *ys = f1, *yd = f0, *yp = prd; + unsigned char *us = f1 + w * h, *ud = f0 + w * h, *up = prd + w * h; + unsigned char *vs = f1 + w * h * 5 / 4, *vd = f0 + w * h * 5 / 4, *vp = prd + w * h * 5 / 4; + fread(f1, w * h * 3 / 2, 1, f); + + ys += 32 * y_stride; + yd += 32 * y_stride; + yp += 32 * y_stride; + us += 16 * uv_stride; + ud += 16 * uv_stride; + up += 16 * uv_stride; + vs += 16 * uv_stride; + vd += 16 * uv_stride; + vp += 16 * uv_stride; + for (r = 32; r < h - 32; r += 16, + ys += 16 * w, yd += 16 * w, yp += 16 * w, + us += 8 * uv_stride, ud += 8 * uv_stride, up += 8 * uv_stride, + vs += 8 * uv_stride, vd += 8 * uv_stride, vp += 8 * uv_stride) { + for (c = 32; c < w - 32; c += 16) { + int mi, mj, ui, uj, wm; + int bmi, bmj, bui, buj, bwm; + unsigned char ym[256]; + + if (vp9_sad16x16_sse3(ys + c, y_stride, yd + c, y_stride, 0xffff) == 0) + bmi = bmj = bui = buj = bwm = 0; + else { + COLOR_SEG_ELEMENT cs[5]; + int j; + unsigned int beste = 0xfffffff; + unsigned int bestj = 0; + + // try color from last mb segmentation + cs[0] = last; + + // try color segs from 4 pixels in mb recon as segmentation + cs[1].y = yd[c + y_stride + 1]; + cs[1].u = ud[c / 2 + uv_stride]; + cs[1].v = vd[c / 2 + uv_stride]; + cs[1].yt = cs[1].ut = cs[1].vt = 20; + cs[2].y = yd[c + w + 14]; + cs[2].u = ud[c / 2 + uv_stride + 7]; + cs[2].v = vd[c / 2 + uv_stride + 7]; + cs[2].yt = cs[2].ut = cs[2].vt = 20; + cs[3].y = yd[c + w * 14 + 1]; + cs[3].u = ud[c / 2 + uv_stride * 7]; + cs[3].v = vd[c / 2 + uv_stride * 7]; + cs[3].yt = cs[3].ut = cs[3].vt = 20; + cs[4].y = yd[c + w * 14 + 14]; + cs[4].u = ud[c / 2 + uv_stride * 7 + 7]; + cs[4].v = vd[c / 2 + uv_stride * 7 + 7]; + cs[4].yt = cs[4].ut = 
cs[4].vt = 20; + + for (j = 0; j < 5; j++) { + int e; + + e = fast_masked_motion_search( + ys + c, us + c / 2, vs + c / 2, y_stride, uv_stride, + yd + c, ud + c / 2, vd + c / 2, y_stride, uv_stride, + &cs[j], 1, &mi, &mj, &ui, &uj, &wm); + + if (e < beste) { + bmi = mi; + bmj = mj; + bui = ui; + buj = uj, bwm = wm; + bestj = j; + beste = e; + } + } + best = cs[bestj]; + // best = segmentation[0]; + last = best; + } + predict_all(yd + c, ud + c / 2, vd + c / 2, w, uv_stride, + yp + c, up + c / 2, vp + c / 2, w, uv_stride, + &best, 1, bmi, bmj, bui, buj, bwm); + + } + } + fwrite(prd, w * h * 3 / 2, 1, g); + t = f0; + f0 = f1; + f1 = t; + + } + fclose(f); + fclose(g); + return 0; +} diff --git a/libvpx/vp9/common/vp9_mbpitch.c b/libvpx/vp9/common/vp9_mbpitch.c new file mode 100644 index 000000000..3cf37ffab --- /dev/null +++ b/libvpx/vp9/common/vp9_mbpitch.c @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp9/common/vp9_blockd.h" + +void vp9_setup_block_dptrs(MACROBLOCKD *mb, + int subsampling_x, int subsampling_y) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) { + mb->plane[i].plane_type = i ? PLANE_TYPE_UV : PLANE_TYPE_Y_WITH_DC; + mb->plane[i].subsampling_x = i ? subsampling_x : 0; + mb->plane[i].subsampling_y = i ? 
subsampling_y : 0; + } +#if CONFIG_ALPHA + // TODO(jkoleszar): Using the Y w/h for now + mb->plane[3].subsampling_x = 0; + mb->plane[3].subsampling_y = 0; +#endif +} diff --git a/libvpx/vp9/common/vp9_modecont.c b/libvpx/vp9/common/vp9_modecont.c new file mode 100644 index 000000000..5d92cfa00 --- /dev/null +++ b/libvpx/vp9/common/vp9_modecont.c @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp9/common/vp9_modecont.h" + +const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS] + [VP9_INTER_MODES - 1] = { + {2, 173, 34}, // 0 = both zero mv + {7, 145, 85}, // 1 = one zero mv + one a predicted mv + {7, 166, 63}, // 2 = two predicted mvs + {7, 94, 66}, // 3 = one predicted/zero and one new mv + {8, 64, 46}, // 4 = two new mvs + {17, 81, 31}, // 5 = one intra neighbour + x + {25, 29, 30}, // 6 = two intra neighbours +}; diff --git a/libvpx/vp9/common/vp9_modecont.h b/libvpx/vp9/common/vp9_modecont.h new file mode 100644 index 000000000..3ec607947 --- /dev/null +++ b/libvpx/vp9/common/vp9_modecont.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_MODECONT_H_ +#define VP9_COMMON_VP9_MODECONT_H_ + +#include "vp9/common/vp9_entropy.h" + +extern const vp9_prob vp9_default_inter_mode_probs[INTER_MODE_CONTEXTS] + [VP9_INTER_MODES - 1]; + +#endif // VP9_COMMON_VP9_MODECONT_H_ diff --git a/libvpx/vp9/common/vp9_modecontext.c b/libvpx/vp9/common/vp9_modecontext.c new file mode 100644 index 000000000..a79ab2a6c --- /dev/null +++ b/libvpx/vp9/common/vp9_modecontext.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vp9/common/vp9_entropymode.h" + +const vp9_prob vp9_kf_default_bmode_probs[VP9_INTRA_MODES] + [VP9_INTRA_MODES] + [VP9_INTRA_MODES - 1] = { + { /* above = dc */ + { 137, 30, 42, 148, 151, 207, 70, 52, 91 } /* left = dc */, + { 92, 45, 102, 136, 116, 180, 74, 90, 100 } /* left = v */, + { 73, 32, 19, 187, 222, 215, 46, 34, 100 } /* left = h */, + { 91, 30, 32, 116, 121, 186, 93, 86, 94 } /* left = d45 */, + { 72, 35, 36, 149, 68, 206, 68, 63, 105 } /* left = d135 */, + { 73, 31, 28, 138, 57, 124, 55, 122, 151 } /* left = d117 */, + { 67, 23, 21, 140, 126, 197, 40, 37, 171 } /* left = d153 */, + { 86, 27, 28, 128, 154, 212, 45, 43, 53 } /* left = d27 */, + { 74, 32, 27, 107, 86, 160, 63, 134, 102 } /* left = d63 */, + { 59, 67, 44, 140, 161, 202, 78, 67, 119 } /* left = tm */ + }, { /* above = v */ + { 63, 36, 126, 146, 123, 158, 60, 90, 96 } /* left = dc */, + { 43, 46, 168, 134, 107, 128, 69, 142, 92 } /* left = v */, + { 44, 29, 68, 159, 201, 177, 50, 57, 77 } /* left = h */, + { 58, 38, 76, 114, 97, 172, 78, 133, 92 } /* left = d45 */, + { 46, 41, 76, 140, 63, 184, 69, 112, 57 } /* 
left = d135 */, + { 38, 32, 85, 140, 46, 112, 54, 151, 133 } /* left = d117 */, + { 39, 27, 61, 131, 110, 175, 44, 75, 136 } /* left = d153 */, + { 52, 30, 74, 113, 130, 175, 51, 64, 58 } /* left = d27 */, + { 47, 35, 80, 100, 74, 143, 64, 163, 74 } /* left = d63 */, + { 36, 61, 116, 114, 128, 162, 80, 125, 82 } /* left = tm */ + }, { /* above = h */ + { 82, 26, 26, 171, 208, 204, 44, 32, 105 } /* left = dc */, + { 55, 44, 68, 166, 179, 192, 57, 57, 108 } /* left = v */, + { 42, 26, 11, 199, 241, 228, 23, 15, 85 } /* left = h */, + { 68, 42, 19, 131, 160, 199, 55, 52, 83 } /* left = d45 */, + { 58, 50, 25, 139, 115, 232, 39, 52, 118 } /* left = d135 */, + { 50, 35, 33, 153, 104, 162, 64, 59, 131 } /* left = d117 */, + { 44, 24, 16, 150, 177, 202, 33, 19, 156 } /* left = d153 */, + { 55, 27, 12, 153, 203, 218, 26, 27, 49 } /* left = d27 */, + { 53, 49, 21, 110, 116, 168, 59, 80, 76 } /* left = d63 */, + { 38, 72, 19, 168, 203, 212, 50, 50, 107 } /* left = tm */ + }, { /* above = d45 */ + { 103, 26, 36, 129, 132, 201, 83, 80, 93 } /* left = dc */, + { 59, 38, 83, 112, 103, 162, 98, 136, 90 } /* left = v */, + { 62, 30, 23, 158, 200, 207, 59, 57, 50 } /* left = h */, + { 67, 30, 29, 84, 86, 191, 102, 91, 59 } /* left = d45 */, + { 60, 32, 33, 112, 71, 220, 64, 89, 104 } /* left = d135 */, + { 53, 26, 34, 130, 56, 149, 84, 120, 103 } /* left = d117 */, + { 53, 21, 23, 133, 109, 210, 56, 77, 172 } /* left = d153 */, + { 77, 19, 29, 112, 142, 228, 55, 66, 36 } /* left = d27 */, + { 61, 29, 29, 93, 97, 165, 83, 175, 162 } /* left = d63 */, + { 47, 47, 43, 114, 137, 181, 100, 99, 95 } /* left = tm */ + }, { /* above = d135 */ + { 69, 23, 29, 128, 83, 199, 46, 44, 101 } /* left = dc */, + { 53, 40, 55, 139, 69, 183, 61, 80, 110 } /* left = v */, + { 40, 29, 19, 161, 180, 207, 43, 24, 91 } /* left = h */, + { 60, 34, 19, 105, 61, 198, 53, 64, 89 } /* left = d45 */, + { 52, 31, 22, 158, 40, 209, 58, 62, 89 } /* left = d135 */, + { 44, 31, 29, 147, 46, 158, 56, 102, 198 } /* 
left = d117 */, + { 35, 19, 12, 135, 87, 209, 41, 45, 167 } /* left = d153 */, + { 55, 25, 21, 118, 95, 215, 38, 39, 66 } /* left = d27 */, + { 51, 38, 25, 113, 58, 164, 70, 93, 97 } /* left = d63 */, + { 47, 54, 34, 146, 108, 203, 72, 103, 151 } /* left = tm */ + }, { /* above = d117 */ + { 64, 19, 37, 156, 66, 138, 49, 95, 133 } /* left = dc */, + { 46, 27, 80, 150, 55, 124, 55, 121, 135 } /* left = v */, + { 36, 23, 27, 165, 149, 166, 54, 64, 118 } /* left = h */, + { 53, 21, 36, 131, 63, 163, 60, 109, 81 } /* left = d45 */, + { 40, 26, 35, 154, 40, 185, 51, 97, 123 } /* left = d135 */, + { 35, 19, 34, 179, 19, 97, 48, 129, 124 } /* left = d117 */, + { 36, 20, 26, 136, 62, 164, 33, 77, 154 } /* left = d153 */, + { 45, 18, 32, 130, 90, 157, 40, 79, 91 } /* left = d27 */, + { 45, 26, 28, 129, 45, 129, 49, 147, 123 } /* left = d63 */, + { 38, 44, 51, 136, 74, 162, 57, 97, 121 } /* left = tm */ + }, { /* above = d153 */ + { 75, 17, 22, 136, 138, 185, 32, 34, 166 } /* left = dc */, + { 56, 39, 58, 133, 117, 173, 48, 53, 187 } /* left = v */, + { 35, 21, 12, 161, 212, 207, 20, 23, 145 } /* left = h */, + { 56, 29, 19, 117, 109, 181, 55, 68, 112 } /* left = d45 */, + { 47, 29, 17, 153, 64, 220, 59, 51, 114 } /* left = d135 */, + { 46, 16, 24, 136, 76, 147, 41, 64, 172 } /* left = d117 */, + { 34, 17, 11, 108, 152, 187, 13, 15, 209 } /* left = d153 */, + { 51, 24, 14, 115, 133, 209, 32, 26, 104 } /* left = d27 */, + { 55, 30, 18, 122, 79, 179, 44, 88, 116 } /* left = d63 */, + { 37, 49, 25, 129, 168, 164, 41, 54, 148 } /* left = tm */ + }, { /* above = d27 */ + { 82, 22, 32, 127, 143, 213, 39, 41, 70 } /* left = dc */, + { 62, 44, 61, 123, 105, 189, 48, 57, 64 } /* left = v */, + { 47, 25, 17, 175, 222, 220, 24, 30, 86 } /* left = h */, + { 68, 36, 17, 106, 102, 206, 59, 74, 74 } /* left = d45 */, + { 57, 39, 23, 151, 68, 216, 55, 63, 58 } /* left = d135 */, + { 49, 30, 35, 141, 70, 168, 82, 40, 115 } /* left = d117 */, + { 51, 25, 15, 136, 129, 202, 38, 35, 139 } /* 
left = d153 */, + { 68, 26, 16, 111, 141, 215, 29, 28, 28 } /* left = d27 */, + { 59, 39, 19, 114, 75, 180, 77, 104, 42 } /* left = d63 */, + { 40, 61, 26, 126, 152, 206, 61, 59, 93 } /* left = tm */ + }, { /* above = d63 */ + { 78, 23, 39, 111, 117, 170, 74, 124, 94 } /* left = dc */, + { 48, 34, 86, 101, 92, 146, 78, 179, 134 } /* left = v */, + { 47, 22, 24, 138, 187, 178, 68, 69, 59 } /* left = h */, + { 56, 25, 33, 105, 112, 187, 95, 177, 129 } /* left = d45 */, + { 48, 31, 27, 114, 63, 183, 82, 116, 56 } /* left = d135 */, + { 43, 28, 37, 121, 63, 123, 61, 192, 169 } /* left = d117 */, + { 42, 17, 24, 109, 97, 177, 56, 76, 122 } /* left = d153 */, + { 58, 18, 28, 105, 139, 182, 70, 92, 63 } /* left = d27 */, + { 46, 23, 32, 74, 86, 150, 67, 183, 88 } /* left = d63 */, + { 36, 38, 48, 92, 122, 165, 88, 137, 91 } /* left = tm */ + }, { /* above = tm */ + { 65, 70, 60, 155, 159, 199, 61, 60, 81 } /* left = dc */, + { 44, 78, 115, 132, 119, 173, 71, 112, 93 } /* left = v */, + { 39, 38, 21, 184, 227, 206, 42, 32, 64 } /* left = h */, + { 58, 47, 36, 124, 137, 193, 80, 82, 78 } /* left = d45 */, + { 49, 50, 35, 144, 95, 205, 63, 78, 59 } /* left = d135 */, + { 41, 53, 52, 148, 71, 142, 65, 128, 51 } /* left = d117 */, + { 40, 36, 28, 143, 143, 202, 40, 55, 137 } /* left = d153 */, + { 52, 34, 29, 129, 183, 227, 42, 35, 43 } /* left = d27 */, + { 42, 44, 44, 104, 105, 164, 64, 130, 80 } /* left = d63 */, + { 43, 81, 53, 140, 169, 204, 68, 84, 72 } /* left = tm */ + } +}; diff --git a/libvpx/vp9/common/vp9_mv.h b/libvpx/vp9/common/vp9_mv.h new file mode 100644 index 000000000..a1eef4649 --- /dev/null +++ b/libvpx/vp9/common/vp9_mv.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_MV_H_ +#define VP9_COMMON_VP9_MV_H_ + +#include "vpx/vpx_integer.h" + +typedef struct { + int16_t row; + int16_t col; +} MV; + +typedef union int_mv { + uint32_t as_int; + MV as_mv; +} int_mv; /* facilitates faster equality tests and copies */ + +struct mv32 { + int32_t row; + int32_t col; +}; + +typedef union int_mv32 { + uint64_t as_int; + struct mv32 as_mv; +} int_mv32; /* facilitates faster equality tests and copies */ + +#endif // VP9_COMMON_VP9_MV_H_ diff --git a/libvpx/vp9/common/vp9_mvref_common.c b/libvpx/vp9/common/vp9_mvref_common.c new file mode 100644 index 000000000..78fb2f022 --- /dev/null +++ b/libvpx/vp9/common/vp9_mvref_common.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "vp9/common/vp9_mvref_common.h" + +#define MVREF_NEIGHBOURS 8 +static int mv_ref_blocks[BLOCK_SIZE_TYPES][MVREF_NEIGHBOURS][2] = { + // SB4X4 + {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, + // SB4X8 + {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, + // SB8X4 + {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, + // SB8X8 + {{0, -1}, {-1, 0}, {-1, -1}, {0, -2}, {-2, 0}, {-1, -2}, {-2, -1}, {-2, -2}}, + // SB8X16 + {{-1, 0}, {0, -1}, {-1, 1}, {-1, -1}, {-2, 0}, {0, -2}, {-1, -2}, {-2, -1}}, + // SB16X8 + {{0, -1}, {-1, 0}, {1, -1}, {-1, -1}, {0, -2}, {-2, 0}, {-2, -1}, {-1, -2}}, + // SB16X16 + {{0, -1}, {-1, 0}, {1, -1}, {-1, 1}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}}, + // SB16X32 + {{-1, 0}, {0, -1}, {-1, 2}, {-1, -1}, {1, -1}, {-3, 0}, {0, -3}, {-3, -3}}, + // SB32X16 + {{0, -1}, {-1, 0}, {2, -1}, {-1, -1}, {-1, 1}, {0, -3}, {-3, 0}, {-3, -3}}, + // SB32X32 + {{1, -1}, {-1, 1}, {2, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {-3, -3}}, + // SB32X64 + {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}}, + // SB64X32 + {{0, -1}, {-1, 0}, {4, -1}, {-1, 2}, {-1, -1}, {0, -3}, {-3, 0}, {2, -1}}, + // SB64X64 + {{3, -1}, {-1, 3}, {4, -1}, {-1, 4}, {-1, -1}, {0, -1}, {-1, 0}, {6, -1}} +}; +// clamp_mv_ref +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units + +static void clamp_mv_ref(const MACROBLOCKD *xd, int_mv *mv) { + mv->as_mv.col = clamp(mv->as_mv.col, xd->mb_to_left_edge - MV_BORDER, + xd->mb_to_right_edge + MV_BORDER); + mv->as_mv.row = clamp(mv->as_mv.row, xd->mb_to_top_edge - MV_BORDER, + xd->mb_to_bottom_edge + MV_BORDER); +} + +// Gets a candidate reference motion vector from the given mode info +// structure if one exists that matches the given reference frame. 
+static int get_matching_candidate(const MODE_INFO *candidate_mi, + MV_REFERENCE_FRAME ref_frame, + int_mv *c_mv, int block_idx) { + if (ref_frame == candidate_mi->mbmi.ref_frame[0]) { + if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) + c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[0].as_int; + else + c_mv->as_int = candidate_mi->mbmi.mv[0].as_int; + } else if (ref_frame == candidate_mi->mbmi.ref_frame[1]) { + if (block_idx >= 0 && candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) + c_mv->as_int = candidate_mi->bmi[block_idx].as_mv[1].as_int; + else + c_mv->as_int = candidate_mi->mbmi.mv[1].as_int; + } else { + return 0; + } + + return 1; +} + +// Gets candidate reference motion vector(s) from the given mode info +// structure if they exists and do NOT match the given reference frame. +static void get_non_matching_candidates(const MODE_INFO *candidate_mi, + MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME *c_ref_frame, + int_mv *c_mv, + MV_REFERENCE_FRAME *c2_ref_frame, + int_mv *c2_mv) { + + c_mv->as_int = 0; + c2_mv->as_int = 0; + *c_ref_frame = INTRA_FRAME; + *c2_ref_frame = INTRA_FRAME; + + // If first candidate not valid neither will be. + if (candidate_mi->mbmi.ref_frame[0] > INTRA_FRAME) { + // First candidate + if (candidate_mi->mbmi.ref_frame[0] != ref_frame) { + *c_ref_frame = candidate_mi->mbmi.ref_frame[0]; + c_mv->as_int = candidate_mi->mbmi.mv[0].as_int; + } + + // Second candidate + if ((candidate_mi->mbmi.ref_frame[1] > INTRA_FRAME) && + (candidate_mi->mbmi.ref_frame[1] != ref_frame) && + (candidate_mi->mbmi.mv[1].as_int != candidate_mi->mbmi.mv[0].as_int)) { + *c2_ref_frame = candidate_mi->mbmi.ref_frame[1]; + c2_mv->as_int = candidate_mi->mbmi.mv[1].as_int; + } + } +} + + +// Performs mv sign inversion if indicated by the reference frame combination. 
static void scale_mv(MACROBLOCKD *xd, MV_REFERENCE_FRAME this_ref_frame,
                     MV_REFERENCE_FRAME candidate_ref_frame,
                     int_mv *candidate_mv, int *ref_sign_bias) {
  // NOTE(review): despite the name no magnitude scaling is done here, only
  // sign inversion; xd is unused.

  // Sign inversion where appropriate.
  if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
    candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
    candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
  }
}

// Add a candidate mv to a list holding at most two entries.
// Discard if it has already been seen (i.e. duplicates the first entry).
static void add_candidate_mv(int_mv *mv_list, int *mv_scores,
                             int *candidate_count, int_mv candidate_mv,
                             int weight) {
  if (*candidate_count == 0) {
    mv_list[0].as_int = candidate_mv.as_int;
    mv_scores[0] = weight;
    *candidate_count += 1;
  } else if ((*candidate_count == 1) &&
             (candidate_mv.as_int != mv_list[0].as_int)) {
    mv_list[1].as_int = candidate_mv.as_int;
    mv_scores[1] = weight;
    *candidate_count += 1;
  }
}

// This function searches the neighbourhood of a given MB/SB
// to try and find candidate reference vectors.
//
// Scan order: (1) the two nearest spatial neighbours using ref_frame
// (weight 16), then more distant neighbours, then the co-located block in
// the last frame (lf_here); (2) if fewer than MAX_MV_REF_CANDIDATES were
// found, the same positions are re-scanned for mvs using a *different*
// reference frame (weight 1), sign-corrected via scale_mv().
// Also sets mbmi->mb_mode_context[ref_frame] from how the two nearest
// neighbours were coded.  block_idx >= 0 selects a sub-8x8 block; -1 means
// the whole block.
void vp9_find_mv_refs_idx(VP9_COMMON *cm, MACROBLOCKD *xd, MODE_INFO *here,
                          MODE_INFO *lf_here, MV_REFERENCE_FRAME ref_frame,
                          int_mv *mv_ref_list, int *ref_sign_bias,
                          int block_idx) {
  int i;
  MODE_INFO *candidate_mi;
  MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
  int_mv c_refmv;
  int_mv c2_refmv;
  MV_REFERENCE_FRAME c_ref_frame;
  MV_REFERENCE_FRAME c2_ref_frame;
  // NOTE(review): candidate_scores and split_count are written below but
  // never read in this function.
  int candidate_scores[MAX_MV_REF_CANDIDATES];
  int refmv_count = 0;
  int split_count = 0;
  int (*mv_ref_search)[2];
  const int mi_col = get_mi_col(xd);
  const int mi_row = get_mi_row(xd);
  int intra_count = 0;
  int zero_count = 0;
  int newmv_count = 0;
  int x_idx = 0, y_idx = 0;

  // Blank the reference vector lists and other local structures.
  vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REF_CANDIDATES);
  vpx_memset(candidate_scores, 0, sizeof(candidate_scores));

  mv_ref_search = mv_ref_blocks[mbmi->sb_type];
  if (mbmi->sb_type < BLOCK_SIZE_SB8X8) {
    // Position of the sub-8x8 block within the 2x2 grid.
    x_idx = block_idx & 1;
    y_idx = block_idx >> 1;
  }

  // We first scan for candidate vectors that match the current reference
  // frame.  Look at nearest neighbours.
  for (i = 0; i < 2; ++i) {
    const int mi_search_col = mi_col + mv_ref_search[i][0];
    const int mi_search_row = mi_row + mv_ref_search[i][1];
    // Candidate must lie inside the current tile (cols) and frame (rows).
    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
        (mi_search_col < cm->cur_tile_mi_col_end) &&
        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
      int b;

      candidate_mi = here + mv_ref_search[i][0] +
                     (mv_ref_search[i][1] * xd->mode_info_stride);

      // For a sub-8x8 search, pick the neighbour's sub-block adjacent to us:
      // horizontal neighbour -> right column, vertical neighbour -> bottom row.
      if (block_idx >= 0) {
        if (mv_ref_search[i][0])
          b = 1 + y_idx * 2;
        else
          b = 2 + x_idx;
      } else {
        b = -1;
      }
      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, b)) {
        add_candidate_mv(mv_ref_list, candidate_scores,
                         &refmv_count, c_refmv, 16);
      }
      split_count += (candidate_mi->mbmi.sb_type < BLOCK_SIZE_SB8X8 &&
                      candidate_mi->mbmi.ref_frame[0] != INTRA_FRAME);

      // Count number of neighbours coded intra and zeromv.
      intra_count += (candidate_mi->mbmi.mode < NEARESTMV);
      zero_count += (candidate_mi->mbmi.mode == ZEROMV);
      newmv_count += (candidate_mi->mbmi.mode >= NEWMV);
    }
  }

  // More distant neighbours.
  for (i = 2; (i < MVREF_NEIGHBOURS) &&
              (refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
    const int mi_search_col = mi_col + mv_ref_search[i][0];
    const int mi_search_row = mi_row + mv_ref_search[i][1];
    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
        (mi_search_col < cm->cur_tile_mi_col_end) &&
        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
      candidate_mi = here + mv_ref_search[i][0] +
                     (mv_ref_search[i][1] * xd->mode_info_stride);

      if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
        add_candidate_mv(mv_ref_list, candidate_scores,
                         &refmv_count, c_refmv, 16);
      }
    }
  }

  // Look in the last frame if it exists.
  if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
    candidate_mi = lf_here;
    if (get_matching_candidate(candidate_mi, ref_frame, &c_refmv, -1)) {
      add_candidate_mv(mv_ref_list, candidate_scores,
                       &refmv_count, c_refmv, 16);
    }
  }

  // If we have not found enough candidates consider ones where the
  // reference frame does not match. Break out when we have
  // MAX_MV_REF_CANDIDATES candidates.
  // Look first at spatial neighbours.
  for (i = 0; (i < MVREF_NEIGHBOURS) &&
              (refmv_count < MAX_MV_REF_CANDIDATES); ++i) {
    const int mi_search_col = mi_col + mv_ref_search[i][0];
    const int mi_search_row = mi_row + mv_ref_search[i][1];
    if ((mi_search_col >= cm->cur_tile_mi_col_start) &&
        (mi_search_col < cm->cur_tile_mi_col_end) &&
        (mi_search_row >= 0) && (mi_search_row < cm->mi_rows)) {
      candidate_mi = here + mv_ref_search[i][0] +
                     (mv_ref_search[i][1] * xd->mode_info_stride);

      get_non_matching_candidates(candidate_mi, ref_frame,
                                  &c_ref_frame, &c_refmv,
                                  &c2_ref_frame, &c2_refmv);

      if (c_ref_frame != INTRA_FRAME) {
        scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
        add_candidate_mv(mv_ref_list, candidate_scores,
                         &refmv_count, c_refmv, 1);
      }

      if (c2_ref_frame != INTRA_FRAME) {
        scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
        add_candidate_mv(mv_ref_list, candidate_scores,
                         &refmv_count, c2_refmv, 1);
      }
    }
  }

  // Look at the last frame if it exists.
  if (lf_here && (refmv_count < MAX_MV_REF_CANDIDATES)) {
    candidate_mi = lf_here;
    get_non_matching_candidates(candidate_mi, ref_frame,
                                &c_ref_frame, &c_refmv,
                                &c2_ref_frame, &c2_refmv);

    if (c_ref_frame != INTRA_FRAME) {
      scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias);
      add_candidate_mv(mv_ref_list, candidate_scores,
                       &refmv_count, c_refmv, 1);
    }

    if (c2_ref_frame != INTRA_FRAME) {
      scale_mv(xd, ref_frame, c2_ref_frame, &c2_refmv, ref_sign_bias);
      add_candidate_mv(mv_ref_list, candidate_scores,
                       &refmv_count, c2_refmv, 1);
    }
  }

  // Derive the mode context from how the two nearest neighbours were coded.
  if (!intra_count) {
    if (!newmv_count) {
      // 0 = both zero mv
      // 1 = one zero mv + one predicted mv
      // 2 = two predicted mvs
      mbmi->mb_mode_context[ref_frame] = 2 - zero_count;
    } else {
      // 3 = one predicted/zero and one new mv
      // 4 = two new mvs
      mbmi->mb_mode_context[ref_frame] = 2 + newmv_count;
    }
  } else {
    // 5 = one intra neighbour + x
    // 6 = two intra neighbours
    mbmi->mb_mode_context[ref_frame] = 4 + intra_count;
  }

  // Clamp vectors.
  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
    clamp_mv_ref(xd, &mv_ref_list[i]);
  }
}
diff --git a/libvpx/vp9/common/vp9_mvref_common.h b/libvpx/vp9/common/vp9_mvref_common.h
new file mode 100644
index 000000000..7290f10ab
--- /dev/null
+++ b/libvpx/vp9/common/vp9_mvref_common.h
@@ -0,0 +1,37 @@
/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_blockd.h" + +#ifndef VP9_COMMON_VP9_MVREF_COMMON_H_ +#define VP9_COMMON_VP9_MVREF_COMMON_H_ + +void vp9_find_mv_refs_idx(VP9_COMMON *cm, + MACROBLOCKD *xd, + MODE_INFO *here, + MODE_INFO *lf_here, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int *ref_sign_bias, + int block_idx); + +static INLINE void vp9_find_mv_refs(VP9_COMMON *cm, + MACROBLOCKD *xd, + MODE_INFO *here, + MODE_INFO *lf_here, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int *ref_sign_bias) { + vp9_find_mv_refs_idx(cm, xd, here, lf_here, ref_frame, + mv_ref_list, ref_sign_bias, -1); +} + +#endif // VP9_COMMON_VP9_MVREF_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_onyx.h b/libvpx/vp9/common/vp9_onyx.h new file mode 100644 index 000000000..b85b88968 --- /dev/null +++ b/libvpx/vp9/common/vp9_onyx.h @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_ONYX_H_ +#define VP9_COMMON_VP9_ONYX_H_ + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "./vpx_config.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vp8cx.h" +#include "vpx_scale/yv12config.h" +#include "vp9/common/vp9_ppflags.h" + +#define MAX_MB_SEGMENTS 8 + + typedef int *VP9_PTR; + + /* Create/destroy static data structures. 
*/ + + typedef enum { + NORMAL = 0, + FOURFIVE = 1, + THREEFIVE = 2, + ONETWO = 3 + + } VPX_SCALING; + + typedef enum { + VP9_LAST_FLAG = 1, + VP9_GOLD_FLAG = 2, + VP9_ALT_FLAG = 4 + } VP9_REFFRAME; + + + typedef enum { + USAGE_STREAM_FROM_SERVER = 0x0, + USAGE_LOCAL_FILE_PLAYBACK = 0x1, + USAGE_CONSTRAINED_QUALITY = 0x2 + } END_USAGE; + + + typedef enum { + MODE_GOODQUALITY = 0x1, + MODE_BESTQUALITY = 0x2, + MODE_FIRSTPASS = 0x3, + MODE_SECONDPASS = 0x4, + MODE_SECONDPASS_BEST = 0x5, + } MODE; + + typedef enum { + FRAMEFLAGS_KEY = 1, + FRAMEFLAGS_GOLDEN = 2, + FRAMEFLAGS_ALTREF = 4, + } FRAMETYPE_FLAGS; + + +#include <assert.h> + static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { + switch (mode) { + case NORMAL: + *hr = 1; + *hs = 1; + break; + case FOURFIVE: + *hr = 4; + *hs = 5; + break; + case THREEFIVE: + *hr = 3; + *hs = 5; + break; + case ONETWO: + *hr = 1; + *hs = 2; + break; + default: + *hr = 1; + *hs = 1; + assert(0); + break; + } + } + + typedef struct { + int version; // 4 versions of bitstream defined: + // 0 - best quality/slowest decode, + // 3 - lowest quality/fastest decode + int width; // width of data passed to the compressor + int height; // height of data passed to the compressor + double frame_rate; // set to passed in framerate + int64_t target_bandwidth; // bandwidth to be used in kilobits per second + + int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0 + int Sharpness; // parameter used for sharpening output: recommendation 0: + int cpu_used; + unsigned int rc_max_intra_bitrate_pct; + + // mode -> + // (0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing + // a television signal or feed from a live camera). ( speed setting controls how fast ) + // (1)=Good Quality Fast Encoding. The encoder balances quality with the amount of time it takes to + // encode the output. ( speed setting controls how fast ) + // (2)=One Pass - Best Quality. 
The encoder places priority on the quality of the output over encoding + // speed. The output is compressed at the highest possible quality. This option takes the longest + // amount of time to encode. ( speed setting ignored ) + // (3)=Two Pass - First Pass. The encoder generates a file of statistics for use in the second encoding + // pass. ( speed setting controls how fast ) + // (4)=Two Pass - Second Pass. The encoder uses the statistics that were generated in the first encoding + // pass to create the compressed output. ( speed setting controls how fast ) + // (5)=Two Pass - Second Pass Best. The encoder uses the statistics that were generated in the first + // encoding pass to create the compressed output using the highest possible quality, and taking a + // longer amount of time to encode.. ( speed setting ignored ) + int Mode; // + + // Key Framing Operations + int auto_key; // automatically detect cut scenes and set the keyframes + int key_freq; // maximum distance to key frame. + + int allow_lag; // allow lagged compression (if 0 lagin frames is ignored) + int lag_in_frames; // how many frames lag before we start encoding + + // ---------------------------------------------------------------- + // DATARATE CONTROL OPTIONS + + int end_usage; // vbr or cbr + + // buffer targeting aggressiveness + int under_shoot_pct; + int over_shoot_pct; + + // buffering parameters + int64_t starting_buffer_level; // in seconds + int64_t optimal_buffer_level; + int64_t maximum_buffer_size; + + // controlling quality + int fixed_q; + int worst_allowed_q; + int best_allowed_q; + int cq_level; + int lossless; + + // two pass datarate control + int two_pass_vbrbias; // two pass datarate control tweaks + int two_pass_vbrmin_section; + int two_pass_vbrmax_section; + // END DATARATE CONTROL OPTIONS + // ---------------------------------------------------------------- + + + // these parameters aren't to be used in final build don't use!!! 
+ int play_alternate; + int alt_freq; + + int encode_breakout; // early breakout encode threshold : for video conf recommend 800 + + /* Bitfield defining the error resiliency features to enable. + * Can provide decodable frames after losses in previous + * frames and decodable partitions after losses in the same frame. + */ + unsigned int error_resilient_mode; + + /* Bitfield defining the parallel decoding mode where the + * decoding in successive frames may be conducted in parallel + * just by decoding the frame headers. + */ + unsigned int frame_parallel_decoding_mode; + + int arnr_max_frames; + int arnr_strength; + int arnr_type; + + int tile_columns; + int tile_rows; + + struct vpx_fixed_buf two_pass_stats_in; + struct vpx_codec_pkt_list *output_pkt_list; + + vp8e_tuning tuning; + } VP9_CONFIG; + + + void vp9_initialize_enc(); + + VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf); + void vp9_remove_compressor(VP9_PTR *comp); + + void vp9_change_config(VP9_PTR onyx, VP9_CONFIG *oxcf); + +// receive a frames worth of data caller can assume that a copy of this frame is made +// and not just a copy of the pointer.. 
+ int vp9_receive_raw_frame(VP9_PTR comp, unsigned int frame_flags, + YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + int64_t end_time_stamp); + + int vp9_get_compressed_data(VP9_PTR comp, unsigned int *frame_flags, + unsigned long *size, unsigned char *dest, + int64_t *time_stamp, int64_t *time_end, + int flush); + + int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, + vp9_ppflags_t *flags); + + int vp9_use_as_reference(VP9_PTR comp, int ref_frame_flags); + + int vp9_update_reference(VP9_PTR comp, int ref_frame_flags); + + int vp9_copy_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + + int vp9_get_reference_enc(VP9_PTR ptr, int index, YV12_BUFFER_CONFIG **fb); + + int vp9_set_reference_enc(VP9_PTR comp, VP9_REFFRAME ref_frame_flag, + YV12_BUFFER_CONFIG *sd); + + int vp9_update_entropy(VP9_PTR comp, int update); + + int vp9_set_roimap(VP9_PTR comp, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[MAX_MB_SEGMENTS], + int delta_lf[MAX_MB_SEGMENTS], + unsigned int threshold[MAX_MB_SEGMENTS]); + + int vp9_set_active_map(VP9_PTR comp, unsigned char *map, + unsigned int rows, unsigned int cols); + + int vp9_set_internal_size(VP9_PTR comp, + VPX_SCALING horiz_mode, VPX_SCALING vert_mode); + + int vp9_get_quantizer(VP9_PTR c); + +#ifdef __cplusplus +} +#endif + +#endif // VP9_COMMON_VP9_ONYX_H_ diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h new file mode 100644 index 000000000..0d8b0f445 --- /dev/null +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_ONYXC_INT_H_ +#define VP9_COMMON_VP9_ONYXC_INT_H_ + +#include "vpx_config.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vp9_rtcd.h" +#include "vp9/common/vp9_loopfilter.h" +#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_quant_common.h" + +#if CONFIG_POSTPROC +#include "vp9/common/vp9_postproc.h" +#endif + +/* Create/destroy static data structures. */ + +// Define the number of candidate reference buffers. +#define NUM_REF_FRAMES 8 +#define NUM_REF_FRAMES_LG2 3 + +#define ALLOWED_REFS_PER_FRAME 3 + +// 1 scratch frame for the new frame, 3 for scaled references on the encoder +// TODO(jkoleszar): These 3 extra references could probably come from the +// normal reference pool. +#define NUM_YV12_BUFFERS (NUM_REF_FRAMES + 4) + +#define NUM_FRAME_CONTEXTS_LG2 2 +#define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LG2) + +#define MAX_LAG_BUFFERS 25 + +typedef struct frame_contexts { + vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1]; + vp9_prob uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; + vp9_prob partition_prob[NUM_FRAME_TYPES][NUM_PARTITION_CONTEXTS] + [PARTITION_TYPES - 1]; + + nmv_context nmvc; + nmv_context pre_nmvc; + /* interframe intra mode probs */ + vp9_prob pre_y_mode_prob[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES - 1]; + vp9_prob pre_uv_mode_prob[VP9_INTRA_MODES][VP9_INTRA_MODES - 1]; + vp9_prob pre_partition_prob[NUM_PARTITION_CONTEXTS][PARTITION_TYPES - 1]; + /* interframe intra mode probs */ + unsigned int y_mode_counts[BLOCK_SIZE_GROUPS][VP9_INTRA_MODES]; + unsigned int uv_mode_counts[VP9_INTRA_MODES][VP9_INTRA_MODES]; + unsigned int partition_counts[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; + + vp9_coeff_probs_model coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES]; + vp9_coeff_probs_model pre_coef_probs[TX_SIZE_MAX_SB][BLOCK_TYPES]; + vp9_coeff_count_model coef_counts[TX_SIZE_MAX_SB][BLOCK_TYPES]; + unsigned int 
eob_branch_counts[TX_SIZE_MAX_SB][BLOCK_TYPES][REF_TYPES] + [COEF_BANDS][PREV_COEF_CONTEXTS]; + + nmv_context_counts NMVcount; + vp9_prob switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] + [VP9_SWITCHABLE_FILTERS - 1]; + vp9_prob pre_switchable_interp_prob[VP9_SWITCHABLE_FILTERS + 1] + [VP9_SWITCHABLE_FILTERS - 1]; + unsigned int switchable_interp_count[VP9_SWITCHABLE_FILTERS + 1] + [VP9_SWITCHABLE_FILTERS]; + + vp9_prob inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; + vp9_prob pre_inter_mode_probs[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1]; + unsigned int inter_mode_counts[INTER_MODE_CONTEXTS][VP9_INTER_MODES - 1][2]; + + vp9_prob intra_inter_prob[INTRA_INTER_CONTEXTS]; + vp9_prob comp_inter_prob[COMP_INTER_CONTEXTS]; + vp9_prob single_ref_prob[REF_CONTEXTS][2]; + vp9_prob comp_ref_prob[REF_CONTEXTS]; + vp9_prob pre_intra_inter_prob[INTRA_INTER_CONTEXTS]; + vp9_prob pre_comp_inter_prob[COMP_INTER_CONTEXTS]; + vp9_prob pre_single_ref_prob[REF_CONTEXTS][2]; + vp9_prob pre_comp_ref_prob[REF_CONTEXTS]; + unsigned int intra_inter_count[INTRA_INTER_CONTEXTS][2]; + unsigned int comp_inter_count[COMP_INTER_CONTEXTS][2]; + unsigned int single_ref_count[REF_CONTEXTS][2][2]; + unsigned int comp_ref_count[REF_CONTEXTS][2]; + + vp9_prob tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; + vp9_prob tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; + vp9_prob tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; + vp9_prob pre_tx_probs_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; + vp9_prob pre_tx_probs_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; + vp9_prob pre_tx_probs_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 3]; + unsigned int tx_count_32x32p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB]; + unsigned int tx_count_16x16p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 1]; + unsigned int tx_count_8x8p[TX_SIZE_CONTEXTS][TX_SIZE_MAX_SB - 2]; + + vp9_prob mbskip_probs[MBSKIP_CONTEXTS]; + vp9_prob pre_mbskip_probs[MBSKIP_CONTEXTS]; + unsigned int mbskip_count[MBSKIP_CONTEXTS][2]; +} 
FRAME_CONTEXT; + +typedef enum { + SINGLE_PREDICTION_ONLY = 0, + COMP_PREDICTION_ONLY = 1, + HYBRID_PREDICTION = 2, + NB_PREDICTION_TYPES = 3, +} COMPPREDMODE_TYPE; + +typedef enum { + ONLY_4X4 = 0, + ALLOW_8X8 = 1, + ALLOW_16X16 = 2, + ALLOW_32X32 = 3, + TX_MODE_SELECT = 4, + NB_TXFM_MODES = 5, +} TXFM_MODE; + +typedef struct VP9Common { + struct vpx_internal_error_info error; + + DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][2]); + DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][2]); +#if CONFIG_ALPHA + DECLARE_ALIGNED(16, int16_t, a_dequant[QINDEX_RANGE][2]); +#endif + + int width; + int height; + int display_width; + int display_height; + int last_width; + int last_height; + + // TODO(jkoleszar): this implies chroma ss right now, but could vary per + // plane. Revisit as part of the future change to YV12_BUFFER_CONFIG to + // support additional planes. + int subsampling_x; + int subsampling_y; + + YUV_TYPE clr_type; + + YV12_BUFFER_CONFIG *frame_to_show; + + YV12_BUFFER_CONFIG yv12_fb[NUM_YV12_BUFFERS]; + int fb_idx_ref_cnt[NUM_YV12_BUFFERS]; /* reference counts */ + int ref_frame_map[NUM_REF_FRAMES]; /* maps fb_idx to reference slot */ + + // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and + // roll new_fb_idx into it. + + // Each frame can reference ALLOWED_REFS_PER_FRAME buffers + int active_ref_idx[ALLOWED_REFS_PER_FRAME]; + struct scale_factors active_ref_scale[ALLOWED_REFS_PER_FRAME]; + int new_fb_idx; + + + YV12_BUFFER_CONFIG post_proc_buffer; + YV12_BUFFER_CONFIG temp_scale_frame; + + + FRAME_TYPE last_frame_type; /* Save last frame's frame type for motion search. */ + FRAME_TYPE frame_type; + + int show_frame; + int last_show_frame; + + // Flag signaling that the frame is encoded using only INTRA modes. + int intra_only; + + // Flag signaling that the frame context should be reset to default values. + // 0 or 1 implies don't reset, 2 reset just the context specified in the + // frame header, 3 reset all contexts. 
+ int reset_frame_context; + + int frame_flags; + // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in + // MODE_INFO (8-pixel) units. + int MBs; + int mb_rows, mi_rows; + int mb_cols, mi_cols; + int mode_info_stride; + + /* profile settings */ + TXFM_MODE txfm_mode; + + int base_qindex; + int last_kf_gf_q; /* Q used on the last GF or KF */ + + int y_dc_delta_q; + int uv_dc_delta_q; + int uv_ac_delta_q; +#if CONFIG_ALPHA + int a_dc_delta_q; + int a_ac_delta_q; +#endif + + unsigned int frames_since_golden; + unsigned int frames_till_alt_ref_frame; + + /* We allocate a MODE_INFO struct for each macroblock, together with + an extra row on top and column on the left to simplify prediction. */ + + MODE_INFO *mip; /* Base of allocated array */ + MODE_INFO *mi; /* Corresponds to upper left visible macroblock */ + MODE_INFO *prev_mip; /* MODE_INFO array 'mip' from last decoded frame */ + MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */ + + + // Persistent mb segment id map used in prediction. 
+ unsigned char *last_frame_seg_map; + + INTERPOLATIONFILTERTYPE mcomp_filter_type; + + loop_filter_info_n lf_info; + + int filter_level; + int last_sharpness_level; + int sharpness_level; + + int refresh_frame_context; /* Two state 0 = NO, 1 = YES */ + + int ref_frame_sign_bias[MAX_REF_FRAMES]; /* Two state 0, 1 */ + + /* Y,U,V */ + ENTROPY_CONTEXT *above_context[MAX_MB_PLANE]; + ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16]; + + // partition contexts + PARTITION_CONTEXT *above_seg_context; + PARTITION_CONTEXT left_seg_context[8]; + + /* keyframe block modes are predicted by their above, left neighbors */ + + vp9_prob kf_y_mode_prob[VP9_INTRA_MODES] + [VP9_INTRA_MODES] + [VP9_INTRA_MODES - 1]; + vp9_prob kf_uv_mode_prob[VP9_INTRA_MODES] [VP9_INTRA_MODES - 1]; + + // Context probabilities when using predictive coding of segment id + vp9_prob segment_pred_probs[PREDICTION_PROBS]; + unsigned char temporal_update; + + // Context probabilities for reference frame prediction + int allow_comp_inter_inter; + MV_REFERENCE_FRAME comp_fixed_ref; + MV_REFERENCE_FRAME comp_var_ref[2]; + COMPPREDMODE_TYPE comp_pred_mode; + + FRAME_CONTEXT fc; /* this frame entropy */ + FRAME_CONTEXT frame_contexts[NUM_FRAME_CONTEXTS]; + unsigned int frame_context_idx; /* Context to use/update */ + + unsigned int current_video_frame; + int near_boffset[3]; + int version; + + double bitrate; + double framerate; + +#if CONFIG_POSTPROC + struct postproc_state postproc_state; +#endif + + int error_resilient_mode; + int frame_parallel_decoding_mode; + + int tile_columns, log2_tile_columns; + int cur_tile_mi_col_start, cur_tile_mi_col_end, cur_tile_col_idx; + int tile_rows, log2_tile_rows; + int cur_tile_mi_row_start, cur_tile_mi_row_end, cur_tile_row_idx; +} VP9_COMMON; + +static int get_free_fb(VP9_COMMON *cm) { + int i; + for (i = 0; i < NUM_YV12_BUFFERS; i++) + if (cm->fb_idx_ref_cnt[i] == 0) + break; + + assert(i < NUM_YV12_BUFFERS); + cm->fb_idx_ref_cnt[i] = 1; + return i; +} + +static void 
ref_cnt_fb(int *buf, int *idx, int new_idx) { + if (buf[*idx] > 0) + buf[*idx]--; + + *idx = new_idx; + + buf[new_idx]++; +} + +static int mi_cols_aligned_to_sb(VP9_COMMON *cm) { + return 2 * ((cm->mb_cols + 3) & ~3); +} + +static INLINE void set_partition_seg_context(VP9_COMMON *cm, + MACROBLOCKD *xd, + int mi_row, int mi_col) { + xd->above_seg_context = cm->above_seg_context + mi_col; + xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); +} + +static int check_bsize_coverage(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, + BLOCK_SIZE_TYPE bsize) { + int bsl = mi_width_log2(bsize), bs = 1 << bsl; + int ms = bs / 2; + + if ((mi_row + ms < cm->mi_rows) && (mi_col + ms < cm->mi_cols)) + return 0; + + // frame width/height are multiples of 8, hence 8x8 block should always + // pass the above check + assert(bsize > BLOCK_SIZE_SB8X8); + + // return the node index in the prob tree for binary coding + // only allow horizontal/split partition types + if ((mi_col + ms < cm->mi_cols) && (mi_row + ms >= cm->mi_rows)) + return 1; + // only allow vertical/split partition types + if ((mi_row + ms < cm->mi_rows) && (mi_col + ms >= cm->mi_cols)) + return 2; + + return -1; +} + +static void set_mi_row_col(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int bh, + int mi_col, int bw) { + xd->mb_to_top_edge = -((mi_row * MI_SIZE) << 3); + xd->mb_to_bottom_edge = ((cm->mi_rows - bh - mi_row) * MI_SIZE) << 3; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) << 3); + xd->mb_to_right_edge = ((cm->mi_cols - bw - mi_col) * MI_SIZE) << 3; + + // Are edges available for intra prediction? 
+ xd->up_available = (mi_row != 0); + xd->left_available = (mi_col > cm->cur_tile_mi_col_start); + xd->right_available = (mi_col + bw < cm->cur_tile_mi_col_end); +} + +static int get_mi_row(const MACROBLOCKD *xd) { + return ((-xd->mb_to_top_edge) >> (3 + LOG2_MI_SIZE)); +} + +static int get_mi_col(const MACROBLOCKD *xd) { + return ((-xd->mb_to_left_edge) >> (3 + LOG2_MI_SIZE)); +} + +static int get_token_alloc(int mb_rows, int mb_cols) { + return mb_rows * mb_cols * (48 * 16 + 4); +} + +static void set_prev_mi(VP9_COMMON *cm) { + const int use_prev_in_find_mv_refs = cm->width == cm->last_width && + cm->height == cm->last_height && + !cm->error_resilient_mode && + !cm->intra_only && + cm->last_show_frame; + // Special case: set prev_mi to NULL when the previous mode info + // context cannot be used. + cm->prev_mi = use_prev_in_find_mv_refs ? + cm->prev_mip + cm->mode_info_stride + 1 : NULL; +} +#endif // VP9_COMMON_VP9_ONYXC_INT_H_ diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c new file mode 100644 index 000000000..4282ddd1c --- /dev/null +++ b/libvpx/vp9/common/vp9_postproc.c @@ -0,0 +1,1017 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#include "./vpx_config.h" +#include "vpx_scale/yv12config.h" +#include "vp9/common/vp9_postproc.h" +#include "vp9/common/vp9_textblit.h" +#include "vpx_scale/vpx_scale.h" +#include "vp9/common/vp9_systemdependent.h" +#include "./vp9_rtcd.h" +#include "./vpx_scale_rtcd.h" + + +#include <math.h> +#include <stdlib.h> +#include <stdio.h> + +#define RGB_TO_YUV(t) \ + ( (0.257*(float)(t >> 16)) + (0.504*(float)(t >> 8 & 0xff)) + \ + (0.098*(float)(t & 0xff)) + 16), \ + (-(0.148*(float)(t >> 16)) - (0.291*(float)(t >> 8 & 0xff)) + \ + (0.439*(float)(t & 0xff)) + 128), \ + ( (0.439*(float)(t >> 16)) - (0.368*(float)(t >> 8 & 0xff)) - \ + (0.071*(float)(t & 0xff)) + 128) + +/* global constants */ +#if 0 && CONFIG_POSTPROC_VISUALIZER +static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] = { + { RGB_TO_YUV(0x98FB98) }, /* PaleGreen */ + { RGB_TO_YUV(0x00FF00) }, /* Green */ + { RGB_TO_YUV(0xADFF2F) }, /* GreenYellow */ + { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ + { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ + { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ + { RGB_TO_YUV(0x008F8F) }, /* Dark Cyan */ + { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ + { RGB_TO_YUV(0x8F0000) }, /* Dark Red */ + { RGB_TO_YUV(0x228B22) }, /* ForestGreen */ + { RGB_TO_YUV(0x006400) }, /* DarkGreen */ + { RGB_TO_YUV(0x98F5FF) }, /* Cadet Blue */ + { RGB_TO_YUV(0x6CA6CD) }, /* Sky Blue */ + { RGB_TO_YUV(0x00008B) }, /* Dark blue */ + { RGB_TO_YUV(0x551A8B) }, /* Purple */ + { RGB_TO_YUV(0xFF0000) } /* Red */ + { RGB_TO_YUV(0xCC33FF) }, /* Magenta */ +}; + +static const unsigned char B_PREDICTION_MODE_colors[VP9_INTRA_MODES][3] = { + { RGB_TO_YUV(0x6633ff) }, /* Purple */ + { RGB_TO_YUV(0xcc33ff) }, /* Magenta */ + { RGB_TO_YUV(0xff33cc) }, /* Pink */ + { RGB_TO_YUV(0xff3366) }, /* Coral */ + { RGB_TO_YUV(0x3366ff) }, /* Blue */ + { RGB_TO_YUV(0xed00f5) }, /* Dark Blue */ + { RGB_TO_YUV(0x2e00b8) }, /* Dark Purple */ + { RGB_TO_YUV(0xff6633) }, /* Orange */ + { RGB_TO_YUV(0x33ccff) }, /* 
Light Blue */ + { RGB_TO_YUV(0x8ab800) }, /* Green */ + { RGB_TO_YUV(0xffcc33) }, /* Light Orange */ + { RGB_TO_YUV(0x33ffcc) }, /* Aqua */ + { RGB_TO_YUV(0x66ff33) }, /* Light Green */ + { RGB_TO_YUV(0xccff33) }, /* Yellow */ +}; + +static const unsigned char MV_REFERENCE_FRAME_colors[MAX_REF_FRAMES][3] = { + { RGB_TO_YUV(0x00ff00) }, /* Blue */ + { RGB_TO_YUV(0x0000ff) }, /* Green */ + { RGB_TO_YUV(0xffff00) }, /* Yellow */ + { RGB_TO_YUV(0xff0000) }, /* Red */ +}; +#endif + +static const short kernel5[] = { + 1, 1, 4, 1, 1 +}; + +const short vp9_rv[] = { + 8, 5, 2, 2, 8, 12, 4, 9, 8, 3, + 0, 3, 9, 0, 0, 0, 8, 3, 14, 4, + 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, + 8, 6, 10, 0, 0, 8, 9, 0, 3, 14, + 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, + 1, 2, 3, 14, 13, 1, 8, 2, 9, 7, + 3, 3, 1, 13, 13, 6, 6, 5, 2, 7, + 11, 9, 11, 8, 7, 3, 2, 0, 13, 13, + 14, 4, 12, 5, 12, 10, 8, 10, 13, 10, + 4, 14, 4, 10, 0, 8, 11, 1, 13, 7, + 7, 14, 6, 14, 13, 2, 13, 5, 4, 4, + 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, + 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, + 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, + 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, + 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, + 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, + 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, + 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, + 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, + 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, + 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, + 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, + 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, + 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, + 0, 10, 0, 5, 13, 2, 12, 7, 11, 13, + 8, 0, 4, 10, 7, 2, 7, 2, 2, 5, + 3, 4, 7, 3, 3, 14, 14, 5, 9, 13, + 3, 14, 3, 6, 3, 0, 11, 8, 13, 1, + 13, 1, 12, 0, 10, 9, 7, 6, 2, 8, + 5, 2, 13, 7, 1, 13, 14, 7, 6, 7, + 9, 6, 10, 11, 7, 8, 7, 5, 14, 8, + 4, 4, 0, 8, 7, 10, 0, 8, 14, 11, + 3, 12, 5, 7, 14, 3, 14, 5, 2, 6, + 11, 12, 12, 8, 0, 11, 13, 1, 2, 0, + 5, 10, 14, 7, 8, 0, 4, 11, 0, 8, + 0, 3, 10, 5, 8, 0, 11, 6, 7, 8, + 10, 7, 13, 9, 2, 5, 1, 5, 10, 2, + 4, 3, 5, 6, 10, 8, 9, 4, 11, 14, + 3, 8, 3, 7, 8, 5, 11, 4, 12, 3, + 11, 9, 14, 8, 14, 13, 4, 3, 1, 
2,
  14, 6, 5, 4, 4, 11, 4, 6, 2, 1,
  5, 8, 8, 12, 13, 5, 14, 10, 12, 13,
  0, 9, 5, 5, 11, 10, 13, 9, 10, 13,
};


/****************************************************************************
 */
// Edge-preserving 5-tap [1 1 4 1 1]/8 smoothing filter, applied first down
// each column and then across each row, in place in dst after the vertical
// pass.  A pixel is left unfiltered (via the goto skip paths) whenever any
// tap differs from the centre pixel by more than flimit.
// NOTE(review): dst_pixels_per_line is explicitly ignored; both passes use
// the source pitch for row stepping.
void vp9_post_proc_down_and_across_c(const uint8_t *src_ptr,
                                     uint8_t *dst_ptr,
                                     int src_pixels_per_line,
                                     int dst_pixels_per_line,
                                     int rows,
                                     int cols,
                                     int flimit) {
  uint8_t const *p_src;
  uint8_t *p_dst;
  int row;
  int col;
  int i;
  int v;
  int pitch = src_pixels_per_line;
  uint8_t d[8];  // 8-entry ring buffer for the in-place horizontal pass.
  (void)dst_pixels_per_line;

  for (row = 0; row < rows; row++) {
    /* post_proc_down for one row */
    p_src = src_ptr;
    p_dst = dst_ptr;

    for (col = 0; col < cols; col++) {

      int kernel = 4;  // Rounding term for the >> 3 below.
      int v = p_src[col];

      for (i = -2; i <= 2; i++) {
        // Any tap too far from the centre: keep the original pixel.
        if (abs(v - p_src[col + i * pitch]) > flimit)
          goto down_skip_convolve;

        kernel += kernel5[2 + i] * p_src[col + i * pitch];
      }

      v = (kernel >> 3);
    down_skip_convolve:
      p_dst[col] = v;
    }

    /* now post_proc_across */
    p_src = dst_ptr;
    p_dst = dst_ptr;

    for (i = 0; i < 8; i++)
      d[i] = p_src[i];

    for (col = 0; col < cols; col++) {
      int kernel = 4;
      v = p_src[col];

      d[col & 7] = v;

      for (i = -2; i <= 2; i++) {
        if (abs(v - p_src[col + i]) > flimit)
          goto across_skip_convolve;

        kernel += kernel5[2 + i] * p_src[col + i];
      }

      d[col & 7] = (kernel >> 3);
    across_skip_convolve:

      // Write back two columns behind, once those taps are no longer needed.
      if (col >= 2)
        p_dst[col - 2] = d[(col - 2) & 7];
    }

    /* handle the last two pixels */
    p_dst[col - 2] = d[(col - 2) & 7];
    p_dst[col - 1] = d[(col - 1) & 7];


    /* next row */
    src_ptr += pitch;
    dst_ptr += pitch;
  }
}

// Maps quantizer value x to the variance threshold used by the mb post-proc
// filters below; x is clamped to a minimum of 20 then expanded quadratically.
static int q2mbl(int x) {
  if (x < 20) x = 20;

  x = 50 + (x - 50) * 10 / 8;
  return x * x / 3;
}

// In-place horizontal smoothing: maintains a running 16-pixel sum and
// sum-of-squares window per row and replaces a pixel with the window mean
// (rounded) when the window variance measure falls below flimit.
// NOTE(review): reads up to 8 pixels beyond the row on both sides — callers
// presumably guarantee border pixels exist; confirm against call sites.
void vp9_mbpost_proc_across_ip_c(uint8_t *src, int pitch,
                                 int rows, int cols, int flimit) {
  int r, c, i;

  uint8_t *s = src;
  uint8_t d[16];  // Ring buffer of original pixels still needed by the window.


  for (r = 0; r < rows; r++) {
    int sumsq = 0;
    int sum = 0;

    for (i = -8; i <= 6; i++) {
      sumsq += s[i] * s[i];
      sum += s[i];
      d[i + 8] = 0;
    }

    for (c = 0; c < cols + 8; c++) {
      // Incremental window update: a^2 - b^2 == (a - b) * (a + b).
      int x = s[c + 7] - s[c - 8];
      int y = s[c + 7] + s[c - 8];

      sum += x;
      sumsq += x * y;

      d[c & 15] = s[c];

      if (sumsq * 15 - sum * sum < flimit) {
        d[c & 15] = (8 + sum + s[c]) >> 4;
      }

      // Write back 8 columns behind, once outside the active window.
      s[c - 8] = d[(c - 8) & 15];
    }

    s += pitch;
  }
}

// Vertical counterpart of the filter above, processed column by column with
// a 16-row running window; the replacement value is dithered with entries
// from the vp9_rv table.
// NOTE(review): rand() makes the dither offset — and hence the output —
// nondeterministic between runs.
void vp9_mbpost_proc_down_c(uint8_t *dst, int pitch,
                            int rows, int cols, int flimit) {
  int r, c, i;
  const short *rv3 = &vp9_rv[63 & rand()];

  for (c = 0; c < cols; c++) {
    uint8_t *s = &dst[c];
    int sumsq = 0;
    int sum = 0;
    uint8_t d[16];
    const short *rv2 = rv3 + ((c * 17) & 127);

    // Prime the window with the 15 rows from -8 to +6.
    for (i = -8; i <= 6; i++) {
      sumsq += s[i * pitch] * s[i * pitch];
      sum += s[i * pitch];
    }

    for (r = 0; r < rows + 8; r++) {
      sumsq += s[7 * pitch] * s[ 7 * pitch] - s[-8 * pitch] * s[-8 * pitch];
      sum += s[7 * pitch] - s[-8 * pitch];
      d[r & 15] = s[0];

      if (sumsq * 15 - sum * sum < flimit) {
        d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4;
      }

      // Write back 8 rows behind, once outside the active window.
      s[-8 * pitch] = d[(r - 8) & 15];
      s += pitch;
    }
  }
}

// Full deblock + de-macro-block pass: derives a per-frame filter level from
// q, then runs the down/across filter on all planes and the mb post-proc
// filters on luma.
static void deblock_and_de_macro_block(YV12_BUFFER_CONFIG *source,
                                       YV12_BUFFER_CONFIG *post,
                                       int q,
                                       int low_var_thresh,
                                       int flag) {
  double level = 6.0e-05 * q * q * q - .0067 * q * q + .306 * q + .0065;
  int ppl = (int)(level + .5);
  (void) low_var_thresh;
  (void) flag;

  vp9_post_proc_down_and_across(source->y_buffer, post->y_buffer,
                                source->y_stride, post->y_stride,
                                source->y_height, source->y_width, ppl);

  vp9_mbpost_proc_across_ip(post->y_buffer, post->y_stride, post->y_height,
                            post->y_width, q2mbl(q));

  vp9_mbpost_proc_down(post->y_buffer, post->y_stride, post->y_height,
                       post->y_width, q2mbl(q));

  vp9_post_proc_down_and_across(source->u_buffer, post->u_buffer,
                                source->uv_stride, post->uv_stride,
                                source->uv_height, source->uv_width, ppl);
  vp9_post_proc_down_and_across(source->v_buffer, post->v_buffer,
                                source->uv_stride, post->uv_stride,
                                source->uv_height, source->uv_width,
ppl); +} + +void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + int q) { + const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q + + 0.0065 + 0.5); + int i; + + const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width, + src->alpha_width}; + const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height, + src->alpha_height}; + + uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, + dst->alpha_buffer}; + const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, + dst->alpha_stride}; + + for (i = 0; i < MAX_MB_PLANE; ++i) + vp9_post_proc_down_and_across(srcs[i], dsts[i], + src_strides[i], dst_strides[i], + src_heights[i], src_widths[i], ppl); +} + +void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, + int q) { + const int ppl = (int)(6.0e-05 * q * q * q - 0.0067 * q * q + 0.306 * q + + 0.0065 + 0.5); + int i; + + const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + const int src_widths[4] = {src->y_width, src->uv_width, src->uv_width, + src->alpha_width}; + const int src_heights[4] = {src->y_height, src->uv_height, src->uv_height, + src->alpha_height}; + + uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer, + dst->alpha_buffer}; + const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride, + dst->alpha_stride}; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int src_stride = src_strides[i]; + const uint8_t *const src = srcs[i] + 2 * src_stride + 2; + const int src_width = src_widths[i] - 4; + const int src_height = src_heights[i] - 4; + + const int dst_stride = 
dst_strides[i]; + uint8_t *const dst = dsts[i] + 2 * dst_stride + 2; + + vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride, + src_height, src_width, ppl); + } +} + +double vp9_gaussian(double sigma, double mu, double x) { + return 1 / (sigma * sqrt(2.0 * 3.14159265)) * + (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma))); +} + +static void fillrd(struct postproc_state *state, int q, int a) { + char char_dist[300]; + + double sigma; + int ai = a, qi = q, i; + + vp9_clear_system_state(); + + sigma = ai + .5 + .6 * (63 - qi) / 63.0; + + /* set up a lookup table of 256 entries that matches + * a gaussian distribution with sigma determined by q. + */ + { + double i; + int next, j; + + next = 0; + + for (i = -32; i < 32; i++) { + int a = (int)(.5 + 256 * vp9_gaussian(sigma, 0, i)); + + if (a) { + for (j = 0; j < a; j++) { + char_dist[next + j] = (char) i; + } + + next = next + j; + } + + } + + for (next = next; next < 256; next++) + char_dist[next] = 0; + } + + for (i = 0; i < 3072; i++) { + state->noise[i] = char_dist[rand() & 0xff]; + } + + for (i = 0; i < 16; i++) { + state->blackclamp[i] = -char_dist[0]; + state->whiteclamp[i] = -char_dist[0]; + state->bothclamp[i] = -2 * char_dist[0]; + } + + state->last_q = q; + state->last_noise = a; +} + +/**************************************************************************** + * + * ROUTINE : plane_add_noise_c + * + * INPUTS : unsigned char *Start starting address of buffer to + * add gaussian noise to + * unsigned int width width of plane + * unsigned int height height of plane + * int pitch distance between subsequent lines of frame + * int q quantizer used to determine amount of noise + * to add + * + * OUTPUTS : None. + * + * RETURNS : void. + * + * FUNCTION : adds gaussian noise to a plane of pixels + * + * SPECIAL NOTES : None. 
+ * + ****************************************************************************/ +void vp9_plane_add_noise_c(uint8_t *start, char *noise, + char blackclamp[16], + char whiteclamp[16], + char bothclamp[16], + unsigned int width, unsigned int height, int pitch) { + unsigned int i, j; + + for (i = 0; i < height; i++) { + uint8_t *pos = start + i * pitch; + char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT + + for (j = 0; j < width; j++) { + if (pos[j] < blackclamp[0]) + pos[j] = blackclamp[0]; + + if (pos[j] > 255 + whiteclamp[0]) + pos[j] = 255 + whiteclamp[0]; + + pos[j] += ref[j]; + } + } +} + +/* Blend the macro block with a solid colored square. Leave the + * edges unblended to give distinction to macro blocks in areas + * filled with the same color block. + */ +void vp9_blend_mb_inner_c(uint8_t *y, uint8_t *u, uint8_t *v, + int y1, int u1, int v1, int alpha, int stride) { + int i, j; + int y1_const = y1 * ((1 << 16) - alpha); + int u1_const = u1 * ((1 << 16) - alpha); + int v1_const = v1 * ((1 << 16) - alpha); + + y += 2 * stride + 2; + for (i = 0; i < 12; i++) { + for (j = 0; j < 12; j++) { + y[j] = (y[j] * alpha + y1_const) >> 16; + } + y += stride; + } + + stride >>= 1; + + u += stride + 1; + v += stride + 1; + + for (i = 0; i < 6; i++) { + for (j = 0; j < 6; j++) { + u[j] = (u[j] * alpha + u1_const) >> 16; + v[j] = (v[j] * alpha + v1_const) >> 16; + } + u += stride; + v += stride; + } +} + +/* Blend only the edge of the macro block. Leave center + * unblended to allow for other visualizations to be layered. 
+ */ +void vp9_blend_mb_outer_c(uint8_t *y, uint8_t *u, uint8_t *v, + int y1, int u1, int v1, int alpha, int stride) { + int i, j; + int y1_const = y1 * ((1 << 16) - alpha); + int u1_const = u1 * ((1 << 16) - alpha); + int v1_const = v1 * ((1 << 16) - alpha); + + for (i = 0; i < 2; i++) { + for (j = 0; j < 16; j++) { + y[j] = (y[j] * alpha + y1_const) >> 16; + } + y += stride; + } + + for (i = 0; i < 12; i++) { + y[0] = (y[0] * alpha + y1_const) >> 16; + y[1] = (y[1] * alpha + y1_const) >> 16; + y[14] = (y[14] * alpha + y1_const) >> 16; + y[15] = (y[15] * alpha + y1_const) >> 16; + y += stride; + } + + for (i = 0; i < 2; i++) { + for (j = 0; j < 16; j++) { + y[j] = (y[j] * alpha + y1_const) >> 16; + } + y += stride; + } + + stride >>= 1; + + for (j = 0; j < 8; j++) { + u[j] = (u[j] * alpha + u1_const) >> 16; + v[j] = (v[j] * alpha + v1_const) >> 16; + } + u += stride; + v += stride; + + for (i = 0; i < 6; i++) { + u[0] = (u[0] * alpha + u1_const) >> 16; + v[0] = (v[0] * alpha + v1_const) >> 16; + + u[7] = (u[7] * alpha + u1_const) >> 16; + v[7] = (v[7] * alpha + v1_const) >> 16; + + u += stride; + v += stride; + } + + for (j = 0; j < 8; j++) { + u[j] = (u[j] * alpha + u1_const) >> 16; + v[j] = (v[j] * alpha + v1_const) >> 16; + } +} + +void vp9_blend_b_c(uint8_t *y, uint8_t *u, uint8_t *v, + int y1, int u1, int v1, int alpha, int stride) { + int i, j; + int y1_const = y1 * ((1 << 16) - alpha); + int u1_const = u1 * ((1 << 16) - alpha); + int v1_const = v1 * ((1 << 16) - alpha); + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + y[j] = (y[j] * alpha + y1_const) >> 16; + } + y += stride; + } + + stride >>= 1; + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + u[j] = (u[j] * alpha + u1_const) >> 16; + v[j] = (v[j] * alpha + v1_const) >> 16; + } + u += stride; + v += stride; + } +} + +static void constrain_line(int x0, int *x1, int y0, int *y1, + int width, int height) { + int dx; + int dy; + + if (*x1 > width) { + dx = *x1 - x0; + dy = *y1 - y0; + + 
*x1 = width; + if (dx) + *y1 = ((width - x0) * dy) / dx + y0; + } + if (*x1 < 0) { + dx = *x1 - x0; + dy = *y1 - y0; + + *x1 = 0; + if (dx) + *y1 = ((0 - x0) * dy) / dx + y0; + } + if (*y1 > height) { + dx = *x1 - x0; + dy = *y1 - y0; + + *y1 = height; + if (dy) + *x1 = ((height - y0) * dx) / dy + x0; + } + if (*y1 < 0) { + dx = *x1 - x0; + dy = *y1 - y0; + + *y1 = 0; + if (dy) + *x1 = ((0 - y0) * dx) / dy + x0; + } +} + +int vp9_post_proc_frame(VP9_COMMON *oci, YV12_BUFFER_CONFIG *dest, + vp9_ppflags_t *ppflags) { + int q = oci->filter_level * 10 / 6; + int flags = ppflags->post_proc_flag; + int deblock_level = ppflags->deblocking_level; + int noise_level = ppflags->noise_level; + + if (!oci->frame_to_show) + return -1; + + if (q > 63) + q = 63; + + if (!flags) { + *dest = *oci->frame_to_show; + return 0; + } + +#if ARCH_X86||ARCH_X86_64 + vpx_reset_mmx_state(); +#endif + + if (flags & VP9D_DEMACROBLOCK) { + deblock_and_de_macro_block(oci->frame_to_show, &oci->post_proc_buffer, + q + (deblock_level - 5) * 10, 1, 0); + } else if (flags & VP9D_DEBLOCK) { + vp9_deblock(oci->frame_to_show, &oci->post_proc_buffer, q); + } else { + vp8_yv12_copy_frame(oci->frame_to_show, &oci->post_proc_buffer); + } + + if (flags & VP9D_ADDNOISE) { + if (oci->postproc_state.last_q != q + || oci->postproc_state.last_noise != noise_level) { + fillrd(&oci->postproc_state, 63 - q, noise_level); + } + + vp9_plane_add_noise(oci->post_proc_buffer.y_buffer, + oci->postproc_state.noise, + oci->postproc_state.blackclamp, + oci->postproc_state.whiteclamp, + oci->postproc_state.bothclamp, + oci->post_proc_buffer.y_width, + oci->post_proc_buffer.y_height, + oci->post_proc_buffer.y_stride); + } + +#if 0 && CONFIG_POSTPROC_VISUALIZER + if (flags & VP9D_DEBUG_TXT_FRAME_INFO) { + char message[512]; + sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d", + (oci->frame_type == KEY_FRAME), + oci->refresh_golden_frame, + oci->base_qindex, + oci->filter_level, + flags, + oci->mb_cols, oci->mb_rows); + 
vp9_blit_text(message, oci->post_proc_buffer.y_buffer, + oci->post_proc_buffer.y_stride); + } + + if (flags & VP9D_DEBUG_TXT_MBLK_MODES) { + int i, j; + uint8_t *y_ptr; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int mb_rows = post->y_height >> 4; + int mb_cols = post->y_width >> 4; + int mb_index = 0; + MODE_INFO *mi = oci->mi; + + y_ptr = post->y_buffer + 4 * post->y_stride + 4; + + /* vp9_filter each macro block */ + for (i = 0; i < mb_rows; i++) { + for (j = 0; j < mb_cols; j++) { + char zz[4]; + + sprintf(zz, "%c", mi[mb_index].mbmi.mode + 'a'); + + vp9_blit_text(zz, y_ptr, post->y_stride); + mb_index++; + y_ptr += 16; + } + + mb_index++; /* border */ + y_ptr += post->y_stride * 16 - post->y_width; + + } + } + + if (flags & VP9D_DEBUG_TXT_DC_DIFF) { + int i, j; + uint8_t *y_ptr; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int mb_rows = post->y_height >> 4; + int mb_cols = post->y_width >> 4; + int mb_index = 0; + MODE_INFO *mi = oci->mi; + + y_ptr = post->y_buffer + 4 * post->y_stride + 4; + + /* vp9_filter each macro block */ + for (i = 0; i < mb_rows; i++) { + for (j = 0; j < mb_cols; j++) { + char zz[4]; + int dc_diff = !(mi[mb_index].mbmi.mode != I4X4_PRED && + mi[mb_index].mbmi.mode != SPLITMV && + mi[mb_index].mbmi.mb_skip_coeff); + + if (oci->frame_type == KEY_FRAME) + sprintf(zz, "a"); + else + sprintf(zz, "%c", dc_diff + '0'); + + vp9_blit_text(zz, y_ptr, post->y_stride); + mb_index++; + y_ptr += 16; + } + + mb_index++; /* border */ + y_ptr += post->y_stride * 16 - post->y_width; + + } + } + + if (flags & VP9D_DEBUG_TXT_RATE_INFO) { + char message[512]; + snprintf(message, sizeof(message), + "Bitrate: %10.2f frame_rate: %10.2f ", + oci->bitrate, oci->framerate); + vp9_blit_text(message, oci->post_proc_buffer.y_buffer, + oci->post_proc_buffer.y_stride); + } + + /* Draw motion vectors */ + if ((flags & VP9D_DEBUG_DRAW_MV) && ppflags->display_mv_flag) { + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int width = 
post->y_width; + int height = post->y_height; + uint8_t *y_buffer = oci->post_proc_buffer.y_buffer; + int y_stride = oci->post_proc_buffer.y_stride; + MODE_INFO *mi = oci->mi; + int x0, y0; + + for (y0 = 0; y0 < height; y0 += 16) { + for (x0 = 0; x0 < width; x0 += 16) { + int x1, y1; + + if (!(ppflags->display_mv_flag & (1 << mi->mbmi.mode))) { + mi++; + continue; + } + + if (mi->mbmi.mode == SPLITMV) { + switch (mi->mbmi.partitioning) { + case PARTITIONING_16X8 : { /* mv_top_bottom */ + union b_mode_info *bmi = &mi->bmi[0]; + MV *mv = &bmi->mv.as_mv; + + x1 = x0 + 8 + (mv->col >> 3); + y1 = y0 + 4 + (mv->row >> 3); + + constrain_line(x0 + 8, &x1, y0 + 4, &y1, width, height); + vp9_blit_line(x0 + 8, x1, y0 + 4, y1, y_buffer, y_stride); + + bmi = &mi->bmi[8]; + + x1 = x0 + 8 + (mv->col >> 3); + y1 = y0 + 12 + (mv->row >> 3); + + constrain_line(x0 + 8, &x1, y0 + 12, &y1, width, height); + vp9_blit_line(x0 + 8, x1, y0 + 12, y1, y_buffer, y_stride); + + break; + } + case PARTITIONING_8X16 : { /* mv_left_right */ + union b_mode_info *bmi = &mi->bmi[0]; + MV *mv = &bmi->mv.as_mv; + + x1 = x0 + 4 + (mv->col >> 3); + y1 = y0 + 8 + (mv->row >> 3); + + constrain_line(x0 + 4, &x1, y0 + 8, &y1, width, height); + vp9_blit_line(x0 + 4, x1, y0 + 8, y1, y_buffer, y_stride); + + bmi = &mi->bmi[2]; + + x1 = x0 + 12 + (mv->col >> 3); + y1 = y0 + 8 + (mv->row >> 3); + + constrain_line(x0 + 12, &x1, y0 + 8, &y1, width, height); + vp9_blit_line(x0 + 12, x1, y0 + 8, y1, y_buffer, y_stride); + + break; + } + case PARTITIONING_8X8 : { /* mv_quarters */ + union b_mode_info *bmi = &mi->bmi[0]; + MV *mv = &bmi->mv.as_mv; + + x1 = x0 + 4 + (mv->col >> 3); + y1 = y0 + 4 + (mv->row >> 3); + + constrain_line(x0 + 4, &x1, y0 + 4, &y1, width, height); + vp9_blit_line(x0 + 4, x1, y0 + 4, y1, y_buffer, y_stride); + + bmi = &mi->bmi[2]; + + x1 = x0 + 12 + (mv->col >> 3); + y1 = y0 + 4 + (mv->row >> 3); + + constrain_line(x0 + 12, &x1, y0 + 4, &y1, width, height); + vp9_blit_line(x0 + 12, x1, y0 + 4, 
y1, y_buffer, y_stride); + + bmi = &mi->bmi[8]; + + x1 = x0 + 4 + (mv->col >> 3); + y1 = y0 + 12 + (mv->row >> 3); + + constrain_line(x0 + 4, &x1, y0 + 12, &y1, width, height); + vp9_blit_line(x0 + 4, x1, y0 + 12, y1, y_buffer, y_stride); + + bmi = &mi->bmi[10]; + + x1 = x0 + 12 + (mv->col >> 3); + y1 = y0 + 12 + (mv->row >> 3); + + constrain_line(x0 + 12, &x1, y0 + 12, &y1, width, height); + vp9_blit_line(x0 + 12, x1, y0 + 12, y1, y_buffer, y_stride); + break; + } + case PARTITIONING_4X4: + default : { + union b_mode_info *bmi = mi->bmi; + int bx0, by0; + + for (by0 = y0; by0 < (y0 + 16); by0 += 4) { + for (bx0 = x0; bx0 < (x0 + 16); bx0 += 4) { + MV *mv = &bmi->mv.as_mv; + + x1 = bx0 + 2 + (mv->col >> 3); + y1 = by0 + 2 + (mv->row >> 3); + + constrain_line(bx0 + 2, &x1, by0 + 2, &y1, width, height); + vp9_blit_line(bx0 + 2, x1, by0 + 2, y1, y_buffer, y_stride); + + bmi++; + } + } + } + } + } else if (mi->mbmi.mode >= NEARESTMV) { + MV *mv = &mi->mbmi.mv.as_mv; + const int lx0 = x0 + 8; + const int ly0 = y0 + 8; + + x1 = lx0 + (mv->col >> 3); + y1 = ly0 + (mv->row >> 3); + + if (x1 != lx0 && y1 != ly0) { + constrain_line(lx0, &x1, ly0 - 1, &y1, width, height); + vp9_blit_line(lx0, x1, ly0 - 1, y1, y_buffer, y_stride); + + constrain_line(lx0, &x1, ly0 + 1, &y1, width, height); + vp9_blit_line(lx0, x1, ly0 + 1, y1, y_buffer, y_stride); + } else + vp9_blit_line(lx0, x1, ly0, y1, y_buffer, y_stride); + } + + mi++; + } + mi++; + } + } + + /* Color in block modes */ + if ((flags & VP9D_DEBUG_CLR_BLK_MODES) + && (ppflags->display_mb_modes_flag || ppflags->display_b_modes_flag)) { + int y, x; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int width = post->y_width; + int height = post->y_height; + uint8_t *y_ptr = oci->post_proc_buffer.y_buffer; + uint8_t *u_ptr = oci->post_proc_buffer.u_buffer; + uint8_t *v_ptr = oci->post_proc_buffer.v_buffer; + int y_stride = oci->post_proc_buffer.y_stride; + MODE_INFO *mi = oci->mi; + + for (y = 0; y < height; y += 16) { + for 
(x = 0; x < width; x += 16) { + int Y = 0, U = 0, V = 0; + + if (mi->mbmi.mode == I4X4_PRED && + ((ppflags->display_mb_modes_flag & I4X4_PRED) || + ppflags->display_b_modes_flag)) { + int by, bx; + uint8_t *yl, *ul, *vl; + union b_mode_info *bmi = mi->bmi; + + yl = y_ptr + x; + ul = u_ptr + (x >> 1); + vl = v_ptr + (x >> 1); + + for (by = 0; by < 16; by += 4) { + for (bx = 0; bx < 16; bx += 4) { + if ((ppflags->display_b_modes_flag & (1 << mi->mbmi.mode)) + || (ppflags->display_mb_modes_flag & I4X4_PRED)) { + Y = B_PREDICTION_MODE_colors[bmi->as_mode.first][0]; + U = B_PREDICTION_MODE_colors[bmi->as_mode.first][1]; + V = B_PREDICTION_MODE_colors[bmi->as_mode.first][2]; + + vp9_blend_b(yl + bx, ul + (bx >> 1), vl + (bx >> 1), Y, U, V, + 0xc000, y_stride); + } + bmi++; + } + + yl += y_stride * 4; + ul += y_stride * 1; + vl += y_stride * 1; + } + } else if (ppflags->display_mb_modes_flag & (1 << mi->mbmi.mode)) { + Y = MB_PREDICTION_MODE_colors[mi->mbmi.mode][0]; + U = MB_PREDICTION_MODE_colors[mi->mbmi.mode][1]; + V = MB_PREDICTION_MODE_colors[mi->mbmi.mode][2]; + + vp9_blend_mb_inner(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), + Y, U, V, 0xc000, y_stride); + } + + mi++; + } + y_ptr += y_stride * 16; + u_ptr += y_stride * 4; + v_ptr += y_stride * 4; + + mi++; + } + } + + /* Color in frame reference blocks */ + if ((flags & VP9D_DEBUG_CLR_FRM_REF_BLKS) && + ppflags->display_ref_frame_flag) { + int y, x; + YV12_BUFFER_CONFIG *post = &oci->post_proc_buffer; + int width = post->y_width; + int height = post->y_height; + uint8_t *y_ptr = oci->post_proc_buffer.y_buffer; + uint8_t *u_ptr = oci->post_proc_buffer.u_buffer; + uint8_t *v_ptr = oci->post_proc_buffer.v_buffer; + int y_stride = oci->post_proc_buffer.y_stride; + MODE_INFO *mi = oci->mi; + + for (y = 0; y < height; y += 16) { + for (x = 0; x < width; x += 16) { + int Y = 0, U = 0, V = 0; + + if (ppflags->display_ref_frame_flag & (1 << mi->mbmi.ref_frame)) { + Y = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][0]; + 
U = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][1]; + V = MV_REFERENCE_FRAME_colors[mi->mbmi.ref_frame][2]; + + vp9_blend_mb_outer(y_ptr + x, u_ptr + (x >> 1), v_ptr + (x >> 1), + Y, U, V, 0xc000, y_stride); + } + + mi++; + } + y_ptr += y_stride * 16; + u_ptr += y_stride * 4; + v_ptr += y_stride * 4; + + mi++; + } + } +#endif + + *dest = oci->post_proc_buffer; + + /* handle problem with extending borders */ + dest->y_width = oci->width; + dest->y_height = oci->height; + dest->uv_height = dest->y_height / 2; + + return 0; +} diff --git a/libvpx/vp9/common/vp9_postproc.h b/libvpx/vp9/common/vp9_postproc.h new file mode 100644 index 000000000..2c0d333b6 --- /dev/null +++ b/libvpx/vp9/common/vp9_postproc.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + + +#ifndef VP9_COMMON_VP9_POSTPROC_H_ +#define VP9_COMMON_VP9_POSTPROC_H_ + +#include "vpx_ports/mem.h" + +struct postproc_state { + int last_q; + int last_noise; + char noise[3072]; + DECLARE_ALIGNED(16, char, blackclamp[16]); + DECLARE_ALIGNED(16, char, whiteclamp[16]); + DECLARE_ALIGNED(16, char, bothclamp[16]); +}; + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_ppflags.h" + +int vp9_post_proc_frame(struct VP9Common *oci, YV12_BUFFER_CONFIG *dest, + vp9_ppflags_t *flags); + +void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); + +void vp9_deblock(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q); + +#endif // VP9_COMMON_VP9_POSTPROC_H_ diff --git a/libvpx/vp9/common/vp9_ppflags.h b/libvpx/vp9/common/vp9_ppflags.h new file mode 100644 index 000000000..561c93028 --- /dev/null +++ b/libvpx/vp9/common/vp9_ppflags.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_PPFLAGS_H_ +#define VP9_COMMON_VP9_PPFLAGS_H_ + +enum { + VP9D_NOFILTERING = 0, + VP9D_DEBLOCK = 1 << 0, + VP9D_DEMACROBLOCK = 1 << 1, + VP9D_ADDNOISE = 1 << 2, + VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3, + VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4, + VP9D_DEBUG_TXT_DC_DIFF = 1 << 5, + VP9D_DEBUG_TXT_RATE_INFO = 1 << 6, + VP9D_DEBUG_DRAW_MV = 1 << 7, + VP9D_DEBUG_CLR_BLK_MODES = 1 << 8, + VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9 +}; + +typedef struct { + int post_proc_flag; + int deblocking_level; + int noise_level; + int display_ref_frame_flag; + int display_mb_modes_flag; + int display_b_modes_flag; + int display_mv_flag; +} vp9_ppflags_t; + +#endif // VP9_COMMON_VP9_PPFLAGS_H_ diff --git a/libvpx/vp9/common/vp9_pragmas.h b/libvpx/vp9/common/vp9_pragmas.h new file mode 100644 index 000000000..f079161d6 --- /dev/null +++ b/libvpx/vp9/common/vp9_pragmas.h @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_PRAGMAS_H_ +#define VP9_COMMON_VP9_PRAGMAS_H_ + +#ifdef __INTEL_COMPILER +#pragma warning(disable:997 1011 170) +#endif + +#ifdef _MSC_VER +#pragma warning(disable:4799) +#endif + +#endif // VP9_COMMON_VP9_PRAGMAS_H_ diff --git a/libvpx/vp9/common/vp9_pred_common.c b/libvpx/vp9/common/vp9_pred_common.c new file mode 100644 index 000000000..17da4f215 --- /dev/null +++ b/libvpx/vp9/common/vp9_pred_common.c @@ -0,0 +1,520 @@ + +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <limits.h> + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_pred_common.h" +#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_treecoder.h" + +// TBD prediction functions for various bitstream signals + +// Returns a context number for the given MB prediction signal +unsigned char vp9_get_pred_context(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + PRED_ID pred_id) { + int pred_context; + const MODE_INFO *const mi = xd->mode_info_context; + const MODE_INFO *const above_mi = mi - cm->mode_info_stride; + const MODE_INFO *const left_mi = mi - 1; + const int left_in_image = xd->left_available && left_mi->mbmi.mb_in_image; + const int above_in_image = xd->up_available && above_mi->mbmi.mb_in_image; + // Note: + // The mode info data structure has a one element border above and to the + // left of the entries correpsonding to real macroblocks. + // The prediction flags in these dummy entries are initialised to 0. + switch (pred_id) { + case PRED_SEG_ID: + pred_context = above_mi->mbmi.seg_id_predicted; + if (xd->left_available) + pred_context += left_mi->mbmi.seg_id_predicted; + break; + + case PRED_MBSKIP: + pred_context = above_mi->mbmi.mb_skip_coeff; + if (xd->left_available) + pred_context += left_mi->mbmi.mb_skip_coeff; + break; + + case PRED_SWITCHABLE_INTERP: { + // left + const int left_mv_pred = is_inter_mode(left_mi->mbmi.mode); + const int left_interp = left_in_image && left_mv_pred ? + vp9_switchable_interp_map[left_mi->mbmi.interp_filter] : + VP9_SWITCHABLE_FILTERS; + + // above + const int above_mv_pred = is_inter_mode(above_mi->mbmi.mode); + const int above_interp = above_in_image && above_mv_pred ? 
+ vp9_switchable_interp_map[above_mi->mbmi.interp_filter] : + VP9_SWITCHABLE_FILTERS; + + assert(left_interp != -1); + assert(above_interp != -1); + + if (left_interp == above_interp) + pred_context = left_interp; + else if (left_interp == VP9_SWITCHABLE_FILTERS && + above_interp != VP9_SWITCHABLE_FILTERS) + pred_context = above_interp; + else if (left_interp != VP9_SWITCHABLE_FILTERS && + above_interp == VP9_SWITCHABLE_FILTERS) + pred_context = left_interp; + else + pred_context = VP9_SWITCHABLE_FILTERS; + + break; + } + + case PRED_INTRA_INTER: { + if (above_in_image && left_in_image) { // both edges available + if (left_mi->mbmi.ref_frame[0] == INTRA_FRAME && + above_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/intra (3) + pred_context = 3; + } else { // intra/inter (1) or inter/inter (0) + pred_context = left_mi->mbmi.ref_frame[0] == INTRA_FRAME || + above_mi->mbmi.ref_frame[0] == INTRA_FRAME; + } + } else if (above_in_image || left_in_image) { // one edge available + const MODE_INFO *edge = above_in_image ? 
above_mi : left_mi; + + // inter: 0, intra: 2 + pred_context = 2 * (edge->mbmi.ref_frame[0] == INTRA_FRAME); + } else { + pred_context = 0; + } + assert(pred_context >= 0 && pred_context < INTRA_INTER_CONTEXTS); + break; + } + + case PRED_COMP_INTER_INTER: { + if (above_in_image && left_in_image) { // both edges available + if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME && + left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { + // neither edge uses comp pred (0/1) + pred_context = ((above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref) ^ + (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref)); + } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { + // one of two edges uses comp pred (2/3) + pred_context = 2 + + (above_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref || + above_mi->mbmi.ref_frame[0] == INTRA_FRAME); + } else if (left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { + // one of two edges uses comp pred (2/3) + pred_context = 2 + + (left_mi->mbmi.ref_frame[0] == cm->comp_fixed_ref || + left_mi->mbmi.ref_frame[0] == INTRA_FRAME); + } else { // both edges use comp pred (4) + pred_context = 4; + } + } else if (above_in_image || left_in_image) { // one edge available + const MODE_INFO *edge = above_in_image ? 
above_mi : left_mi; + + if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { + // edge does not use comp pred (0/1) + pred_context = edge->mbmi.ref_frame[0] == cm->comp_fixed_ref; + } else { // edge uses comp pred (3) + pred_context = 3; + } + } else { // no edges available (1) + pred_context = 1; + } + assert(pred_context >= 0 && pred_context < COMP_INTER_CONTEXTS); + break; + } + + case PRED_COMP_REF_P: { + const int fix_ref_idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref]; + const int var_ref_idx = !fix_ref_idx; + + if (above_in_image && left_in_image) { // both edges available + if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME && + left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/intra (2) + pred_context = 2; + } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME || + left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { // intra/inter + const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ? + left_mi : above_mi; + + if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { // single pred (1/3) + pred_context = 1 + + 2 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1]; + } else { // comp pred (1/3) + pred_context = 1 + + 2 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1]; + } + } else { // inter/inter + int l_sg = left_mi->mbmi.ref_frame[1] <= INTRA_FRAME; + int a_sg = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME; + MV_REFERENCE_FRAME vrfa = a_sg ? above_mi->mbmi.ref_frame[0] : + above_mi->mbmi.ref_frame[var_ref_idx]; + MV_REFERENCE_FRAME vrfl = l_sg ? left_mi->mbmi.ref_frame[0] : + left_mi->mbmi.ref_frame[var_ref_idx]; + + if (vrfa == vrfl && cm->comp_var_ref[1] == vrfa) { + pred_context = 0; + } else if (l_sg && a_sg) { // single/single + if ((vrfa == cm->comp_fixed_ref && vrfl == cm->comp_var_ref[0]) || + (vrfl == cm->comp_fixed_ref && vrfa == cm->comp_var_ref[0])) { + pred_context = 4; + } else if (vrfa == vrfl) { + pred_context = 3; + } else { + pred_context = 1; + } + } else if (l_sg || a_sg) { // single/comp + MV_REFERENCE_FRAME vrfc = l_sg ? 
vrfa : vrfl; + MV_REFERENCE_FRAME rfs = a_sg ? vrfa : vrfl; + + if (vrfc == cm->comp_var_ref[1] && rfs != cm->comp_var_ref[1]) { + pred_context = 1; + } else if (rfs == cm->comp_var_ref[1] && + vrfc != cm->comp_var_ref[1]) { + pred_context = 2; + } else { + pred_context = 4; + } + } else if (vrfa == vrfl) { // comp/comp + pred_context = 4; + } else { + pred_context = 2; + } + } + } else if (above_in_image || left_in_image) { // one edge available + const MODE_INFO *edge = above_in_image ? above_mi : left_mi; + + if (edge->mbmi.ref_frame[0] == INTRA_FRAME) { + pred_context = 2; + } else if (edge->mbmi.ref_frame[1] > INTRA_FRAME) { + pred_context = + 4 * edge->mbmi.ref_frame[var_ref_idx] != cm->comp_var_ref[1]; + } else { + pred_context = 3 * edge->mbmi.ref_frame[0] != cm->comp_var_ref[1]; + } + } else { // no edges available (2) + pred_context = 2; + } + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + break; + } + + case PRED_SINGLE_REF_P1: { + if (above_in_image && left_in_image) { // both edges available + if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME && + left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { + pred_context = 2; + } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME || + left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { + const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ? 
+ left_mi : above_mi; + + if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { + pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME); + } else { + pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME || + edge->mbmi.ref_frame[1] == LAST_FRAME); + } + } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME && + left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { + pred_context = 2 * (above_mi->mbmi.ref_frame[0] == LAST_FRAME) + + 2 * (left_mi->mbmi.ref_frame[0] == LAST_FRAME); + } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME && + left_mi->mbmi.ref_frame[1] > INTRA_FRAME) { + pred_context = 1 + (above_mi->mbmi.ref_frame[0] == LAST_FRAME || + above_mi->mbmi.ref_frame[1] == LAST_FRAME || + left_mi->mbmi.ref_frame[0] == LAST_FRAME || + left_mi->mbmi.ref_frame[1] == LAST_FRAME); + } else { + MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ? + above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; + MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? + above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; + MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? + above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1]; + + if (rfs == LAST_FRAME) { + pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME); + } else { + pred_context = crf1 == LAST_FRAME || crf2 == LAST_FRAME; + } + } + } else if (above_in_image || left_in_image) { // one edge available + const MODE_INFO *edge = above_in_image ? 
above_mi : left_mi; + + if (edge->mbmi.ref_frame[0] == INTRA_FRAME) { + pred_context = 2; + } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { + pred_context = 4 * (edge->mbmi.ref_frame[0] == LAST_FRAME); + } else { + pred_context = 1 + (edge->mbmi.ref_frame[0] == LAST_FRAME || + edge->mbmi.ref_frame[1] == LAST_FRAME); + } + } else { // no edges available (2) + pred_context = 2; + } + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + break; + } + + case PRED_SINGLE_REF_P2: { + if (above_in_image && left_in_image) { // both edges available + if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME && + left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { + pred_context = 2; + } else if (above_mi->mbmi.ref_frame[0] == INTRA_FRAME || + left_mi->mbmi.ref_frame[0] == INTRA_FRAME) { + const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == INTRA_FRAME ? + left_mi : above_mi; + + if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { + if (edge->mbmi.ref_frame[0] == LAST_FRAME) { + pred_context = 3; + } else { + pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME); + } + } else { + pred_context = 1 + 2 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME || + edge->mbmi.ref_frame[1] == GOLDEN_FRAME); + } + } else if (above_mi->mbmi.ref_frame[1] <= INTRA_FRAME && + left_mi->mbmi.ref_frame[1] <= INTRA_FRAME) { + if (above_mi->mbmi.ref_frame[0] == LAST_FRAME && + left_mi->mbmi.ref_frame[0] == LAST_FRAME) { + pred_context = 3; + } else if (above_mi->mbmi.ref_frame[0] == LAST_FRAME || + left_mi->mbmi.ref_frame[0] == LAST_FRAME) { + const MODE_INFO *edge = above_mi->mbmi.ref_frame[0] == LAST_FRAME ? 
+ left_mi : above_mi; + + pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME); + } else { + pred_context = 2 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME) + + 2 * (left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME); + } + } else if (above_mi->mbmi.ref_frame[1] > INTRA_FRAME && + left_mi->mbmi.ref_frame[1] > INTRA_FRAME) { + if (above_mi->mbmi.ref_frame[0] == left_mi->mbmi.ref_frame[0] && + above_mi->mbmi.ref_frame[1] == left_mi->mbmi.ref_frame[1]) { + pred_context = 3 * (above_mi->mbmi.ref_frame[0] == GOLDEN_FRAME || + above_mi->mbmi.ref_frame[1] == GOLDEN_FRAME || + left_mi->mbmi.ref_frame[0] == GOLDEN_FRAME || + left_mi->mbmi.ref_frame[1] == GOLDEN_FRAME); + } else { + pred_context = 2; + } + } else { + MV_REFERENCE_FRAME rfs = above_mi->mbmi.ref_frame[1] <= INTRA_FRAME ? + above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; + MV_REFERENCE_FRAME crf1 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? + above_mi->mbmi.ref_frame[0] : left_mi->mbmi.ref_frame[0]; + MV_REFERENCE_FRAME crf2 = above_mi->mbmi.ref_frame[1] > INTRA_FRAME ? + above_mi->mbmi.ref_frame[1] : left_mi->mbmi.ref_frame[1]; + + if (rfs == GOLDEN_FRAME) { + pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); + } else if (rfs == ALTREF_FRAME) { + pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME; + } else { + pred_context = + 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME); + } + } + } else if (above_in_image || left_in_image) { // one edge available + const MODE_INFO *edge = above_in_image ? 
above_mi : left_mi; + + if (edge->mbmi.ref_frame[0] == INTRA_FRAME || + (edge->mbmi.ref_frame[0] == LAST_FRAME && + edge->mbmi.ref_frame[1] <= INTRA_FRAME)) { + pred_context = 2; + } else if (edge->mbmi.ref_frame[1] <= INTRA_FRAME) { + pred_context = 4 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME); + } else { + pred_context = 3 * (edge->mbmi.ref_frame[0] == GOLDEN_FRAME || + edge->mbmi.ref_frame[1] == GOLDEN_FRAME); + } + } else { // no edges available (2) + pred_context = 2; + } + assert(pred_context >= 0 && pred_context < REF_CONTEXTS); + break; + } + + case PRED_TX_SIZE: { + int above_context, left_context; + int max_tx_size; + if (mi->mbmi.sb_type < BLOCK_SIZE_SB8X8) + max_tx_size = TX_4X4; + else if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16) + max_tx_size = TX_8X8; + else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32) + max_tx_size = TX_16X16; + else + max_tx_size = TX_32X32; + above_context = left_context = max_tx_size; + if (above_in_image) { + above_context = (above_mi->mbmi.mb_skip_coeff ? + max_tx_size : above_mi->mbmi.txfm_size); + } + if (left_in_image) { + left_context = (left_mi->mbmi.mb_skip_coeff ? + max_tx_size : left_mi->mbmi.txfm_size); + } + if (!left_in_image) { + left_context = above_context; + } + if (!above_in_image) { + above_context = left_context; + } + pred_context = (above_context + left_context > max_tx_size); + break; + } + + default: + assert(0); + pred_context = 0; // *** add error trap code. 
+ break; + } + + return pred_context; +} + +// This function returns a context probability for coding a given +// prediction signal +vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + PRED_ID pred_id) { + const int pred_context = vp9_get_pred_context(cm, xd, pred_id); + + switch (pred_id) { + case PRED_SEG_ID: + return cm->segment_pred_probs[pred_context]; + case PRED_MBSKIP: + return cm->fc.mbskip_probs[pred_context]; + case PRED_INTRA_INTER: + return cm->fc.intra_inter_prob[pred_context]; + case PRED_COMP_INTER_INTER: + return cm->fc.comp_inter_prob[pred_context]; + case PRED_COMP_REF_P: + return cm->fc.comp_ref_prob[pred_context]; + case PRED_SINGLE_REF_P1: + return cm->fc.single_ref_prob[pred_context][0]; + case PRED_SINGLE_REF_P2: + return cm->fc.single_ref_prob[pred_context][1]; + default: + assert(0); + return 128; // *** add error trap code. + } +} + +// This function returns a context probability ptr for coding a given +// prediction signal +const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm, + const MACROBLOCKD *const xd, + PRED_ID pred_id) { + const MODE_INFO *const mi = xd->mode_info_context; + const int pred_context = vp9_get_pred_context(cm, xd, pred_id); + + switch (pred_id) { + case PRED_SWITCHABLE_INTERP: + return &cm->fc.switchable_interp_prob[pred_context][0]; + + case PRED_TX_SIZE: + if (mi->mbmi.sb_type < BLOCK_SIZE_MB16X16) + return cm->fc.tx_probs_8x8p[pred_context]; + else if (mi->mbmi.sb_type < BLOCK_SIZE_SB32X32) + return cm->fc.tx_probs_16x16p[pred_context]; + else + return cm->fc.tx_probs_32x32p[pred_context]; + + default: + assert(0); + return NULL; // *** add error trap code. + } +} + +// This function returns the status of the given prediction signal. +// I.e. is the predicted value for the given signal correct. 
+unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd, + PRED_ID pred_id) { + switch (pred_id) { + case PRED_SEG_ID: + return xd->mode_info_context->mbmi.seg_id_predicted; + case PRED_MBSKIP: + return xd->mode_info_context->mbmi.mb_skip_coeff; + default: + assert(0); + return 0; // *** add error trap code. + } +} + +// This function sets the status of the given prediction signal. +// I.e. is the predicted value for the given signal correct. +void vp9_set_pred_flag(MACROBLOCKD *const xd, + PRED_ID pred_id, + unsigned char pred_flag) { + const int mis = xd->mode_info_stride; + BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + const int bh = 1 << mi_height_log2(bsize); + const int bw = 1 << mi_width_log2(bsize); +#define sub(a, b) (b) < 0 ? (a) + (b) : (a) + const int x_mis = sub(bw, xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)); + const int y_mis = sub(bh, xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)); +#undef sub + int x, y; + + switch (pred_id) { + case PRED_SEG_ID: + for (y = 0; y < y_mis; y++) { + for (x = 0; x < x_mis; x++) { + xd->mode_info_context[y * mis + x].mbmi.seg_id_predicted = pred_flag; + } + } + break; + + case PRED_MBSKIP: + for (y = 0; y < y_mis; y++) { + for (x = 0; x < x_mis; x++) { + xd->mode_info_context[y * mis + x].mbmi.mb_skip_coeff = pred_flag; + } + } + break; + + default: + assert(0); + // *** add error trap code. + break; + } +} + + +// The following contain the guts of the prediction code used to +// peredict various bitstream signals. 
+ +// Macroblock segment id prediction function +int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type, + int mi_row, int mi_col) { + const int mi_index = mi_row * cm->mi_cols + mi_col; + const int bw = 1 << mi_width_log2(sb_type); + const int bh = 1 << mi_height_log2(sb_type); + const int ymis = MIN(cm->mi_rows - mi_row, bh); + const int xmis = MIN(cm->mi_cols - mi_col, bw); + int segment_id = INT_MAX; + int x, y; + + for (y = 0; y < ymis; y++) { + for (x = 0; x < xmis; x++) { + const int index = mi_index + (y * cm->mi_cols + x); + segment_id = MIN(segment_id, cm->last_frame_seg_map[index]); + } + } + return segment_id; +} diff --git a/libvpx/vp9/common/vp9_pred_common.h b/libvpx/vp9/common/vp9_pred_common.h new file mode 100644 index 000000000..b728724b7 --- /dev/null +++ b/libvpx/vp9/common/vp9_pred_common.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
 */

#ifndef VP9_COMMON_VP9_PRED_COMMON_H_
#define VP9_COMMON_VP9_PRED_COMMON_H_

#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_onyxc_int.h"

// Predicted items
typedef enum {
  PRED_SEG_ID = 0,             // Segment identifier
  PRED_MBSKIP = 1,             // Skip-coefficient flag (mb_skip_coeff)
  PRED_SWITCHABLE_INTERP = 2,  // Switchable interpolation filter choice
  PRED_INTRA_INTER = 3,        // Intra vs inter coding decision
  PRED_COMP_INTER_INTER = 4,   // Single vs compound reference decision
  PRED_SINGLE_REF_P1 = 5,      // Single-reference selection, first bit
  PRED_SINGLE_REF_P2 = 6,      // Single-reference selection, second bit
  PRED_COMP_REF_P = 7,         // Compound variable-reference choice
  PRED_TX_SIZE = 8             // Transform size
} PRED_ID;

// Derives the coding context for |pred_id| from the neighboring blocks.
unsigned char vp9_get_pred_context(const VP9_COMMON *const cm,
                                   const MACROBLOCKD *const xd,
                                   PRED_ID pred_id);

// Single probability for |pred_id| under the current context.
vp9_prob vp9_get_pred_prob(const VP9_COMMON *const cm,
                           const MACROBLOCKD *const xd,
                           PRED_ID pred_id);

// Probability array for signals coded with several probabilities
// (PRED_SWITCHABLE_INTERP, PRED_TX_SIZE).
const vp9_prob *vp9_get_pred_probs(const VP9_COMMON *const cm,
                                   const MACROBLOCKD *const xd,
                                   PRED_ID pred_id);

// Reads the stored prediction flag (PRED_SEG_ID / PRED_MBSKIP only).
unsigned char vp9_get_pred_flag(const MACROBLOCKD *const xd,
                                PRED_ID pred_id);

// Writes |pred_flag| into every mode-info unit of the current block.
void vp9_set_pred_flag(MACROBLOCKD *const xd,
                       PRED_ID pred_id,
                       unsigned char pred_flag);


// Predicts a block's segment id from the last frame's segment map.
int vp9_get_pred_mi_segid(VP9_COMMON *cm, BLOCK_SIZE_TYPE sb_type,
                          int mi_row, int mi_col);

#endif  // VP9_COMMON_VP9_PRED_COMMON_H_
diff --git a/libvpx/vp9/common/vp9_quant_common.c b/libvpx/vp9/common/vp9_quant_common.c
new file mode 100644
index 000000000..295c8e738
--- /dev/null
+++ b/libvpx/vp9/common/vp9_quant_common.c
@@ -0,0 +1,69 @@
/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_seg_common.h" + +static int16_t dc_qlookup[QINDEX_RANGE]; +static int16_t ac_qlookup[QINDEX_RANGE]; + +#define ACDC_MIN 8 + +// TODO(dkovalev) move to common and reuse +static double poly3(double a, double b, double c, double d, double x) { + return a*x*x*x + b*x*x + c*x + d; +} + +void vp9_init_quant_tables() { + int i, val = 4; + + // A "real" q of 1.0 forces lossless mode. + // In practice non lossless Q's between 1.0 and 2.0 (represented here by + // integer values from 5-7 give poor rd results (lower psnr and often + // larger size than the lossless encode. To block out those "not very useful" + // values we increment the ac and dc q lookup values by 4 after position 0. + ac_qlookup[0] = val; + dc_qlookup[0] = val; + val += 4; + + for (i = 1; i < QINDEX_RANGE; i++) { + const int ac_val = val; + + val = (int)(val * 1.01975); + if (val == ac_val) + ++val; + + ac_qlookup[i] = (int16_t)ac_val; + dc_qlookup[i] = (int16_t)MAX(ACDC_MIN, poly3(0.000000305, -0.00065, 0.9, + 0.5, ac_val)); + } +} + +int16_t vp9_dc_quant(int qindex, int delta) { + return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; +} + +int16_t vp9_ac_quant(int qindex, int delta) { + return ac_qlookup[clamp(qindex + delta, 0, MAXQ)]; +} + + +int vp9_get_qindex(MACROBLOCKD *xd, int segment_id, int base_qindex) { + if (vp9_segfeature_active(xd, segment_id, SEG_LVL_ALT_Q)) { + const int data = vp9_get_segdata(xd, segment_id, SEG_LVL_ALT_Q); + return xd->mb_segment_abs_delta == SEGMENT_ABSDATA ? + data : // Abs value + clamp(base_qindex + data, 0, MAXQ); // Delta value + } else { + return base_qindex; + } +} + diff --git a/libvpx/vp9/common/vp9_quant_common.h b/libvpx/vp9/common/vp9_quant_common.h new file mode 100644 index 000000000..ded94269a --- /dev/null +++ b/libvpx/vp9/common/vp9_quant_common.h @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_QUANT_COMMON_H_ +#define VP9_COMMON_VP9_QUANT_COMMON_H_ + +#include "vp9/common/vp9_blockd.h" + +#define MINQ 0 +#define MAXQ 255 +#define QINDEX_RANGE (MAXQ - MINQ + 1) +#define QINDEX_BITS 8 + +void vp9_init_quant_tables(); + +int16_t vp9_dc_quant(int qindex, int delta); +int16_t vp9_ac_quant(int qindex, int delta); + +int vp9_get_qindex(MACROBLOCKD *mb, int segment_id, int base_qindex); + +#endif // VP9_COMMON_VP9_QUANT_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_reconinter.c b/libvpx/vp9/common/vp9_reconinter.c new file mode 100644 index 000000000..b28d33319 --- /dev/null +++ b/libvpx/vp9/common/vp9_reconinter.c @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_filter.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" + +static int scale_value_x_with_scaling(int val, + const struct scale_factors *scale) { + return (val * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT); +} + +static int scale_value_y_with_scaling(int val, + const struct scale_factors *scale) { + return (val * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT); +} + +static int unscaled_value(int val, const struct scale_factors *scale) { + (void) scale; + return val; +} + +static int_mv32 mv_q3_to_q4_with_scaling(const int_mv *src_mv, + const struct scale_factors *scale) { + // returns mv * scale + offset + int_mv32 result; + const int32_t mv_row_q4 = src_mv->as_mv.row << 1; + const int32_t mv_col_q4 = src_mv->as_mv.col << 1; + + result.as_mv.row = (mv_row_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) + + scale->y_offset_q4; + result.as_mv.col = (mv_col_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) + + scale->x_offset_q4; + return result; +} + +static int_mv32 mv_q3_to_q4_without_scaling(const int_mv *src_mv, + const struct scale_factors *scale) { + // returns mv * scale + offset + int_mv32 result; + + result.as_mv.row = src_mv->as_mv.row << 1; + result.as_mv.col = src_mv->as_mv.col << 1; + return result; +} + +static int32_t mv_component_q4_with_scaling(int mv_q4, int scale_fp, + int offset_q4) { + int32_t scaled_mv; + // returns the scaled and offset value of the mv component. + scaled_mv = (mv_q4 * scale_fp >> VP9_REF_SCALE_SHIFT) + offset_q4; + + return scaled_mv; +} + +static int32_t mv_component_q4_without_scaling(int mv_q4, int scale_fp, + int offset_q4) { + // returns the scaled and offset value of the mv component. 
+ (void)scale_fp; + (void)offset_q4; + return mv_q4; +} + +static void set_offsets_with_scaling(struct scale_factors *scale, + int row, int col) { + const int x_q4 = 16 * col; + const int y_q4 = 16 * row; + + scale->x_offset_q4 = (x_q4 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf; + scale->y_offset_q4 = (y_q4 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT) & 0xf; +} + +static void set_offsets_without_scaling(struct scale_factors *scale, + int row, int col) { + scale->x_offset_q4 = 0; + scale->y_offset_q4 = 0; +} + +static int get_fixed_point_scale_factor(int other_size, int this_size) { + // Calculate scaling factor once for each reference frame + // and use fixed point scaling factors in decoding and encoding routines. + // Hardware implementations can calculate scale factor in device driver + // and use multiplication and shifting on hardware instead of division. + return (other_size << VP9_REF_SCALE_SHIFT) / this_size; +} + +void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + int other_w, int other_h, + int this_w, int this_h) { + scale->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w); + scale->x_offset_q4 = 0; // calculated per-mb + scale->x_step_q4 = (16 * scale->x_scale_fp >> VP9_REF_SCALE_SHIFT); + + scale->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h); + scale->y_offset_q4 = 0; // calculated per-mb + scale->y_step_q4 = (16 * scale->y_scale_fp >> VP9_REF_SCALE_SHIFT); + + if ((other_w == this_w) && (other_h == this_h)) { + scale->scale_value_x = unscaled_value; + scale->scale_value_y = unscaled_value; + scale->set_scaled_offsets = set_offsets_without_scaling; + scale->scale_mv_q3_to_q4 = mv_q3_to_q4_without_scaling; + scale->scale_mv_component_q4 = mv_component_q4_without_scaling; + } else { + scale->scale_value_x = scale_value_x_with_scaling; + scale->scale_value_y = scale_value_y_with_scaling; + scale->set_scaled_offsets = set_offsets_with_scaling; + scale->scale_mv_q3_to_q4 = mv_q3_to_q4_with_scaling; + 
scale->scale_mv_component_q4 = mv_component_q4_with_scaling; + } + + // TODO(agrange): Investigate the best choice of functions to use here + // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what + // to do at full-pel offsets. The current selection, where the filter is + // applied in one direction only, and not at all for 0,0, seems to give the + // best quality, but it may be worth trying an additional mode that does + // do the filtering on full-pel. + if (scale->x_step_q4 == 16) { + if (scale->y_step_q4 == 16) { + // No scaling in either direction. + scale->predict[0][0][0] = vp9_convolve_copy; + scale->predict[0][0][1] = vp9_convolve_avg; + scale->predict[0][1][0] = vp9_convolve8_vert; + scale->predict[0][1][1] = vp9_convolve8_avg_vert; + scale->predict[1][0][0] = vp9_convolve8_horiz; + scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + } else { + // No scaling in x direction. Must always scale in the y direction. + scale->predict[0][0][0] = vp9_convolve8_vert; + scale->predict[0][0][1] = vp9_convolve8_avg_vert; + scale->predict[0][1][0] = vp9_convolve8_vert; + scale->predict[0][1][1] = vp9_convolve8_avg_vert; + scale->predict[1][0][0] = vp9_convolve8; + scale->predict[1][0][1] = vp9_convolve8_avg; + } + } else { + if (scale->y_step_q4 == 16) { + // No scaling in the y direction. Must always scale in the x direction. + scale->predict[0][0][0] = vp9_convolve8_horiz; + scale->predict[0][0][1] = vp9_convolve8_avg_horiz; + scale->predict[0][1][0] = vp9_convolve8; + scale->predict[0][1][1] = vp9_convolve8_avg; + scale->predict[1][0][0] = vp9_convolve8_horiz; + scale->predict[1][0][1] = vp9_convolve8_avg_horiz; + } else { + // Must always scale in both directions. 
+ scale->predict[0][0][0] = vp9_convolve8; + scale->predict[0][0][1] = vp9_convolve8_avg; + scale->predict[0][1][0] = vp9_convolve8; + scale->predict[0][1][1] = vp9_convolve8_avg; + scale->predict[1][0][0] = vp9_convolve8; + scale->predict[1][0][1] = vp9_convolve8_avg; + } + } + // 2D subpel motion always gets filtered in both directions + scale->predict[1][1][0] = vp9_convolve8; + scale->predict[1][1][1] = vp9_convolve8_avg; +} + +void vp9_setup_interp_filters(MACROBLOCKD *xd, + INTERPOLATIONFILTERTYPE mcomp_filter_type, + VP9_COMMON *cm) { + if (xd->mode_info_context) { + MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; + + set_scale_factors(xd, + mbmi->ref_frame[0] - 1, + mbmi->ref_frame[1] - 1, + cm->active_ref_scale); + } + + switch (mcomp_filter_type) { + case EIGHTTAP: + case SWITCHABLE: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8; + break; + case EIGHTTAP_SMOOTH: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp; + break; + case EIGHTTAP_SHARP: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s; + break; + case BILINEAR: + xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters; + break; + } + assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0); +} + +void vp9_copy_mem16x16_c(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride) { + int r; + + for (r = 0; r < 16; r++) { +#if !(CONFIG_FAST_UNALIGNED) + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = src[5]; + dst[6] = src[6]; + dst[7] = src[7]; + dst[8] = src[8]; + dst[9] = src[9]; + dst[10] = src[10]; + dst[11] = src[11]; + dst[12] = src[12]; + dst[13] = src[13]; + dst[14] = src[14]; + dst[15] = src[15]; + +#else + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; + ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2]; + ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3]; + +#endif + src += src_stride; + dst += 
dst_stride; + } +} + +void vp9_copy_mem8x8_c(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride) { + int r; + + for (r = 0; r < 8; r++) { +#if !(CONFIG_FAST_UNALIGNED) + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = src[5]; + dst[6] = src[6]; + dst[7] = src[7]; +#else + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; +#endif + src += src_stride; + dst += dst_stride; + } +} + +void vp9_copy_mem8x4_c(const uint8_t *src, + int src_stride, + uint8_t *dst, + int dst_stride) { + int r; + + for (r = 0; r < 4; r++) { +#if !(CONFIG_FAST_UNALIGNED) + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = src[5]; + dst[6] = src[6]; + dst[7] = src[7]; +#else + ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0]; + ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1]; +#endif + src += src_stride; + dst += dst_stride; + } +} + +void vp9_build_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int_mv *mv_q3, + const struct scale_factors *scale, + int w, int h, int weight, + const struct subpix_fn_table *subpix) { + int_mv32 mv = scale->scale_mv_q3_to_q4(mv_q3, scale); + src += (mv.as_mv.row >> 4) * src_stride + (mv.as_mv.col >> 4); + scale->predict[!!(mv.as_mv.col & 15)][!!(mv.as_mv.row & 15)][weight]( + src, src_stride, dst, dst_stride, + subpix->filter_x[mv.as_mv.col & 15], scale->x_step_q4, + subpix->filter_y[mv.as_mv.row & 15], scale->y_step_q4, + w, h); +} + +void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int_mv *mv_q4, + const struct scale_factors *scale, + int w, int h, int weight, + const struct subpix_fn_table *subpix) { + const int scaled_mv_row_q4 = scale->scale_mv_component_q4(mv_q4->as_mv.row, + scale->y_scale_fp, + scale->y_offset_q4); + const int scaled_mv_col_q4 = 
scale->scale_mv_component_q4(mv_q4->as_mv.col, + scale->x_scale_fp, + scale->x_offset_q4); + const int subpel_x = scaled_mv_col_q4 & 15; + const int subpel_y = scaled_mv_row_q4 & 15; + + src += (scaled_mv_row_q4 >> 4) * src_stride + (scaled_mv_col_q4 >> 4); + scale->predict[!!subpel_x][!!subpel_y][weight]( + src, src_stride, dst, dst_stride, + subpix->filter_x[subpel_x], scale->x_step_q4, + subpix->filter_y[subpel_y], scale->y_step_q4, + w, h); +} + +static INLINE int round_mv_comp_q4(int value) { + return (value < 0 ? value - 2 : value + 2) / 4; +} + +static int mi_mv_pred_row_q4(MACROBLOCKD *mb, int idx) { + const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.row + + mb->mode_info_context->bmi[1].as_mv[idx].as_mv.row + + mb->mode_info_context->bmi[2].as_mv[idx].as_mv.row + + mb->mode_info_context->bmi[3].as_mv[idx].as_mv.row; + return round_mv_comp_q4(temp); +} + +static int mi_mv_pred_col_q4(MACROBLOCKD *mb, int idx) { + const int temp = mb->mode_info_context->bmi[0].as_mv[idx].as_mv.col + + mb->mode_info_context->bmi[1].as_mv[idx].as_mv.col + + mb->mode_info_context->bmi[2].as_mv[idx].as_mv.col + + mb->mode_info_context->bmi[3].as_mv[idx].as_mv.col; + return round_mv_comp_q4(temp); +} + +// TODO(jkoleszar): yet another mv clamping function :-( +MV clamp_mv_to_umv_border_sb(const MV *src_mv, + int bwl, int bhl, int ss_x, int ss_y, + int mb_to_left_edge, int mb_to_top_edge, + int mb_to_right_edge, int mb_to_bottom_edge) { + /* If the MV points so far into the UMV border that no visible pixels + * are used for reconstruction, the subpel part of the MV can be + * discarded and the MV limited to 16 pixels with equivalent results. 
   */
  const int spel_left = (VP9_INTERP_EXTEND + (4 << bwl)) << 4;
  const int spel_right = spel_left - (1 << 4);
  const int spel_top = (VP9_INTERP_EXTEND + (4 << bhl)) << 4;
  const int spel_bottom = spel_top - (1 << 4);
  MV clamped_mv;

  assert(ss_x <= 1);
  assert(ss_y <= 1);
  // The (1 - ss_x/ss_y) shifts apply the subsampling-aware scaling so the
  // result is in the same precision regardless of plane subsampling.
  clamped_mv.col = clamp(src_mv->col << (1 - ss_x),
                         (mb_to_left_edge << (1 - ss_x)) - spel_left,
                         (mb_to_right_edge << (1 - ss_x)) + spel_right);
  clamped_mv.row = clamp(src_mv->row << (1 - ss_y),
                         (mb_to_top_edge << (1 - ss_y)) - spel_top,
                         (mb_to_bottom_edge << (1 - ss_y)) + spel_bottom);
  return clamped_mv;
}

// Bundled arguments for build_inter_predictors(), passed through the
// foreach_predicted_block* iterators as an opaque pointer.
struct build_inter_predictors_args {
  MACROBLOCKD *xd;
  int x;                            // pixel x of the block (mi_col * MI_SIZE at call sites)
  int y;                            // pixel y of the block (mi_row * MI_SIZE at call sites)
  uint8_t* dst[MAX_MB_PLANE];       // per-plane destination buffers
  int dst_stride[MAX_MB_PLANE];
  uint8_t* pre[2][MAX_MB_PLANE];    // per-reference, per-plane source buffers
  int pre_stride[2][MAX_MB_PLANE];
};
// Emits the inter prediction for one (plane, block) pair; called once per
// predicted block by the foreach_predicted_block* iterators.
static void build_inter_predictors(int plane, int block,
                                   BLOCK_SIZE_TYPE bsize,
                                   int pred_w, int pred_h,
                                   void *argv) {
  const struct build_inter_predictors_args* const arg = argv;
  MACROBLOCKD * const xd = arg->xd;
  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
  const int bh = 4 << bhl, bw = 4 << bwl;
  // (x, y): pixel position of this block within the plane.
  const int x = 4 * (block & ((1 << bwl) - 1)), y = 4 * (block >> bwl);
  const int use_second_ref = xd->mode_info_context->mbmi.ref_frame[1] > 0;
  int which_mv;

  assert(x < bw);
  assert(y < bh);
  assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
         4 << pred_w == bw);
  assert(xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8 ||
         4 << pred_h == bh);

  // One pass per active reference frame.
  for (which_mv = 0; which_mv < 1 + use_second_ref; ++which_mv) {
    // source
    const uint8_t * const base_pre = arg->pre[which_mv][plane];
    const int pre_stride = arg->pre_stride[which_mv][plane];
    const uint8_t *const pre = base_pre +
        scaled_buffer_offset(x, y, pre_stride, &xd->scale_factor[which_mv]);
    struct scale_factors * const scale =
        plane == 0 ? &xd->scale_factor[which_mv]
                   : &xd->scale_factor_uv[which_mv];

    // dest
    uint8_t *const dst = arg->dst[plane] + arg->dst_stride[plane] * y + x;

    // motion vector
    const MV *mv;
    MV split_chroma_mv;
    int_mv clamped_mv;

    if (xd->mode_info_context->mbmi.sb_type < BLOCK_SIZE_SB8X8) {
      if (plane == 0) {
        mv = &xd->mode_info_context->bmi[block].as_mv[which_mv].as_mv;
      } else {
        // TODO(jkoleszar): All chroma MVs in SPLITMV mode are taken as the
        // same MV (the average of the 4 luma MVs) but we could do something
        // smarter for non-4:2:0. Just punt for now, pending the changes to get
        // rid of SPLITMV mode entirely.
        split_chroma_mv.row = mi_mv_pred_row_q4(xd, which_mv);
        split_chroma_mv.col = mi_mv_pred_col_q4(xd, which_mv);
        mv = &split_chroma_mv;
      }
    } else {
      mv = &xd->mode_info_context->mbmi.mv[which_mv].as_mv;
    }

    /* TODO(jkoleszar): This clamping is done in the incorrect place for the
     * scaling case. It needs to be done on the scaled MV, not the pre-scaling
     * MV. Note however that it performs the subsampling aware scaling so
     * that the result is always q4.
     */
    clamped_mv.as_mv = clamp_mv_to_umv_border_sb(mv, bwl, bhl,
                                                 xd->plane[plane].subsampling_x,
                                                 xd->plane[plane].subsampling_y,
                                                 xd->mb_to_left_edge,
                                                 xd->mb_to_top_edge,
                                                 xd->mb_to_right_edge,
                                                 xd->mb_to_bottom_edge);
    scale->set_scaled_offsets(scale, arg->y + y, arg->x + x);

    // NOTE(review): the offsets above are set on the per-plane |scale|, but
    // the call below passes xd->scale_factor[which_mv]; for chroma planes
    // these are different objects — confirm this asymmetry is intended.
    vp9_build_inter_predictor_q4(pre, pre_stride,
                                 dst, arg->dst_stride[plane],
                                 &clamped_mv, &xd->scale_factor[which_mv],
                                 4 << pred_w, 4 << pred_h, which_mv,
                                 &xd->subpix);
  }
}
// Builds the luma (plane 0) inter predictors for the whole block |bsize|.
void vp9_build_inter_predictors_sby(MACROBLOCKD *xd,
                                    int mi_row,
                                    int mi_col,
                                    BLOCK_SIZE_TYPE bsize) {
  struct build_inter_predictors_args args = {
    xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
    {xd->plane[0].dst.buf, NULL, NULL}, {xd->plane[0].dst.stride, 0, 0},
    {{xd->plane[0].pre[0].buf, NULL, NULL},
     {xd->plane[0].pre[1].buf, NULL, NULL}},
    {{xd->plane[0].pre[0].stride, 0, 0}, {xd->plane[0].pre[1].stride, 0, 0}},
  };

  foreach_predicted_block_in_plane(xd, bsize, 0, build_inter_predictors, &args);
}
// Builds the chroma (and alpha, when configured) inter predictors for the
// whole block |bsize|.
void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd,
                                     int mi_row,
                                     int mi_col,
                                     BLOCK_SIZE_TYPE bsize) {
  struct build_inter_predictors_args args = {
    xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
#if CONFIG_ALPHA
    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf,
     xd->plane[3].dst.buf},
    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride,
     xd->plane[3].dst.stride},
    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf,
      xd->plane[3].pre[0].buf},
     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf,
      xd->plane[3].pre[1].buf}},
    {{0, xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride,
      xd->plane[3].pre[0].stride},
     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride,
      xd->plane[3].pre[1].stride}},
#else
    {NULL, xd->plane[1].dst.buf, xd->plane[2].dst.buf},
    {0, xd->plane[1].dst.stride, xd->plane[1].dst.stride},
    {{NULL, xd->plane[1].pre[0].buf, xd->plane[2].pre[0].buf},
     {NULL, xd->plane[1].pre[1].buf, xd->plane[2].pre[1].buf}},
    {{0, 
      xd->plane[1].pre[0].stride, xd->plane[1].pre[0].stride},
     {0, xd->plane[1].pre[1].stride, xd->plane[1].pre[1].stride}},
#endif
  };
  foreach_predicted_block_uv(xd, bsize, build_inter_predictors, &args);
}
// Builds both luma and chroma inter predictors for the block.
void vp9_build_inter_predictors_sb(MACROBLOCKD *xd,
                                   int mi_row, int mi_col,
                                   BLOCK_SIZE_TYPE bsize) {

  vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
  vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize);
}

/*encoder only*/
// Chroma-only prediction for a 16x16 macroblock.
void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd,
                                        int mb_row, int mb_col) {
  vp9_build_inter_predictors_sbuv(xd, mb_row, mb_col,
                                  BLOCK_SIZE_MB16X16);
}

// TODO(dkovalev: find better place for this function)
// Refreshes the scale factors for active reference |i| from its frame
// buffer's cropped dimensions; an out-of-range buffer index zeroes the
// scale-factor struct instead.
void vp9_setup_scale_factors(VP9_COMMON *cm, int i) {
  const int ref = cm->active_ref_idx[i];
  struct scale_factors *const sf = &cm->active_ref_scale[i];
  if (ref >= NUM_YV12_BUFFERS) {
    memset(sf, 0, sizeof(*sf));
  } else {
    YV12_BUFFER_CONFIG *const fb = &cm->yv12_fb[ref];
    vp9_setup_scale_factors_for_frame(sf,
                                      fb->y_crop_width, fb->y_crop_height,
                                      cm->width, cm->height);
  }
}

diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h
new file mode 100644
index 000000000..4e521850d
--- /dev/null
+++ b/libvpx/vp9/common/vp9_reconinter.h
@@ -0,0 +1,130 @@
/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
+ */ + +#ifndef VP9_COMMON_VP9_RECONINTER_H_ +#define VP9_COMMON_VP9_RECONINTER_H_ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_onyxc_int.h" + +struct subpix_fn_table; +void vp9_build_inter_predictors_sby(MACROBLOCKD *xd, + int mb_row, + int mb_col, + BLOCK_SIZE_TYPE bsize); + +void vp9_build_inter_predictors_sbuv(MACROBLOCKD *xd, + int mb_row, + int mb_col, + BLOCK_SIZE_TYPE bsize); + +void vp9_build_inter_predictors_sb(MACROBLOCKD *mb, + int mb_row, int mb_col, + BLOCK_SIZE_TYPE bsize); + +void vp9_setup_interp_filters(MACROBLOCKD *xd, + INTERPOLATIONFILTERTYPE filter, + VP9_COMMON *cm); + +void vp9_setup_scale_factors_for_frame(struct scale_factors *scale, + int other_w, int other_h, + int this_w, int this_h); + +void vp9_build_inter_predictor(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int_mv *mv_q3, + const struct scale_factors *scale, + int w, int h, int do_avg, + const struct subpix_fn_table *subpix); + +void vp9_build_inter_predictor_q4(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int_mv *mv_q4, + const struct scale_factors *scale, + int w, int h, int do_avg, + const struct subpix_fn_table *subpix); + +static int scaled_buffer_offset(int x_offset, int y_offset, int stride, + const struct scale_factors *scale) { + const int x = scale ? scale->scale_value_x(x_offset, scale) : x_offset; + const int y = scale ? 
scale->scale_value_y(y_offset, scale) : y_offset; + return y * stride + x; +} + +static void setup_pred_plane(struct buf_2d *dst, + uint8_t *src, int stride, + int mi_row, int mi_col, + const struct scale_factors *scale, + int subsampling_x, int subsampling_y) { + const int x = (MI_SIZE * mi_col) >> subsampling_x; + const int y = (MI_SIZE * mi_row) >> subsampling_y; + dst->buf = src + scaled_buffer_offset(x, y, stride, scale); + dst->stride = stride; +} + +// TODO(jkoleszar): audit all uses of this that don't set mb_row, mb_col +static void setup_dst_planes(MACROBLOCKD *xd, + const YV12_BUFFER_CONFIG *src, + int mi_row, int mi_col) { + uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + int i; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + struct macroblockd_plane *pd = &xd->plane[i]; + setup_pred_plane(&pd->dst, buffers[i], strides[i], mi_row, mi_col, NULL, + pd->subsampling_x, pd->subsampling_y); + } +} + +static void setup_pre_planes(MACROBLOCKD *xd, + const YV12_BUFFER_CONFIG *src0, + const YV12_BUFFER_CONFIG *src1, + int mi_row, int mi_col, + const struct scale_factors *scale, + const struct scale_factors *scale_uv) { + const YV12_BUFFER_CONFIG *srcs[2] = {src0, src1}; + int i, j; + + for (i = 0; i < 2; ++i) { + const YV12_BUFFER_CONFIG *src = srcs[i]; + if (src) { + uint8_t* buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, + src->alpha_buffer}; + int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, + src->alpha_stride}; + + for (j = 0; j < MAX_MB_PLANE; ++j) { + struct macroblockd_plane *pd = &xd->plane[j]; + const struct scale_factors *sf = j ? scale_uv : scale; + setup_pred_plane(&pd->pre[i], + buffers[j], strides[j], + mi_row, mi_col, sf ? 
&sf[i] : NULL, + pd->subsampling_x, pd->subsampling_y); + } + } + } +} + +static void set_scale_factors(MACROBLOCKD *xd, + int ref0, int ref1, + struct scale_factors scale_factor[MAX_REF_FRAMES]) { + + xd->scale_factor[0] = scale_factor[ref0 >= 0 ? ref0 : 0]; + xd->scale_factor[1] = scale_factor[ref1 >= 0 ? ref1 : 0]; + xd->scale_factor_uv[0] = xd->scale_factor[0]; + xd->scale_factor_uv[1] = xd->scale_factor[1]; +} + +void vp9_setup_scale_factors(VP9_COMMON *cm, int i); + +#endif // VP9_COMMON_VP9_RECONINTER_H_ diff --git a/libvpx/vp9/common/vp9_reconintra.c b/libvpx/vp9/common/vp9_reconintra.c new file mode 100644 index 000000000..85dfe5137 --- /dev/null +++ b/libvpx/vp9/common/vp9_reconintra.c @@ -0,0 +1,357 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <stdio.h> + +#include "./vpx_config.h" +#include "vp9_rtcd.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_onyxc_int.h" +#include "vpx_mem/vpx_mem.h" + +static void d27_predictor(uint8_t *ypred_ptr, int y_stride, + int bw, int bh, + uint8_t *yabove_row, uint8_t *yleft_col) { + int r, c; + // first column + for (r = 0; r < bh - 1; ++r) { + ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] + + yleft_col[r + 1], 1); + } + ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1]; + ypred_ptr++; + // second column + for (r = 0; r < bh - 2; ++r) { + ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r] + + yleft_col[r + 1] * 2 + + yleft_col[r + 2], 2); + } + ypred_ptr[(bh - 2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[bh - 2] + + yleft_col[bh - 1] * 3, + 2); + ypred_ptr[(bh - 1) * y_stride] = yleft_col[bh-1]; + ypred_ptr++; + + // rest of last row + for (c = 0; c < bw - 2; ++c) { + ypred_ptr[(bh - 1) * y_stride + c] = yleft_col[bh-1]; + } + + for (r = bh - 2; r >= 0; --r) { + for (c = 0; c < bw - 2; ++c) { + ypred_ptr[r * y_stride + c] = ypred_ptr[(r + 1) * y_stride + c - 2]; + } + } +} + +static void d63_predictor(uint8_t *ypred_ptr, int y_stride, + int bw, int bh, + uint8_t *yabove_row, uint8_t *yleft_col) { + int r, c; + for (r = 0; r < bh; ++r) { + for (c = 0; c < bw; ++c) { + if (r & 1) { + ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r/2 + c] + + yabove_row[r/2 + c + 1] * 2 + + yabove_row[r/2 + c + 2], 2); + } else { + ypred_ptr[c] =ROUND_POWER_OF_TWO(yabove_row[r/2 + c] + + yabove_row[r/2+ c + 1], 1); + } + } + ypred_ptr += y_stride; + } +} + +static void d45_predictor(uint8_t *ypred_ptr, int y_stride, + int bw, int bh, + uint8_t *yabove_row, uint8_t *yleft_col) { + int r, c; + for (r = 0; r < bh; ++r) { + for (c = 0; c < bw; ++c) { + if (r + c + 2 < bw * 2) + ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[r + c] + + yabove_row[r + c + 1] * 2 + + yabove_row[r + c + 2], 2); + else + ypred_ptr[c] = yabove_row[bw * 2 - 1]; + } 
+ ypred_ptr += y_stride; + } +} + +static void d117_predictor(uint8_t *ypred_ptr, int y_stride, + int bw, int bh, + uint8_t *yabove_row, uint8_t *yleft_col) { + int r, c; + // first row + for (c = 0; c < bw; c++) + ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + yabove_row[c], 1); + ypred_ptr += y_stride; + + // second row + ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] + + yabove_row[-1] * 2 + + yabove_row[0], 2); + for (c = 1; c < bw; c++) + ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] + + yabove_row[c - 1] * 2 + + yabove_row[c], 2); + ypred_ptr += y_stride; + + // the rest of first col + ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + + yleft_col[0] * 2 + + yleft_col[1], 2); + for (r = 3; r < bh; ++r) + ypred_ptr[(r-2) * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 3] + + yleft_col[r - 2] * 2 + + yleft_col[r - 1], 2); + // the rest of the block + for (r = 2; r < bh; ++r) { + for (c = 1; c < bw; c++) + ypred_ptr[c] = ypred_ptr[-2 * y_stride + c - 1]; + ypred_ptr += y_stride; + } +} + + +static void d135_predictor(uint8_t *ypred_ptr, int y_stride, + int bw, int bh, + uint8_t *yabove_row, uint8_t *yleft_col) { + int r, c; + ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] + + yabove_row[-1] * 2 + + yabove_row[0], 2); + for (c = 1; c < bw; c++) + ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 2] + + yabove_row[c - 1] * 2 + + yabove_row[c], 2); + + ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] + + yleft_col[0] * 2 + + yleft_col[1], 2); + for (r = 2; r < bh; ++r) + ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] + + yleft_col[r - 1] * 2 + + yleft_col[r], 2); + + ypred_ptr += y_stride; + for (r = 1; r < bh; ++r) { + for (c = 1; c < bw; c++) + ypred_ptr[c] = ypred_ptr[-y_stride + c - 1]; + ypred_ptr += y_stride; + } +} + +static void d153_predictor(uint8_t *ypred_ptr, + int y_stride, + int bw, int bh, + uint8_t *yabove_row, + uint8_t *yleft_col) { + int r, c; + ypred_ptr[0] = ROUND_POWER_OF_TWO(yabove_row[-1] + yleft_col[0], 1); 
+ for (r = 1; r < bh; r++) + ypred_ptr[r * y_stride] = + ROUND_POWER_OF_TWO(yleft_col[r - 1] + yleft_col[r], 1); + ypred_ptr++; + + ypred_ptr[0] = ROUND_POWER_OF_TWO(yleft_col[0] + + yabove_row[-1] * 2 + + yabove_row[0], 2); + ypred_ptr[y_stride] = ROUND_POWER_OF_TWO(yabove_row[-1] + + yleft_col[0] * 2 + + yleft_col[1], 2); + for (r = 2; r < bh; r++) + ypred_ptr[r * y_stride] = ROUND_POWER_OF_TWO(yleft_col[r - 2] + + yleft_col[r - 1] * 2 + + yleft_col[r], 2); + ypred_ptr++; + + for (c = 0; c < bw - 2; c++) + ypred_ptr[c] = ROUND_POWER_OF_TWO(yabove_row[c - 1] + + yabove_row[c] * 2 + + yabove_row[c + 1], 2); + ypred_ptr += y_stride; + for (r = 1; r < bh; ++r) { + for (c = 0; c < bw - 2; c++) + ypred_ptr[c] = ypred_ptr[-y_stride + c - 2]; + ypred_ptr += y_stride; + } +} + +void vp9_build_intra_predictors(uint8_t *src, int src_stride, + uint8_t *ypred_ptr, + int y_stride, int mode, + int bw, int bh, + int up_available, int left_available, + int right_available) { + int r, c, i; + uint8_t yleft_col[64], yabove_data[129], ytop_left; + uint8_t *yabove_row = yabove_data + 1; + + // 127 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + // .. + + assert(bw == bh); + + if (left_available) { + for (i = 0; i < bh; i++) + yleft_col[i] = src[i * src_stride - 1]; + } else { + vpx_memset(yleft_col, 129, bh); + } + + if (up_available) { + uint8_t *yabove_ptr = src - src_stride; + vpx_memcpy(yabove_row, yabove_ptr, bw); + if (bw == 4 && right_available) + vpx_memcpy(yabove_row + bw, yabove_ptr + bw, bw); + else + vpx_memset(yabove_row + bw, yabove_row[bw -1], bw); + ytop_left = left_available ? 
yabove_ptr[-1] : 129; + } else { + vpx_memset(yabove_row, 127, bw * 2); + ytop_left = 127; + } + yabove_row[-1] = ytop_left; + + switch (mode) { + case DC_PRED: { + int i; + int expected_dc = 128; + int average = 0; + int count = 0; + + if (up_available || left_available) { + if (up_available) { + for (i = 0; i < bw; i++) + average += yabove_row[i]; + count += bw; + } + if (left_available) { + for (i = 0; i < bh; i++) + average += yleft_col[i]; + count += bh; + } + expected_dc = (average + (count >> 1)) / count; + } + for (r = 0; r < bh; r++) { + vpx_memset(ypred_ptr, expected_dc, bw); + ypred_ptr += y_stride; + } + } + break; + case V_PRED: + for (r = 0; r < bh; r++) { + vpx_memcpy(ypred_ptr, yabove_row, bw); + ypred_ptr += y_stride; + } + break; + case H_PRED: + for (r = 0; r < bh; r++) { + vpx_memset(ypred_ptr, yleft_col[r], bw); + ypred_ptr += y_stride; + } + break; + case TM_PRED: + for (r = 0; r < bh; r++) { + for (c = 0; c < bw; c++) + ypred_ptr[c] = clip_pixel(yleft_col[r] + yabove_row[c] - ytop_left); + ypred_ptr += y_stride; + } + break; + case D45_PRED: + d45_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); + break; + case D135_PRED: + d135_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); + break; + case D117_PRED: + d117_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); + break; + case D153_PRED: + d153_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); + break; + case D27_PRED: + d27_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); + break; + case D63_PRED: + d63_predictor(ypred_ptr, y_stride, bw, bh, yabove_row, yleft_col); + break; + default: + break; + } +} + +void vp9_build_intra_predictors_sby_s(MACROBLOCKD *xd, + BLOCK_SIZE_TYPE bsize) { + const struct macroblockd_plane* const pd = &xd->plane[0]; + const int bw = plane_block_width(bsize, pd); + const int bh = plane_block_height(bsize, pd); + vp9_build_intra_predictors(pd->dst.buf, pd->dst.stride, + pd->dst.buf, pd->dst.stride, + 
xd->mode_info_context->mbmi.mode, + bw, bh, xd->up_available, xd->left_available, + 0 /*xd->right_available*/); +} + +void vp9_build_intra_predictors_sbuv_s(MACROBLOCKD *xd, + BLOCK_SIZE_TYPE bsize) { + const int bwl = b_width_log2(bsize), bw = 2 << bwl; + const int bhl = b_height_log2(bsize), bh = 2 << bhl; + + vp9_build_intra_predictors(xd->plane[1].dst.buf, xd->plane[1].dst.stride, + xd->plane[1].dst.buf, xd->plane[1].dst.stride, + xd->mode_info_context->mbmi.uv_mode, + bw, bh, xd->up_available, + xd->left_available, 0 /*xd->right_available*/); + vp9_build_intra_predictors(xd->plane[2].dst.buf, xd->plane[1].dst.stride, + xd->plane[2].dst.buf, xd->plane[1].dst.stride, + xd->mode_info_context->mbmi.uv_mode, + bw, bh, xd->up_available, + xd->left_available, 0 /*xd->right_available*/); +} + +void vp9_predict_intra_block(MACROBLOCKD *xd, + int block_idx, + int bwl_in, + TX_SIZE tx_size, + int mode, + uint8_t *predictor, int pre_stride) { + const int bwl = bwl_in - tx_size; + const int wmask = (1 << bwl) - 1; + const int have_top = (block_idx >> bwl) || xd->up_available; + const int have_left = (block_idx & wmask) || xd->left_available; + const int have_right = ((block_idx & wmask) != wmask); + const int txfm_block_size = 4 << tx_size; + + assert(bwl >= 0); + vp9_build_intra_predictors(predictor, pre_stride, + predictor, pre_stride, + mode, + txfm_block_size, + txfm_block_size, + have_top, have_left, + have_right); +} + +void vp9_intra4x4_predict(MACROBLOCKD *xd, + int block_idx, + BLOCK_SIZE_TYPE bsize, + int mode, + uint8_t *predictor, int pre_stride) { + vp9_predict_intra_block(xd, block_idx, b_width_log2(bsize), TX_4X4, + mode, predictor, pre_stride); +} diff --git a/libvpx/vp9/common/vp9_reconintra.h b/libvpx/vp9/common/vp9_reconintra.h new file mode 100644 index 000000000..f5f5f42c4 --- /dev/null +++ b/libvpx/vp9/common/vp9_reconintra.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_RECONINTRA_H_ +#define VP9_COMMON_VP9_RECONINTRA_H_ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_blockd.h" + +MB_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, + int stride, int n, + int tx, int ty); + +MB_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, int block, + uint8_t *ptr, int stride); + +void vp9_predict_intra_block(MACROBLOCKD *xd, + int block_idx, + int bwl_in, + TX_SIZE tx_size, + int mode, + uint8_t *predictor, int pre_stride); +#endif // VP9_COMMON_VP9_RECONINTRA_H_ diff --git a/libvpx/vp9/common/vp9_rtcd.c b/libvpx/vp9/common/vp9_rtcd.c new file mode 100644 index 000000000..72613ae07 --- /dev/null +++ b/libvpx/vp9/common/vp9_rtcd.c @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ +#include "vpx_config.h" +#define RTCD_C +#include "vp9_rtcd.h" +#include "vpx_ports/vpx_once.h" + +void vpx_scale_rtcd(void); + +void vp9_rtcd() { + vpx_scale_rtcd(); + once(setup_rtcd_internal); +} diff --git a/libvpx/vp9/common/vp9_rtcd_defs.sh b/libvpx/vp9/common/vp9_rtcd_defs.sh new file mode 100644 index 000000000..a405aab8d --- /dev/null +++ b/libvpx/vp9/common/vp9_rtcd_defs.sh @@ -0,0 +1,611 @@ +vp9_common_forward_decls() { +cat <<EOF +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_enums.h" + +struct loop_filter_info; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct macroblock; +struct vp9_variance_vtable; + +#define DEC_MVCOSTS int *mvjcost, int *mvcost[2] +union int_mv; +struct yv12_buffer_config; +EOF +} +forward_decls vp9_common_forward_decls + +# +# Dequant +# + +prototype void vp9_idct_add_16x16 "int16_t *input, uint8_t *dest, int stride, int eob" +specialize vp9_idct_add_16x16 + +prototype void vp9_idct_add_8x8 "int16_t *input, uint8_t *dest, int stride, int eob" +specialize vp9_idct_add_8x8 + +prototype void vp9_idct_add "int16_t *input, uint8_t *dest, int stride, int eob" +specialize vp9_idct_add + + + +prototype void vp9_idct_add_32x32 "int16_t *q, uint8_t *dst, int stride, int eob" +specialize vp9_idct_add_32x32 + +# +# RECON +# +prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +specialize vp9_copy_mem16x16 mmx sse2 dspr2 +vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2 + +prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +specialize vp9_copy_mem8x8 mmx dspr2 +vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2 + +prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch" +specialize vp9_copy_mem8x4 mmx + +prototype void vp9_build_intra_predictors "uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int 
left_available, int right_available" +specialize void vp9_build_intra_predictors + +prototype void vp9_build_intra_predictors_sby_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize" +specialize vp9_build_intra_predictors_sby_s + +prototype void vp9_build_intra_predictors_sbuv_s "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize" +specialize vp9_build_intra_predictors_sbuv_s + +prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLOCK_SIZE_TYPE bsize, int b_mode, uint8_t *predictor, int pre_stride" +specialize vp9_intra4x4_predict; + +if [ "$CONFIG_VP9_DECODER" = "yes" ]; then +prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride" +specialize vp9_add_constant_residual_8x8 sse2 + +prototype void vp9_add_constant_residual_16x16 "const int16_t diff, uint8_t *dest, int stride" +specialize vp9_add_constant_residual_16x16 sse2 + +prototype void vp9_add_constant_residual_32x32 "const int16_t diff, uint8_t *dest, int stride" +specialize vp9_add_constant_residual_32x32 sse2 +fi + +# +# Loopfilter +# +prototype void vp9_mb_lpf_vertical_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh" +specialize vp9_mb_lpf_vertical_edge_w sse2 + +prototype void vp9_mbloop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" +specialize vp9_mbloop_filter_vertical_edge sse2 + +prototype void vp9_loop_filter_vertical_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" +specialize vp9_loop_filter_vertical_edge mmx + +prototype void vp9_mb_lpf_horizontal_edge_w "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh" +specialize vp9_mb_lpf_horizontal_edge_w sse2 + +prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" 
+specialize vp9_mbloop_filter_horizontal_edge sse2 + +prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" +specialize vp9_loop_filter_horizontal_edge mmx + +# +# post proc +# +if [ "$CONFIG_POSTPROC" = "yes" ]; then +prototype void vp9_mbpost_proc_down "uint8_t *dst, int pitch, int rows, int cols, int flimit" +specialize vp9_mbpost_proc_down mmx sse2 +vp9_mbpost_proc_down_sse2=vp9_mbpost_proc_down_xmm + +prototype void vp9_mbpost_proc_across_ip "uint8_t *src, int pitch, int rows, int cols, int flimit" +specialize vp9_mbpost_proc_across_ip sse2 +vp9_mbpost_proc_across_ip_sse2=vp9_mbpost_proc_across_ip_xmm + +prototype void vp9_post_proc_down_and_across "const uint8_t *src_ptr, uint8_t *dst_ptr, int src_pixels_per_line, int dst_pixels_per_line, int rows, int cols, int flimit" +specialize vp9_post_proc_down_and_across mmx sse2 +vp9_post_proc_down_and_across_sse2=vp9_post_proc_down_and_across_xmm + +prototype void vp9_plane_add_noise "uint8_t *Start, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], unsigned int Width, unsigned int Height, int Pitch" +specialize vp9_plane_add_noise mmx sse2 +vp9_plane_add_noise_sse2=vp9_plane_add_noise_wmt +fi + +prototype void vp9_blend_mb_inner "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride" +specialize vp9_blend_mb_inner + +prototype void vp9_blend_mb_outer "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride" +specialize vp9_blend_mb_outer + +prototype void vp9_blend_b "uint8_t *y, uint8_t *u, uint8_t *v, int y1, int u1, int v1, int alpha, int stride" +specialize vp9_blend_b + +# +# Sub Pixel Filters +# +prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8 ssse3 + +prototype void 
vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_horiz ssse3 + +prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_vert ssse3 + +prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg ssse3 + +prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg_horiz ssse3 + +prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" +specialize vp9_convolve8_avg_vert ssse3 + +# +# dct +# +prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct4x4_1_add + +prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct4x4_add sse2 + +prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct8x8_add sse2 + +prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct10_8x8_add sse2 + +prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output" +specialize vp9_short_idct1_8x8 + +prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct16x16_add sse2 + +prototype void vp9_short_idct10_16x16_add "int16_t 
*input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct10_16x16_add sse2 + +prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output" +specialize vp9_short_idct1_16x16 + +prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct32x32_add sse2 + +prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" +specialize vp9_short_idct1_32x32 + +prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct10_32x32_add + +prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" +specialize vp9_short_iht4x4_add + +prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type" +specialize vp9_short_iht8x8_add + +prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type" +specialize vp9_short_iht16x16_add + +prototype void vp9_idct4_1d "int16_t *input, int16_t *output" +specialize vp9_idct4_1d sse2 +# dct and add + +prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" +specialize vp9_dc_only_idct_add sse2 + +prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_iwalsh4x4_1_add + +prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_iwalsh4x4_add + +prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad" +specialize vp9_sad32x3 + +prototype unsigned int vp9_sad3x32 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad" +specialize vp9_sad3x32 + +# +# Encoder functions below this point. 
+# +if [ "$CONFIG_VP9_ENCODER" = "yes" ]; then + + +# variance +[ $arch = "x86_64" ] && mmx_x86_64=mmx && sse2_x86_64=sse2 + +prototype unsigned int vp9_variance32x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance32x16 sse2 + +prototype unsigned int vp9_variance16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance16x32 sse2 + +prototype unsigned int vp9_variance64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance64x32 sse2 + +prototype unsigned int vp9_variance32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance32x64 sse2 + +prototype unsigned int vp9_variance32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance32x32 sse2 + +prototype unsigned int vp9_variance64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance64x64 sse2 + +prototype unsigned int vp9_variance16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance16x16 mmx sse2 + +prototype unsigned int vp9_variance16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance16x8 mmx sse2 + +prototype unsigned int vp9_variance8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance8x16 mmx sse2 + +prototype unsigned int vp9_variance8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance8x8 mmx sse2 + +prototype void vp9_get_sse_sum_8x8 "const 
uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum" +specialize vp9_get_sse_sum_8x8 sse2 +vp9_get_sse_sum_8x8_sse2=vp9_get8x8var_sse2 + +prototype unsigned int vp9_variance8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance8x4 sse2 + +prototype unsigned int vp9_variance4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance4x8 sse2 + +prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance4x4 mmx sse2 + +prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance64x64 sse2 + +prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance64x64 + +prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance32x64 + +prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance32x64 + +prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance64x32 + +prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t 
*src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance64x32 + +prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance32x16 + +prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance32x16 + +prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance16x32 + +prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance16x32 + +prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance32x32 sse2 + +prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance32x32 + +prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3 + +prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int 
yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance16x16 + +prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance8x16 sse2 mmx +vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt + +prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance8x16 + +prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3 +vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3; +vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt + +prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance16x8 + +prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance8x8 sse2 mmx +vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt + +prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance8x8 + +# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form +prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t 
*src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance8x4 + +prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance8x4 + +prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance4x8 + +prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance4x8 + +prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_variance4x4 sse2 mmx +vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt + +prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance4x4 + +prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad64x64 sse2 + +prototype unsigned int vp9_sad32x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad32x64 sse2 + +prototype unsigned int vp9_sad64x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad64x32 sse2 + +prototype unsigned int vp9_sad32x16 "const 
uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad32x16 sse2 + +prototype unsigned int vp9_sad16x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad16x32 sse2 + +prototype unsigned int vp9_sad32x32 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad32x32 sse2 + +prototype unsigned int vp9_sad16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad16x16 mmx sse2 + +prototype unsigned int vp9_sad16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad16x8 mmx sse2 + +prototype unsigned int vp9_sad8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad8x16 mmx sse2 + +prototype unsigned int vp9_sad8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad8x8 mmx sse2 + +# TODO(jingning): need to covert these functions into mmx/sse2 form +prototype unsigned int vp9_sad8x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad8x4 sse2 + +prototype unsigned int vp9_sad4x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad4x8 sse + +prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" +specialize vp9_sad4x4 mmx sse + +prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance_halfpixvar16x16_h mmx sse2 
+vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt + +prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance_halfpixvar16x16_v mmx sse2 +vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt + +prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance_halfpixvar16x16_hv mmx sse2 +vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt + +prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance_halfpixvar64x64_h + +prototype unsigned int vp9_variance_halfpixvar64x64_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance_halfpixvar64x64_v + +prototype unsigned int vp9_variance_halfpixvar64x64_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance_halfpixvar64x64_hv + +prototype unsigned int vp9_variance_halfpixvar32x32_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance_halfpixvar32x32_h + +prototype unsigned int vp9_variance_halfpixvar32x32_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance_halfpixvar32x32_v + +prototype unsigned int vp9_variance_halfpixvar32x32_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_variance_halfpixvar32x32_hv + +prototype void vp9_sad64x64x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int 
*sad_array" +specialize vp9_sad64x64x3 + +prototype void vp9_sad32x32x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp9_sad32x32x3 + +prototype void vp9_sad16x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp9_sad16x16x3 sse3 ssse3 + +prototype void vp9_sad16x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp9_sad16x8x3 sse3 ssse3 + +prototype void vp9_sad8x16x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp9_sad8x16x3 sse3 + +prototype void vp9_sad8x8x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp9_sad8x8x3 sse3 + +prototype void vp9_sad4x4x3 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array" +specialize vp9_sad4x4x3 sse3 + +prototype void vp9_sad64x64x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" +specialize vp9_sad64x64x8 + +prototype void vp9_sad32x32x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" +specialize vp9_sad32x32x8 + +prototype void vp9_sad16x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" +specialize vp9_sad16x16x8 sse4 + +prototype void vp9_sad16x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" +specialize vp9_sad16x8x8 sse4 + +prototype void vp9_sad8x16x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" +specialize vp9_sad8x16x8 sse4 + +prototype void vp9_sad8x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sad_array" +specialize vp9_sad8x8x8 sse4 + +prototype void vp9_sad8x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" +specialize vp9_sad8x4x8 + +prototype void vp9_sad4x8x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" +specialize vp9_sad4x8x8 + +prototype void vp9_sad4x4x8 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array" +specialize vp9_sad4x4x8 sse4 + +prototype void vp9_sad64x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad64x64x4d sse2 + +prototype void vp9_sad32x64x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad32x64x4d sse2 + +prototype void vp9_sad64x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad64x32x4d sse2 + +prototype void vp9_sad32x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad32x16x4d sse2 + +prototype void vp9_sad16x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad16x32x4d sse2 + +prototype void vp9_sad32x32x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad32x32x4d sse2 + +prototype void vp9_sad16x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad16x16x4d sse2 + +prototype void vp9_sad16x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad16x8x4d sse2 + 
+prototype void vp9_sad8x16x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad8x16x4d sse2 + +prototype void vp9_sad8x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad8x8x4d sse2 + +# TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form +prototype void vp9_sad8x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad8x4x4d sse2 + +prototype void vp9_sad4x8x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad4x8x4d sse + +prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" +specialize vp9_sad4x4x4d sse + +prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" +specialize vp9_sub_pixel_mse16x16 sse2 mmx + +prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" +specialize vp9_mse16x16 mmx sse2 +vp9_mse16x16_sse2=vp9_mse16x16_wmt + +prototype unsigned int vp9_mse8x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" +specialize vp9_mse8x16 + +prototype unsigned int vp9_mse16x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" +specialize vp9_mse16x8 + +prototype unsigned int vp9_mse8x8 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" +specialize vp9_mse8x8 + +prototype unsigned int vp9_sub_pixel_mse64x64 "const uint8_t *src_ptr, int source_stride, 
int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_mse64x64 + +prototype unsigned int vp9_sub_pixel_mse32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" +specialize vp9_sub_pixel_mse32x32 + +prototype unsigned int vp9_get_mb_ss "const int16_t *" +specialize vp9_get_mb_ss mmx sse2 +# ENCODEMB INVOKE + +prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size" +specialize vp9_block_error mmx sse2 +vp9_block_error_sse2=vp9_block_error_xmm + +# +# Structured Similarity (SSIM) +# +if [ "$CONFIG_INTERNAL_STATS" = "yes" ]; then + [ $arch = "x86_64" ] && sse2_on_x86_64=sse2 + + prototype void vp9_ssim_parms_8x8 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" + specialize vp9_ssim_parms_8x8 $sse2_on_x86_64 + + prototype void vp9_ssim_parms_16x16 "uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, unsigned long *sum_sq_r, unsigned long *sum_sxr" + specialize vp9_ssim_parms_16x16 $sse2_on_x86_64 +fi + +# fdct functions +prototype void vp9_short_fht4x4 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +specialize vp9_short_fht4x4 + +prototype void vp9_short_fht8x8 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +specialize vp9_short_fht8x8 + +prototype void vp9_short_fht16x16 "int16_t *InputData, int16_t *OutputData, int pitch, int tx_type" +specialize vp9_short_fht16x16 + +prototype void vp9_short_fdct8x8 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_fdct8x8 sse2 + +prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_fdct4x4 sse2 + +prototype void vp9_short_fdct8x4 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize 
vp9_short_fdct8x4 sse2 + +prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_fdct32x32 + +prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_fdct32x32_rd + +prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_fdct16x16 sse2 + +prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_walsh4x4 + +prototype void vp9_short_walsh8x4 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_walsh8x4 + +# +# Motion search +# +prototype int vp9_full_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv, int n" +specialize vp9_full_search_sad sse3 sse4_1 +vp9_full_search_sad_sse3=vp9_full_search_sadx3 +vp9_full_search_sad_sse4_1=vp9_full_search_sadx8 + +prototype int vp9_refining_search_sad "struct macroblock *x, union int_mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv" +specialize vp9_refining_search_sad sse3 +vp9_refining_search_sad_sse3=vp9_refining_search_sadx4 + +prototype int vp9_diamond_search_sad "struct macroblock *x, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, union int_mv *center_mv" +specialize vp9_diamond_search_sad sse3 +vp9_diamond_search_sad_sse3=vp9_diamond_search_sadx4 + +prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count" +specialize vp9_temporal_filter_apply sse2 + +prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction" 
+specialize vp9_yv12_copy_partial_frame
+
+
+fi
+# end encoder functions
diff --git a/libvpx/vp9/common/vp9_sadmxn.h b/libvpx/vp9/common/vp9_sadmxn.h
new file mode 100644
index 000000000..b2dfd63f9
--- /dev/null
+++ b/libvpx/vp9/common/vp9_sadmxn.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_SADMXN_H_
+#define VP9_COMMON_VP9_SADMXN_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+// Reference C implementation of the sum of absolute differences (SAD)
+// between an m-wide by n-high block at src_ptr and one at ref_ptr.
+// NOTE(review): uses abs() but this header does not include <stdlib.h>;
+// presumably it is pulled in transitively by every includer -- TODO confirm.
+static INLINE unsigned int sad_mx_n_c(const uint8_t *src_ptr,
+                                      int src_stride,
+                                      const uint8_t *ref_ptr,
+                                      int ref_stride,
+                                      int m,
+                                      int n) {
+  int r, c;
+  unsigned int sad = 0;
+
+  // Accumulate |src - ref| over the m x n block, stepping both pointers
+  // one row at a time by their respective strides.
+  for (r = 0; r < n; r++) {
+    for (c = 0; c < m; c++) {
+      sad += abs(src_ptr[c] - ref_ptr[c]);
+    }
+
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+  }
+
+  return sad;
+}
+
+#endif // VP9_COMMON_VP9_SADMXN_H_
diff --git a/libvpx/vp9/common/vp9_seg_common.c b/libvpx/vp9/common/vp9_seg_common.c
new file mode 100644
index 000000000..df7747c90
--- /dev/null
+++ b/libvpx/vp9/common/vp9_seg_common.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_seg_common.h" + +static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 0, 0 }; +static const int seg_feature_data_max[SEG_LVL_MAX] = { MAXQ, 63, 3, 0 }; + +// These functions provide access to new segment level features. +// Eventually these function may be "optimized out" but for the moment, +// the coding mechanism is still subject to change so these provide a +// convenient single point of change. + +int vp9_segfeature_active(const MACROBLOCKD *xd, int segment_id, + SEG_LVL_FEATURES feature_id) { + return xd->segmentation_enabled && + (xd->segment_feature_mask[segment_id] & (1 << feature_id)); +} + +void vp9_clearall_segfeatures(MACROBLOCKD *xd) { + vpx_memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); + vpx_memset(xd->segment_feature_mask, 0, sizeof(xd->segment_feature_mask)); +} + +void vp9_enable_segfeature(MACROBLOCKD *xd, int segment_id, + SEG_LVL_FEATURES feature_id) { + xd->segment_feature_mask[segment_id] |= 1 << feature_id; +} + +void vp9_disable_segfeature(MACROBLOCKD *xd, int segment_id, + SEG_LVL_FEATURES feature_id) { + xd->segment_feature_mask[segment_id] &= ~(1 << feature_id); +} + +int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id) { + return seg_feature_data_max[feature_id]; +} + +int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id) { + return seg_feature_data_signed[feature_id]; +} + +void vp9_clear_segdata(MACROBLOCKD *xd, int segment_id, + SEG_LVL_FEATURES feature_id) { + xd->segment_feature_data[segment_id][feature_id] = 0; +} + +void vp9_set_segdata(MACROBLOCKD *xd, int segment_id, + SEG_LVL_FEATURES feature_id, int seg_data) { + assert(seg_data <= seg_feature_data_max[feature_id]); + if (seg_data < 0) { + assert(seg_feature_data_signed[feature_id]); + assert(-seg_data <= seg_feature_data_max[feature_id]); + } + + xd->segment_feature_data[segment_id][feature_id] = seg_data; +} + +int vp9_get_segdata(const 
MACROBLOCKD *xd, int segment_id, + SEG_LVL_FEATURES feature_id) { + return xd->segment_feature_data[segment_id][feature_id]; +} + + +const vp9_tree_index vp9_segment_tree[14] = { + 2, 4, 6, 8, 10, 12, + 0, -1, -2, -3, -4, -5, -6, -7 +}; + + +// TBD? Functions to read and write segment data with range / validity checking diff --git a/libvpx/vp9/common/vp9_seg_common.h b/libvpx/vp9/common/vp9_seg_common.h new file mode 100644 index 000000000..74ba03c3e --- /dev/null +++ b/libvpx/vp9/common/vp9_seg_common.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_blockd.h" + +#ifndef VP9_COMMON_VP9_SEG_COMMON_H_ +#define VP9_COMMON_VP9_SEG_COMMON_H_ + +int vp9_segfeature_active(const MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id); + +void vp9_clearall_segfeatures(MACROBLOCKD *xd); + +void vp9_enable_segfeature(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id); + +void vp9_disable_segfeature(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id); + +int vp9_seg_feature_data_max(SEG_LVL_FEATURES feature_id); + +int vp9_is_segfeature_signed(SEG_LVL_FEATURES feature_id); + +void vp9_clear_segdata(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id); + +void vp9_set_segdata(MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id, + int seg_data); + +int vp9_get_segdata(const MACROBLOCKD *xd, + int segment_id, + SEG_LVL_FEATURES feature_id); + +extern const vp9_tree_index vp9_segment_tree[14]; + +#endif // VP9_COMMON_VP9_SEG_COMMON_H_ + diff --git 
a/libvpx/vp9/common/vp9_subpelvar.h b/libvpx/vp9/common/vp9_subpelvar.h
new file mode 100644
index 000000000..ad674f105
--- /dev/null
+++ b/libvpx/vp9/common/vp9_subpelvar.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_VP9_SUBPELVAR_H_
+#define VP9_COMMON_VP9_SUBPELVAR_H_
+
+#include "vp9/common/vp9_filter.h"
+
+// Computes the sum of differences (*sum) and the sum of squared
+// differences (*sse) between a w x h block at src_ptr and one at ref_ptr;
+// callers derive the variance from these two accumulators.
+static void variance(const uint8_t *src_ptr,
+                     int source_stride,
+                     const uint8_t *ref_ptr,
+                     int recon_stride,
+                     int w,
+                     int h,
+                     unsigned int *sse,
+                     int *sum) {
+  int i, j;
+  int diff;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      diff = src_ptr[j] - ref_ptr[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    src_ptr += source_stride;
+    ref_ptr += recon_stride;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : uint8_t *src_ptr : Pointer to source block.
+ *                  uint32_t src_pixels_per_line : Stride of input block.
+ *                  uint32_t pixel_step : Offset between filter input samples (see notes).
+ *                  uint32_t output_height : Input block height.
+ *                  uint32_t output_width : Input block width.
+ *                  int16_t *vp9_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : uint16_t *output_ptr : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement first-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces uint16_t output to retain precision for next pass.
+ *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
+                                              uint16_t *output_ptr,
+                                              unsigned int src_pixels_per_line,
+                                              int pixel_step,
+                                              unsigned int output_height,
+                                              unsigned int output_width,
+                                              const int16_t *vp9_filter) {
+  unsigned int i, j;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      // Apply bilinear filter
+      output_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
+                       ((int)src_ptr[pixel_step] * vp9_filter[1]) +
+                       (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : uint16_t *src_ptr : Pointer to source block.
+ *                  uint32_t src_pixels_per_line : Stride of input block.
+ *                  uint32_t pixel_step : Offset between filter input samples (see notes).
+ *                  uint32_t output_height : Input block height.
+ *                  uint32_t output_width : Input block width.
+ *                  int16_t *vp9_filter : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : uint8_t *output_ptr : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement second-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires 16-bit input as produced by filter_block2d_bil_first_pass.
+ *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
+                                               uint8_t *output_ptr,
+                                               unsigned int src_pixels_per_line,
+                                               unsigned int pixel_step,
+                                               unsigned int output_height,
+                                               unsigned int output_width,
+                                               const int16_t *vp9_filter) {
+  unsigned int i, j;
+  int Temp;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      // Apply filter
+      Temp = ((int)src_ptr[0] * vp9_filter[0]) +
+             ((int)src_ptr[pixel_step] * vp9_filter[1]) +
+             (VP9_FILTER_WEIGHT / 2);
+      // NOTE(review): the (unsigned int) cast is immediately narrowed to
+      // uint8_t by the assignment; the shifted value is presumed to fit in
+      // 8 bits for valid filter inputs -- TODO confirm.
+      output_ptr[j] = (unsigned int)(Temp >> VP9_FILTER_SHIFT);
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+#endif // VP9_COMMON_VP9_SUBPELVAR_H_
diff --git a/libvpx/vp9/common/vp9_systemdependent.h b/libvpx/vp9/common/vp9_systemdependent.h
new file mode 100644
index 000000000..1b9147ef4
--- /dev/null
+++ b/libvpx/vp9/common/vp9_systemdependent.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
+#define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
+
+#ifdef _MSC_VER
+#include <math.h>
+#endif
+
+#include "./vpx_config.h"
+// On x86 targets vp9_clear_system_state() resets the MMX/FPU state
+// (presumably required after running MMX code, before float math --
+// see vpx_reset_mmx_state); elsewhere it is a no-op.
+#if ARCH_X86 || ARCH_X86_64
+void vpx_reset_mmx_state(void);
+#define vp9_clear_system_state() vpx_reset_mmx_state()
+#else
+#define vp9_clear_system_state()
+#endif
+
+#ifdef _MSC_VER
+// round is not defined in MSVC
+// Rounds half away from zero and truncates the result to int.
+// NOTE(review): declared plain 'static' (not inline) in a header, so every
+// translation unit including this gets its own copy and an unused-function
+// warning when it never calls it; it will also clash if the MSVC <math.h>
+// ever gains a round() declaration -- TODO confirm.
+static int round(double x) {
+  if (x < 0)
+    return (int)ceil(x - 0.5);
+  else
+    return (int)floor(x + 0.5);
+}
+#endif
+
+struct VP9Common;
+// One-time runtime setup: installs the RTCD function pointers (vp9_rtcd).
+void vp9_machine_specific_config(struct VP9Common *);
+
+#endif // VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
diff --git a/libvpx/vp9/common/vp9_tapify.py b/libvpx/vp9/common/vp9_tapify.py
new file mode 100644
index 000000000..99529cff0
--- /dev/null
+++ b/libvpx/vp9/common/vp9_tapify.py
@@ -0,0 +1,106 @@
+"""
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree. 
+""" +#!/usr/bin/env python +import sys,string,os,re,math,numpy +scale = 2**16 +def dist(p1,p2): + x1,y1 = p1 + x2,y2 = p2 + if x1==x2 and y1==y2 : + return 1.0 + return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2)) + +def gettaps(p): + def l(b): + return int(math.floor(b)) + def h(b): + return int(math.ceil(b)) + def t(b,p,s): + return int((scale*dist(b,p)+s/2)/s) + r,c = p + ul=[l(r),l(c)] + ur=[l(r),h(c)] + ll=[h(r),l(c)] + lr=[h(r),h(c)] + sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p) + t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum); + return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)], + [ll,t(ll,p,sum)],[lr,t4]] + +def print_mb_taps(angle,blocksize): + theta = angle / 57.2957795; + affine = [[math.cos(theta),-math.sin(theta)], + [math.sin(theta),math.cos(theta)]] + radius = (float(blocksize)-1)/2 + print " // angle of",angle,"degrees" + for y in range(blocksize) : + for x in range(blocksize) : + r,c = numpy.dot(affine,[y-radius, x-radius]) + tps = gettaps([r+radius,c+radius]) + for t in tps : + p,t = t + tr,tc = p + print " %2d, %2d, %5d, " % (tr,tc,t,), + print " // %2d,%2d " % (y,x) + +i=float(sys.argv[1]) +while i <= float(sys.argv[2]) : + print_mb_taps(i,float(sys.argv[4])) + i=i+float(sys.argv[3]) +""" + +taps = [] +pt=dict() +ptr=dict() +for y in range(16) : + for x in range(16) : + r,c = numpy.dot(affine,[y-7.5, x-7.5]) + tps = gettaps([r+7.5,c+7.5]) + j=0 + for tp in tps : + p,i = tp + r,c = p + pt[y,x,j]= [p,i] + try: + ptr[r,j,c].append([y,x]) + except: + ptr[r,j,c]=[[y,x]] + j = j+1 + +for key in sorted(pt.keys()) : + print key,pt[key] + +lr = -99 +lj = -99 +lc = 0 + +shuf="" +mask="" +for r,j,c in sorted(ptr.keys()) : + for y,x in ptr[r,j,c] : + if lr != r or lj != j : + print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc + shuf="" + lc = 0 + for i in range(lc,c-1) : + shuf = shuf +"0" + shuf = shuf + hex(x)[2] + lc =c + break + lr = r + lj = j +# print r,j,c,ptr[r,j,c] +# print + +for r,j,c in sorted(ptr.keys()) : + for y,x in 
ptr[r,j,c] : + print r,j,c,y,x + break +""" diff --git a/libvpx/vp9/common/vp9_textblit.c b/libvpx/vp9/common/vp9_textblit.c new file mode 100644 index 000000000..60e95e08f --- /dev/null +++ b/libvpx/vp9/common/vp9_textblit.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "vp9/common/vp9_textblit.h" + +static const int font[] = { + 0x0, 0x5C00, 0x8020, 0xAFABEA, 0xD7EC0, 0x1111111, 0x1855740, 0x18000, + 0x45C0, 0x74400, 0x51140, 0x23880, 0xC4000, 0x21080, 0x80000, 0x111110, + 0xE9D72E, 0x87E40, 0x12AD732, 0xAAD62A, 0x4F94C4, 0x4D6B7, 0x456AA, + 0x3E8423, 0xAAD6AA, 0xAAD6A2, 0x2800, 0x2A00, 0x8A880, 0x52940, 0x22A20, + 0x15422, 0x6AD62E, 0x1E4A53E, 0xAAD6BF, 0x8C62E, 0xE8C63F, 0x118D6BF, + 0x1094BF, 0xCAC62E, 0x1F2109F, 0x118FE31, 0xF8C628, 0x8A89F, 0x108421F, + 0x1F1105F, 0x1F4105F, 0xE8C62E, 0x2294BF, 0x164C62E, 0x12694BF, 0x8AD6A2, + 0x10FC21, 0x1F8421F, 0x744107, 0xF8220F, 0x1151151, 0x117041, 0x119D731, + 0x47E0, 0x1041041, 0xFC400, 0x10440, 0x1084210, 0x820 +}; + +static void plot(int x, int y, unsigned char *image, int pitch) { + image[x + y * pitch] ^= 255; +} + +void vp9_blit_text(const char *msg, unsigned char *address, const int pitch) { + int letter_bitmap; + unsigned char *output_pos = address; + int colpos = 0; + + while (msg[colpos] != 0) { + char letter = msg[colpos]; + int fontcol, fontrow; + + if (letter <= 'Z' && letter >= ' ') + letter_bitmap = font[letter - ' ']; + else if (letter <= 'z' && letter >= 'a') + letter_bitmap = font[letter - 'a' + 'A' - ' ']; + else + letter_bitmap = font[0]; + + for (fontcol = 6; fontcol >= 0; fontcol--) 
+ for (fontrow = 0; fontrow < 5; fontrow++) + output_pos[fontrow * pitch + fontcol] = + ((letter_bitmap >> (fontcol * 5)) & (1 << fontrow) ? 255 : 0); + + output_pos += 7; + colpos++; + } +} + + + +/* Bresenham line algorithm */ +void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, + int pitch) { + int steep = abs(y1 - y0) > abs(x1 - x0); + int deltax, deltay; + int error, ystep, y, x; + + if (steep) { + int t; + t = x0; + x0 = y0; + y0 = t; + + t = x1; + x1 = y1; + y1 = t; + } + + if (x0 > x1) { + int t; + t = x0; + x0 = x1; + x1 = t; + + t = y0; + y0 = y1; + y1 = t; + } + + deltax = x1 - x0; + deltay = abs(y1 - y0); + error = deltax / 2; + + y = y0; + + if (y0 < y1) + ystep = 1; + else + ystep = -1; + + if (steep) { + for (x = x0; x <= x1; x++) { + plot(y, x, image, pitch); + + error = error - deltay; + if (error < 0) { + y = y + ystep; + error = error + deltax; + } + } + } else { + for (x = x0; x <= x1; x++) { + plot(x, y, image, pitch); + + error = error - deltay; + if (error < 0) { + y = y + ystep; + error = error + deltax; + } + } + } +} diff --git a/libvpx/vp9/common/vp9_textblit.h b/libvpx/vp9/common/vp9_textblit.h new file mode 100644 index 000000000..c968628fe --- /dev/null +++ b/libvpx/vp9/common/vp9_textblit.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_COMMON_VP9_TEXTBLIT_H_ +#define VP9_COMMON_VP9_TEXTBLIT_H_ + +void vp9_blit_text(const char *msg, unsigned char *address, int pitch); + +void vp9_blit_line(int x0, int x1, int y0, int y1, unsigned char *image, + int pitch); + +#endif // VP9_COMMON_VP9_TEXTBLIT_H_ diff --git a/libvpx/vp9/common/vp9_tile_common.c b/libvpx/vp9/common/vp9_tile_common.c new file mode 100644 index 000000000..95296ad6f --- /dev/null +++ b/libvpx/vp9/common/vp9_tile_common.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/common/vp9_tile_common.h" + +#define MIN_TILE_WIDTH 256 +#define MAX_TILE_WIDTH 4096 +#define MIN_TILE_WIDTH_SBS (MIN_TILE_WIDTH >> 6) +#define MAX_TILE_WIDTH_SBS (MAX_TILE_WIDTH >> 6) + +static void vp9_get_tile_offsets(VP9_COMMON *cm, int *min_tile_off, + int *max_tile_off, int tile_idx, + int log2_n_tiles, int n_mis) { + const int n_sbs = (n_mis + 7) >> 3; + const int sb_off1 = (tile_idx * n_sbs) >> log2_n_tiles; + const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles; + + *min_tile_off = MIN(sb_off1 << 3, n_mis); + *max_tile_off = MIN(sb_off2 << 3, n_mis); +} + +void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx) { + cm->cur_tile_col_idx = tile_col_idx; + vp9_get_tile_offsets(cm, &cm->cur_tile_mi_col_start, + &cm->cur_tile_mi_col_end, tile_col_idx, + cm->log2_tile_columns, cm->mi_cols); +} + +void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx) { + cm->cur_tile_row_idx = tile_row_idx; + vp9_get_tile_offsets(cm, &cm->cur_tile_mi_row_start, + &cm->cur_tile_mi_row_end, tile_row_idx, + cm->log2_tile_rows, cm->mi_rows); +} + + +void 
vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles_ptr, + int *delta_log2_n_tiles) { + const int sb_cols = (cm->mb_cols + 3) >> 2; + int min_log2_n_tiles, max_log2_n_tiles; + + for (max_log2_n_tiles = 0; + (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_SBS; + max_log2_n_tiles++) {} + max_log2_n_tiles--; + if (max_log2_n_tiles < 0) + max_log2_n_tiles = 0; + + for (min_log2_n_tiles = 0; + (MAX_TILE_WIDTH_SBS << min_log2_n_tiles) < sb_cols; + min_log2_n_tiles++) {} + + assert(max_log2_n_tiles >= min_log2_n_tiles); + *min_log2_n_tiles_ptr = min_log2_n_tiles; + *delta_log2_n_tiles = max_log2_n_tiles - min_log2_n_tiles; +} diff --git a/libvpx/vp9/common/vp9_tile_common.h b/libvpx/vp9/common/vp9_tile_common.h new file mode 100644 index 000000000..7ea377297 --- /dev/null +++ b/libvpx/vp9/common/vp9_tile_common.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_TILE_COMMON_H_ +#define VP9_COMMON_VP9_TILE_COMMON_H_ + +#include "vp9/common/vp9_onyxc_int.h" + +void vp9_get_tile_col_offsets(VP9_COMMON *cm, int tile_col_idx); + +void vp9_get_tile_row_offsets(VP9_COMMON *cm, int tile_row_idx); + +void vp9_get_tile_n_bits(VP9_COMMON *cm, int *min_log2_n_tiles, + int *delta_log2_n_tiles); + +#endif // VP9_COMMON_VP9_TILE_COMMON_H_ diff --git a/libvpx/vp9/common/vp9_treecoder.c b/libvpx/vp9/common/vp9_treecoder.c new file mode 100644 index 000000000..531fa752b --- /dev/null +++ b/libvpx/vp9/common/vp9_treecoder.c @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#include "vpx_config.h" + +#if defined(CONFIG_DEBUG) && CONFIG_DEBUG +#include <assert.h> +#endif + +#include "vp9/common/vp9_treecoder.h" + +static void tree2tok(struct vp9_token *const p, vp9_tree t, + int i, int v, int l) { + v += v; + ++l; + + do { + const vp9_tree_index j = t[i++]; + + if (j <= 0) { + p[-j].value = v; + p[-j].len = l; + } else + tree2tok(p, t, j, v, l); + } while (++v & 1); +} + +void vp9_tokens_from_tree(struct vp9_token *p, vp9_tree t) { + tree2tok(p, t, 0, 0, 0); +} + +void vp9_tokens_from_tree_offset(struct vp9_token *p, vp9_tree t, + int offset) { + tree2tok(p - offset, t, 0, 0, 0); +} + +static unsigned int convert_distribution(unsigned int i, + vp9_tree tree, + vp9_prob probs[], + unsigned int branch_ct[][2], + const unsigned int num_events[], + unsigned int tok0_offset) { + unsigned int left, right; + + if (tree[i] <= 0) { + left = num_events[-tree[i] - tok0_offset]; + } else { + left = convert_distribution(tree[i], tree, probs, branch_ct, + num_events, tok0_offset); + } + if (tree[i + 1] <= 0) + right = num_events[-tree[i + 1] - tok0_offset]; + else + right = convert_distribution(tree[i + 1], tree, probs, branch_ct, + num_events, tok0_offset); + + probs[i>>1] = get_binary_prob(left, right); + branch_ct[i>>1][0] = left; + branch_ct[i>>1][1] = right; + return left + right; +} + +void vp9_tree_probs_from_distribution( + vp9_tree tree, + vp9_prob probs [ /* n-1 */ ], + unsigned int branch_ct [ /* n-1 */ ] [2], + const unsigned int num_events[ /* n */ ], + unsigned int tok0_offset) { + convert_distribution(0, tree, probs, branch_ct, num_events, tok0_offset); +} diff --git 
a/libvpx/vp9/common/vp9_treecoder.h b/libvpx/vp9/common/vp9_treecoder.h new file mode 100644 index 000000000..ebcd4116f --- /dev/null +++ b/libvpx/vp9/common/vp9_treecoder.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_VP9_TREECODER_H_ +#define VP9_COMMON_VP9_TREECODER_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" + +typedef uint8_t vp9_prob; + +#define vp9_prob_half ((vp9_prob) 128) + +typedef int8_t vp9_tree_index; + +#define vp9_complement(x) (255 - x) + +/* We build coding trees compactly in arrays. + Each node of the tree is a pair of vp9_tree_indices. + Array index often references a corresponding probability table. + Index <= 0 means done encoding/decoding and value = -Index, + Index > 0 means need another bit, specification at index. + Nonnegative indices are always even; processing begins at node 0. */ + +typedef const vp9_tree_index vp9_tree[], *vp9_tree_p; + +struct vp9_token { + int value; + int len; +}; + +/* Construct encoding array from tree. */ + +void vp9_tokens_from_tree(struct vp9_token*, vp9_tree); +void vp9_tokens_from_tree_offset(struct vp9_token*, vp9_tree, int offset); + +/* Convert array of token occurrence counts into a table of probabilities + for the associated binary encoding tree. Also writes count of branches + taken for each node on the tree; this facilitiates decisions as to + probability updates. 
*/ + +void vp9_tree_probs_from_distribution(vp9_tree tree, + vp9_prob probs[ /* n - 1 */ ], + unsigned int branch_ct[ /* n - 1 */ ][2], + const unsigned int num_events[ /* n */ ], + unsigned int tok0_offset); + +static INLINE vp9_prob clip_prob(int p) { + return (p > 255) ? 255u : (p < 1) ? 1u : p; +} + +// int64 is not needed for normal frame level calculations. +// However when outputing entropy stats accumulated over many frames +// or even clips we can overflow int math. +#ifdef ENTROPY_STATS +static INLINE vp9_prob get_prob(int num, int den) { + return (den == 0) ? 128u : clip_prob(((int64_t)num * 256 + (den >> 1)) / den); +} +#else +static INLINE vp9_prob get_prob(int num, int den) { + return (den == 0) ? 128u : clip_prob((num * 256 + (den >> 1)) / den); +} +#endif + +static INLINE vp9_prob get_binary_prob(int n0, int n1) { + return get_prob(n0, n0 + n1); +} + +/* this function assumes prob1 and prob2 are already within [1,255] range */ +static INLINE vp9_prob weighted_prob(int prob1, int prob2, int factor) { + return ROUND_POWER_OF_TWO(prob1 * (256 - factor) + prob2 * factor, 8); +} + +#endif // VP9_COMMON_VP9_TREECODER_H_ diff --git a/libvpx/vp9/common/x86/vp9_asm_stubs.c b/libvpx/vp9/common/x86/vp9_asm_stubs.c new file mode 100644 index 000000000..2b66834a7 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_asm_stubs.c @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vpx_ports/mem.h" +/////////////////////////////////////////////////////////////////////////// +// the mmx function that does the bilinear filtering and var calculation // +// int one pass // +/////////////////////////////////////////////////////////////////////////// +DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { + { 128, 128, 128, 128, 0, 0, 0, 0 }, + { 120, 120, 120, 120, 8, 8, 8, 8 }, + { 112, 112, 112, 112, 16, 16, 16, 16 }, + { 104, 104, 104, 104, 24, 24, 24, 24 }, + { 96, 96, 96, 96, 32, 32, 32, 32 }, + { 88, 88, 88, 88, 40, 40, 40, 40 }, + { 80, 80, 80, 80, 48, 48, 48, 48 }, + { 72, 72, 72, 72, 56, 56, 56, 56 }, + { 64, 64, 64, 64, 64, 64, 64, 64 }, + { 56, 56, 56, 56, 72, 72, 72, 72 }, + { 48, 48, 48, 48, 80, 80, 80, 80 }, + { 40, 40, 40, 40, 88, 88, 88, 88 }, + { 32, 32, 32, 32, 96, 96, 96, 96 }, + { 24, 24, 24, 24, 104, 104, 104, 104 }, + { 16, 16, 16, 16, 112, 112, 112, 112 }, + { 8, 8, 8, 8, 120, 120, 120, 120 } +}; + +#if HAVE_SSSE3 +void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr, + const unsigned int 
src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; + } + 
while (w >= 8) { + vp9_filter_block1d8_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_h8_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + 
vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71); + + assert(w <= 64); + assert(h <= 64); + if (x_step_q4 == 16 && y_step_q4 == 16) { + vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h + 7); + vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + } else { + vp9_convolve8_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + } +} + +void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64*71); + + assert(w <= 64); + assert(h 
<= 64); + if (x_step_q4 == 16 && y_step_q4 == 16) { + vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h + 7); + vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } else { + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, h); + } +} +#endif diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c new file mode 100644 index 000000000..599dcff93 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -0,0 +1,1985 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_idct.h" + +// In order to improve performance, clip absolute diff values to [0, 255], +// which allows to keep the additions/subtractions in 8 bits. +void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr, + uint8_t *dst_ptr, int pitch, int stride) { + int a1; + int16_t out; + uint8_t abs_diff; + __m128i p0, p1, p2, p3; + unsigned int extended_diff; + __m128i diff; + + out = dct_const_round_shift(input_dc * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); + a1 = ROUND_POWER_OF_TWO(out, 4); + + // Read prediction data. 
+ p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch)); + p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch)); + p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch)); + p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch)); + + // Unpack prediction data, and store 4x4 array in 1 XMM register. + p0 = _mm_unpacklo_epi32(p0, p1); + p2 = _mm_unpacklo_epi32(p2, p3); + p0 = _mm_unpacklo_epi64(p0, p2); + + // Clip dc value to [0, 255] range. Then, do addition or subtraction + // according to its sign. + if (a1 >= 0) { + abs_diff = (a1 > 255) ? 255 : a1; + extended_diff = abs_diff * 0x01010101u; + diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); + + p1 = _mm_adds_epu8(p0, diff); + } else { + abs_diff = (a1 < -255) ? 255 : -a1; + extended_diff = abs_diff * 0x01010101u; + diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0); + + p1 = _mm_subs_epu8(p0, diff); + } + + // Store results to dst. + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); + dst_ptr += stride; + + p1 = _mm_srli_si128(p1, 4); + *(int *)dst_ptr = _mm_cvtsi128_si32(p1); +} + +void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i eight = _mm_set1_epi16(8); + const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)cospi_16_64, (int16_t)-cospi_16_64, + (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i input0, input1, input2, input3; + + // Rows + input0 = _mm_loadl_epi64((__m128i *)input); + input1 = _mm_loadl_epi64((__m128i *)(input + 4)); + input2 = _mm_loadl_epi64((__m128i *)(input + 8)); + input3 = _mm_loadl_epi64((__m128i *)(input + 
12)); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_shufflelo_epi16(input0, 0xd8); + input1 = _mm_shufflelo_epi16(input1, 0xd8); + input2 = _mm_shufflelo_epi16(input2, 0xd8); + input3 = _mm_shufflelo_epi16(input3, 0xd8); + + input0 = _mm_unpacklo_epi32(input0, input0); + input1 = _mm_unpacklo_epi32(input1, input1); + input2 = _mm_unpacklo_epi32(input2, input2); + input3 = _mm_unpacklo_epi32(input3, input3); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, zero); + input1 = _mm_packs_epi32(input1, zero); + input2 = _mm_packs_epi32(input2, zero); + input3 = _mm_packs_epi32(input3, zero); + + // Transpose + input1 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpacklo_epi16(input2, input3); + input0 = _mm_unpacklo_epi32(input1, input3); + input1 = _mm_unpackhi_epi32(input1, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. 
+ input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Columns + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + input0 = _mm_shufflelo_epi16(input2, 0xd8); + input1 = _mm_shufflehi_epi16(input2, 0xd8); + input2 = _mm_shufflehi_epi16(input3, 0xd8); + input3 = _mm_shufflelo_epi16(input3, 0xd8); + + input0 = _mm_unpacklo_epi32(input0, input0); + input1 = _mm_unpackhi_epi32(input1, input1); + input2 = _mm_unpackhi_epi32(input2, input2); + input3 = _mm_unpacklo_epi32(input3, input3); + + // Stage 1 + input0 = _mm_madd_epi16(input0, cst); + input1 = _mm_madd_epi16(input1, cst); + input2 = _mm_madd_epi16(input2, cst); + input3 = _mm_madd_epi16(input3, cst); + + input0 = _mm_add_epi32(input0, rounding); + input1 = _mm_add_epi32(input1, rounding); + input2 = _mm_add_epi32(input2, rounding); + input3 = _mm_add_epi32(input3, rounding); + + input0 = _mm_srai_epi32(input0, DCT_CONST_BITS); + input1 = _mm_srai_epi32(input1, DCT_CONST_BITS); + input2 = _mm_srai_epi32(input2, DCT_CONST_BITS); + input3 = _mm_srai_epi32(input3, DCT_CONST_BITS); + + // Stage 2 + input0 = _mm_packs_epi32(input0, zero); + input1 = _mm_packs_epi32(input1, zero); + input2 = _mm_packs_epi32(input2, zero); + input3 = _mm_packs_epi32(input3, zero); + + // Transpose + input1 = _mm_unpacklo_epi16(input0, input1); + input3 = _mm_unpacklo_epi16(input2, input3); + input0 = _mm_unpacklo_epi32(input1, input3); + input1 = _mm_unpackhi_epi32(input1, input3); + + // Switch column2, column 3, and then, we got: + // input2: column1, column 0; input3: column2, column 3. 
+ input1 = _mm_shuffle_epi32(input1, 0x4e); + input2 = _mm_add_epi16(input0, input1); + input3 = _mm_sub_epi16(input0, input1); + + // Final round and shift + input2 = _mm_add_epi16(input2, eight); + input3 = _mm_add_epi16(input3, eight); + + input2 = _mm_srai_epi16(input2, 4); + input3 = _mm_srai_epi16(input3, 4); + +#define RECON_AND_STORE4X4(dest, in_x) \ + { \ + __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \ + d0 = _mm_unpacklo_epi8(d0, zero); \ + d0 = _mm_add_epi16(in_x, d0); \ + d0 = _mm_packus_epi16(d0, d0); \ + *(int *)dest = _mm_cvtsi128_si32(d0); \ + dest += stride; \ + } + + input0 = _mm_srli_si128(input2, 8); + input1 = _mm_srli_si128(input3, 8); + + RECON_AND_STORE4X4(dest, input2); + RECON_AND_STORE4X4(dest, input0); + RECON_AND_STORE4X4(dest, input1); + RECON_AND_STORE4X4(dest, input3); +} + +void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) { + const __m128i zero = _mm_setzero_si128(); + const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, + (int16_t)cospi_16_64, (int16_t)-cospi_16_64, + (int16_t)cospi_24_64, (int16_t)-cospi_8_64, + (int16_t)cospi_8_64, (int16_t)cospi_24_64); + const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1); + + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + __m128i in, temp; + + // Load input data. 
+ in = _mm_loadl_epi64((__m128i *)input); + + // Construct i3, i1, i3, i1, i2, i0, i2, i0 + in = _mm_shufflelo_epi16(in, 0xd8); + in = _mm_unpacklo_epi32(in, in); + + // Stage 1 + in = _mm_madd_epi16(in, c1); + in = _mm_add_epi32(in, rounding); + in = _mm_srai_epi32(in, DCT_CONST_BITS); + in = _mm_packs_epi32(in, zero); + + // Stage 2 + temp = _mm_shufflelo_epi16(in, 0x9c); + in = _mm_shufflelo_epi16(in, 0xc9); + in = _mm_unpacklo_epi64(temp, in); + in = _mm_madd_epi16(in, c2); + in = _mm_packs_epi32(in, zero); + + // Store results + _mm_storel_epi64((__m128i *)output, in); +} + +#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ + const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \ + out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \ + out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \ + out7 = _mm_unpackhi_epi64(tr1_3, 
tr1_7); \ + } + +#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3, out4, out5, out6, out7) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ + const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ + \ + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ + \ + out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ + out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ + out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ + out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \ + out4 = out5 = out6 = out7 = zero; \ + } + +#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \ + { \ + const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ + const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ + const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ + const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ + \ + in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \ + in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \ + in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \ + in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \ + } + +// Define Macro for multiplying elements by constants and adding them together. 
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \ + cst0, cst1, cst2, cst3, res0, res1, res2, res3) \ + { \ + tmp0 = _mm_madd_epi16(lo_0, cst0); \ + tmp1 = _mm_madd_epi16(hi_0, cst0); \ + tmp2 = _mm_madd_epi16(lo_0, cst1); \ + tmp3 = _mm_madd_epi16(hi_0, cst1); \ + tmp4 = _mm_madd_epi16(lo_1, cst2); \ + tmp5 = _mm_madd_epi16(hi_1, cst2); \ + tmp6 = _mm_madd_epi16(lo_1, cst3); \ + tmp7 = _mm_madd_epi16(hi_1, cst3); \ + \ + tmp0 = _mm_add_epi32(tmp0, rounding); \ + tmp1 = _mm_add_epi32(tmp1, rounding); \ + tmp2 = _mm_add_epi32(tmp2, rounding); \ + tmp3 = _mm_add_epi32(tmp3, rounding); \ + tmp4 = _mm_add_epi32(tmp4, rounding); \ + tmp5 = _mm_add_epi32(tmp5, rounding); \ + tmp6 = _mm_add_epi32(tmp6, rounding); \ + tmp7 = _mm_add_epi32(tmp7, rounding); \ + \ + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ + \ + res0 = _mm_packs_epi32(tmp0, tmp1); \ + res1 = _mm_packs_epi32(tmp2, tmp3); \ + res2 = _mm_packs_epi32(tmp4, tmp5); \ + res3 = _mm_packs_epi32(tmp6, tmp7); \ + } + +#define IDCT8x8_1D \ + /* Stage1 */ \ + { \ + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ + \ + MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \ + stg1_1, stg1_2, stg1_3, stp1_4, \ + stp1_7, stp1_5, stp1_6) \ + } \ + \ + /* Stage2 */ \ + { \ + const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ + const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ + \ + 
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0, \
                           stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4: final butterfly, results written back to in0..in7. */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

// Add one idct output row (in_x) to the 8 prediction pixels at `dest`,
// saturate to 8 bits, store back, and advance `dest` by one row. Uses
// the `zero` and `stride` variables of the enclosing function.
#define RECON_AND_STORE(dest, in_x) \
  { \
    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    in_x = _mm_add_epi16(in_x, d0); \
    in_x = _mm_packus_epi16(in_x, in_x); \
    _mm_storel_epi64((__m128i *)(dest), in_x); \
    dest += stride; \
  }

// 8x8 inverse DCT: transform the 64 coefficients in `input` and add the
// result to the 8-bit prediction block at `dest` (row pitch `stride`).
void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int
                                stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<4);  // for the >> 5 below
  // Cosine constant pairs consumed by IDCT8x8_1D.
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  // Scratch registers shared with the IDCT8x8_1D / MULTIPLICATION_AND_ADD
  // macros expanded below.
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((__m128i *)input);
  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((__m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((__m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((__m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((__m128i *)(input + 8 * 7));

  // 2-D: transpose + 1-D idct twice (rows pass, then columns pass).
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8x8_1D
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 =
      _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  // Add the residual to the prediction and store, one row at a time.
  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

// 8x8 inverse DCT specialised for blocks where (presumably) only the
// first 10 coefficients in scan order can be nonzero -- TODO confirm
// against the caller. Only four input rows are loaded and the rows pass
// works on half-width data.
void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((__m128i *)input);
  in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((__m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)

  // Stage1 -- half-width butterflies; each pack pairs the 4 results
  // with `zero` since only the low 4 lanes carry data here.
  {
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);
  }

  // Stage2
  {
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);

    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
    stp2_6 =
             _mm_subs_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
  }

  // Stage3
  {
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, zero);
    stp1_6 = _mm_packs_epi32(tmp2, zero);
  }

  // Stage4: final butterfly of the rows pass, back into in0..in7.
  in0 = _mm_adds_epi16(stp1_0, stp2_7);
  in1 = _mm_adds_epi16(stp1_1, stp1_6);
  in2 = _mm_adds_epi16(stp1_2, stp1_5);
  in3 = _mm_adds_epi16(stp1_3, stp2_4);
  in4 = _mm_subs_epi16(stp1_3, stp2_4);
  in5 = _mm_subs_epi16(stp1_2, stp1_5);
  in6 = _mm_subs_epi16(stp1_1, stp1_6);
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

  // Columns.
  // 4x8 Transpose
  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7)

  // 1D idct8x8
  IDCT8x8_1D

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

// One 16-point 1-D idct over in0..in15 (stages 2-6; the final Stage7
// butterfly is done by the caller). Expects the stg2_*/stg3_*/stg4_*/
// stg6_0 constants and the stp1_*/stp2_* scratch registers, including
// stp1_8_0 and stp1_12_0, to be declared at the expansion site.
#define IDCT16x16_1D \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
    const \
        __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, \
                          stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

// 16x16 inverse DCT: transform the 256 coefficients in `input` and add
// the result to the 8-bit prediction block at `dest` (row pitch
// `stride`). Processed as four 8x16 half-block passes (see loop below).
void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1<<5);  // for the >> 6 below
  const __m128i zero = _mm_setzero_si128();

  // Cosine constant pairs consumed by IDCT16x16_1D.
  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  // in*: working registers; l*/r*: first-pass results for the left and
  // right 8x16 halves, consumed by the second pass.
  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
          in14 = zero, in15 = zero;
  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
          r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 =
               zero, r11 = zero,
          r12 = zero, r13 = zero, r14 = zero, r15 = zero;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
      stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
      stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
      stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct.
  for (i = 0; i < 4; i++) {
    // 1-D idct
    if (i < 2) {
      // First pass: rows. Iteration 1 processes the bottom 8 rows.
      if (i == 1) input += 128;

      // Load input data. Each 16-wide row is two loads: in0..in7 get
      // the left 8 columns of rows 0..7, in8..in15 the right 8 columns.
      in0 = _mm_load_si128((__m128i *)input);
      in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
      in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
      in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
      in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
      in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
      in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
      in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
      in4 = _mm_load_si128((__m128i *)(input + 8 * 8));
      in12 = _mm_load_si128((__m128i *)(input + 8 * 9));
      in5 = _mm_load_si128((__m128i *)(input + 8 * 10));
      in13 = _mm_load_si128((__m128i *)(input + 8 * 11));
      in6 = _mm_load_si128((__m128i *)(input + 8 * 12));
      in14 = _mm_load_si128((__m128i *)(input + 8 * 13));
      in7 = _mm_load_si128((__m128i *)(input + 8 * 14));
      in15 = _mm_load_si128((__m128i *)(input + 8 * 15));

      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
    }

    // Second pass: columns, fed from the saved first-pass halves.
    if (i == 2) {
      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
                    in5, in6, in7);
      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
                    in13, in14, in15);
    }

    if (i == 3) {
      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
                    in4, \
                    in5, in6, in7);
      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
                    in12, in13, in14, in15);
    }

    IDCT16x16_1D

    // Stage7: final butterfly; where the result goes depends on which
    // of the four 8x16 passes this is.
    if (i == 0) {
      // Left 8x16
      l0 = _mm_add_epi16(stp2_0, stp1_15);
      l1 = _mm_add_epi16(stp2_1, stp1_14);
      l2 = _mm_add_epi16(stp2_2, stp2_13);
      l3 = _mm_add_epi16(stp2_3, stp2_12);
      l4 = _mm_add_epi16(stp2_4, stp2_11);
      l5 = _mm_add_epi16(stp2_5, stp2_10);
      l6 = _mm_add_epi16(stp2_6, stp1_9);
      l7 = _mm_add_epi16(stp2_7, stp1_8);
      l8 = _mm_sub_epi16(stp2_7, stp1_8);
      l9 = _mm_sub_epi16(stp2_6, stp1_9);
      l10 = _mm_sub_epi16(stp2_5, stp2_10);
      l11 = _mm_sub_epi16(stp2_4, stp2_11);
      l12 = _mm_sub_epi16(stp2_3, stp2_12);
      l13 = _mm_sub_epi16(stp2_2, stp2_13);
      l14 = _mm_sub_epi16(stp2_1, stp1_14);
      l15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else if (i == 1) {
      // Right 8x16
      r0 = _mm_add_epi16(stp2_0, stp1_15);
      r1 = _mm_add_epi16(stp2_1, stp1_14);
      r2 = _mm_add_epi16(stp2_2, stp2_13);
      r3 = _mm_add_epi16(stp2_3, stp2_12);
      r4 = _mm_add_epi16(stp2_4, stp2_11);
      r5 = _mm_add_epi16(stp2_5, stp2_10);
      r6 = _mm_add_epi16(stp2_6, stp1_9);
      r7 = _mm_add_epi16(stp2_7, stp1_8);
      r8 = _mm_sub_epi16(stp2_7, stp1_8);
      r9 = _mm_sub_epi16(stp2_6, stp1_9);
      r10 = _mm_sub_epi16(stp2_5, stp2_10);
      r11 = _mm_sub_epi16(stp2_4, stp2_11);
      r12 = _mm_sub_epi16(stp2_3, stp2_12);
      r13 = _mm_sub_epi16(stp2_2, stp2_13);
      r14 = _mm_sub_epi16(stp2_1, stp1_14);
      r15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else {
      // 2-D
      in0 = _mm_add_epi16(stp2_0, stp1_15);
      in1 = _mm_add_epi16(stp2_1, stp1_14);
      in2 = _mm_add_epi16(stp2_2, stp2_13);
      in3 = _mm_add_epi16(stp2_3, stp2_12);
      in4 = _mm_add_epi16(stp2_4, stp2_11);
      in5 = _mm_add_epi16(stp2_5, stp2_10);
      in6 = _mm_add_epi16(stp2_6, stp1_9);
      in7 = _mm_add_epi16(stp2_7, stp1_8);
      in8 = _mm_sub_epi16(stp2_7, stp1_8);
      in9 = _mm_sub_epi16(stp2_6, stp1_9);
      in10 = _mm_sub_epi16(stp2_5, stp2_10);
      in11 = _mm_sub_epi16(stp2_4, stp2_11);
      in12 =
             _mm_sub_epi16(stp2_3, stp2_12);
      in13 = _mm_sub_epi16(stp2_2, stp2_13);
      in14 = _mm_sub_epi16(stp2_1, stp1_14);
      in15 = _mm_sub_epi16(stp2_0, stp1_15);

      // Final rounding and shift
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);

      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);

      // Move from the just-stored left 8-column strip to the right one.
      dest += 8 - (stride * 16);
    }
  }
}

void
vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest, + int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + const __m128i zero = _mm_setzero_si128(); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, + in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, + in10 = zero, in11 = zero, in12 = zero, in13 = zero, + in14 = zero, in15 = zero; + __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, + l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 
= zero, l11 = zero, + l12 = zero, l13 = zero, l14 = zero, l15 = zero; + + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_8_0, stp1_12_0; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + // 1-D idct. Load input data. + in0 = _mm_load_si128((__m128i *)input); + in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in1 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in9 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in10 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in11 = _mm_load_si128((__m128i *)(input + 8 * 7)); + + TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); + TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); + + // Stage2 + { + const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); + const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); + const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); + const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); + + tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); + tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); + tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); + tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); + tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); + tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); + tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); + tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, 
DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_8 = _mm_packs_epi32(tmp0, zero); + stp2_15 = _mm_packs_epi32(tmp2, zero); + stp2_9 = _mm_packs_epi32(tmp4, zero); + stp2_14 = _mm_packs_epi32(tmp6, zero); + + stp2_10 = _mm_packs_epi32(tmp1, zero); + stp2_13 = _mm_packs_epi32(tmp3, zero); + stp2_11 = _mm_packs_epi32(tmp5, zero); + stp2_12 = _mm_packs_epi32(tmp7, zero); + } + + // Stage3 + { + const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); + const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); + + tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); + tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); + tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); + tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_4 = _mm_packs_epi32(tmp0, zero); + stp1_7 = _mm_packs_epi32(tmp2, zero); + stp1_5 = _mm_packs_epi32(tmp4, zero); + stp1_6 = _mm_packs_epi32(tmp6, zero); + + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + } + + // Stage4 + { + const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); + const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, 
stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + + tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); + tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); + tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); + tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); + tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); + tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp0, zero); + stp2_1 = _mm_packs_epi32(tmp2, zero); + stp2_2 = _mm_packs_epi32(tmp4, zero); + stp2_3 = _mm_packs_epi32(tmp6, zero); + stp2_9 = _mm_packs_epi32(tmp1, zero); + stp2_14 = _mm_packs_epi32(tmp3, zero); + stp2_10 = _mm_packs_epi32(tmp5, zero); + stp2_13 = _mm_packs_epi32(tmp7, zero); + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + } + + // Stage5 and Stage6 + { + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); + + 
stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); + } + + // Stage6 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + + tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); + tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); + tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); + tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp1, zero); + stp1_6 = _mm_packs_epi32(tmp3, zero); + stp2_10 = _mm_packs_epi32(tmp0, zero); + stp2_13 = _mm_packs_epi32(tmp2, zero); + stp2_11 = _mm_packs_epi32(tmp4, zero); + stp2_12 = _mm_packs_epi32(tmp6, zero); + + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); + } + + // Stage7. Left 8x16 only. 
+ l0 = _mm_add_epi16(stp2_0, stp1_15); + l1 = _mm_add_epi16(stp2_1, stp1_14); + l2 = _mm_add_epi16(stp2_2, stp2_13); + l3 = _mm_add_epi16(stp2_3, stp2_12); + l4 = _mm_add_epi16(stp2_4, stp2_11); + l5 = _mm_add_epi16(stp2_5, stp2_10); + l6 = _mm_add_epi16(stp2_6, stp1_9); + l7 = _mm_add_epi16(stp2_7, stp1_8); + l8 = _mm_sub_epi16(stp2_7, stp1_8); + l9 = _mm_sub_epi16(stp2_6, stp1_9); + l10 = _mm_sub_epi16(stp2_5, stp2_10); + l11 = _mm_sub_epi16(stp2_4, stp2_11); + l12 = _mm_sub_epi16(stp2_3, stp2_12); + l13 = _mm_sub_epi16(stp2_2, stp2_13); + l14 = _mm_sub_epi16(stp2_1, stp1_14); + l15 = _mm_sub_epi16(stp2_0, stp1_15); + + // 2-D idct. We do 2 8x16 blocks. + for (i = 0; i < 2; i++) { + if (i == 0) + TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, + in5, in6, in7); + + if (i == 1) + TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, + in4, in5, in6, in7); + + in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; + + IDCT16x16_1D + + // Stage7 + in0 = _mm_add_epi16(stp2_0, stp1_15); + in1 = _mm_add_epi16(stp2_1, stp1_14); + in2 = _mm_add_epi16(stp2_2, stp2_13); + in3 = _mm_add_epi16(stp2_3, stp2_12); + in4 = _mm_add_epi16(stp2_4, stp2_11); + in5 = _mm_add_epi16(stp2_5, stp2_10); + in6 = _mm_add_epi16(stp2_6, stp1_9); + in7 = _mm_add_epi16(stp2_7, stp1_8); + in8 = _mm_sub_epi16(stp2_7, stp1_8); + in9 = _mm_sub_epi16(stp2_6, stp1_9); + in10 = _mm_sub_epi16(stp2_5, stp2_10); + in11 = _mm_sub_epi16(stp2_4, stp2_11); + in12 = _mm_sub_epi16(stp2_3, stp2_12); + in13 = _mm_sub_epi16(stp2_2, stp2_13); + in14 = _mm_sub_epi16(stp2_1, stp1_14); + in15 = _mm_sub_epi16(stp2_0, stp1_15); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 
= _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = _mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + + RECON_AND_STORE(dest, in0); + RECON_AND_STORE(dest, in1); + RECON_AND_STORE(dest, in2); + RECON_AND_STORE(dest, in3); + RECON_AND_STORE(dest, in4); + RECON_AND_STORE(dest, in5); + RECON_AND_STORE(dest, in6); + RECON_AND_STORE(dest, in7); + RECON_AND_STORE(dest, in8); + RECON_AND_STORE(dest, in9); + RECON_AND_STORE(dest, in10); + RECON_AND_STORE(dest, in11); + RECON_AND_STORE(dest, in12); + RECON_AND_STORE(dest, in13); + RECON_AND_STORE(dest, in14); + RECON_AND_STORE(dest, in15); + + dest += 8 - (stride * 16); + } +} + +void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1<<5); + + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = 
pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + 
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, + in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, + in24, in25, in26, in27, in28, in29, in30, in31; + __m128i col[128]; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, + stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, + stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i, j; + + // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. + for (i = 0; i < 8; i++) { + if (i < 4) { + // First 1-D idct + // Load input data. 
+ in0 = _mm_load_si128((__m128i *)input); + in8 = _mm_load_si128((__m128i *)(input + 8 * 1)); + in16 = _mm_load_si128((__m128i *)(input + 8 * 2)); + in24 = _mm_load_si128((__m128i *)(input + 8 * 3)); + in1 = _mm_load_si128((__m128i *)(input + 8 * 4)); + in9 = _mm_load_si128((__m128i *)(input + 8 * 5)); + in17 = _mm_load_si128((__m128i *)(input + 8 * 6)); + in25 = _mm_load_si128((__m128i *)(input + 8 * 7)); + in2 = _mm_load_si128((__m128i *)(input + 8 * 8)); + in10 = _mm_load_si128((__m128i *)(input + 8 * 9)); + in18 = _mm_load_si128((__m128i *)(input + 8 * 10)); + in26 = _mm_load_si128((__m128i *)(input + 8 * 11)); + in3 = _mm_load_si128((__m128i *)(input + 8 * 12)); + in11 = _mm_load_si128((__m128i *)(input + 8 * 13)); + in19 = _mm_load_si128((__m128i *)(input + 8 * 14)); + in27 = _mm_load_si128((__m128i *)(input + 8 * 15)); + + in4 = _mm_load_si128((__m128i *)(input + 8 * 16)); + in12 = _mm_load_si128((__m128i *)(input + 8 * 17)); + in20 = _mm_load_si128((__m128i *)(input + 8 * 18)); + in28 = _mm_load_si128((__m128i *)(input + 8 * 19)); + in5 = _mm_load_si128((__m128i *)(input + 8 * 20)); + in13 = _mm_load_si128((__m128i *)(input + 8 * 21)); + in21 = _mm_load_si128((__m128i *)(input + 8 * 22)); + in29 = _mm_load_si128((__m128i *)(input + 8 * 23)); + in6 = _mm_load_si128((__m128i *)(input + 8 * 24)); + in14 = _mm_load_si128((__m128i *)(input + 8 * 25)); + in22 = _mm_load_si128((__m128i *)(input + 8 * 26)); + in30 = _mm_load_si128((__m128i *)(input + 8 * 27)); + in7 = _mm_load_si128((__m128i *)(input + 8 * 28)); + in15 = _mm_load_si128((__m128i *)(input + 8 * 29)); + in23 = _mm_load_si128((__m128i *)(input + 8 * 30)); + in31 = _mm_load_si128((__m128i *)(input + 8 * 31)); + + input += 256; + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, + in10, in11, in12, in13, in14, in15); + TRANSPOSE_8X8(in16, in17, 
in18, in19, in20, in21, in22, in23, in16, in17, + in18, in19, in20, in21, in22, in23); + TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, + in26, in27, in28, in29, in30, in31); + } else { + // Second 1-D idct + j = i - 4; + + // Transpose 32x8 block to 8x32 block + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, + in5, in6, in7); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, + in11, in12, in13, in14, in15); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, + in19, in20, in21, in22, in23); + j += 4; + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], + col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, + in28, in29, in30, in31); + } + + // Stage1 + { + const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); + const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); + const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); + const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); + + const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); + const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); + const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); + const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); + + const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); + const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); + const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); + const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); + + const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); + const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); + const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); 
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); + + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, + stp1_17, stp1_30) + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, + stp1_19, stp1_28) + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, + stp1_21, stp1_26) + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, + stp1_23, stp1_24) + } + + // Stage2 + { + const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); + const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); + const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); + const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); + + const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); + const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); + const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); + const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); + + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, + stp2_14) + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, + stp2_11, stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); + + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); + + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); 
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); + } + + // Stage3 + { + const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); + const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); + const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); + const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); + + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, + stp1_6) + + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, + stp1_18, stp1_29) + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, + stp1_22, stp1_25) + + stp1_16 = stp2_16; + stp1_31 = stp2_31; + stp1_19 = stp2_19; + stp1_20 = stp2_20; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_27 = stp2_27; + stp1_28 = stp2_28; + } + + // Stage4 + { + const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); + const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); + const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); + 
const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); + + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, + stp2_2, stp2_3) + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, + stp2_10, stp2_13) + + stp2_8 = stp1_8; + stp2_15 = stp1_15; + stp2_11 = stp1_11; + stp2_12 = stp1_12; + + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); + } + + // Stage5 + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); + const __m128i lo_20_27 = 
_mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp1); + stp1_6 = _mm_packs_epi32(tmp2, tmp3); + + stp1_4 = stp2_4; + stp1_7 = stp2_7; + + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, + stp1_19, stp1_28) + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, + stp1_21, stp1_26) + + stp1_22 = stp2_22; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_25 = stp2_25; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + // Stage6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, 
stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); + + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); + + stp2_8 = stp1_8; + stp2_9 = stp1_9; + stp2_14 = stp1_14; + stp2_15 = stp1_15; + + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, + stp2_13, stp2_11, stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); + + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); + } + + // Stage7 + { + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); + + stp1_0 = 
_mm_add_epi16(stp2_0, stp2_15); + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + stp1_18 = stp2_18; + stp1_19 = stp2_19; + + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, + stp1_21, stp1_26) + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, + stp1_23, stp1_24) + + stp1_28 = stp2_28; + stp1_29 = stp2_29; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + // final stage + if (i < 4) { + // 1_D: Store 32 intermediate results for each 8x32 block. 
+ col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31); + col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30); + col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29); + col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28); + col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27); + col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26); + col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25); + col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24); + col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23); + col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22); + col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21); + col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20); + col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19); + col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18); + col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17); + col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16); + col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); + col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); + col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); + col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); + col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); + col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); + col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); + col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); + col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); + col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); + col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); + col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); + col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); + col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); + col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); + col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + } else { + const __m128i zero = _mm_setzero_si128(); + + // 2_D: Calculate the results and store them to destination. 
+ in0 = _mm_add_epi16(stp1_0, stp1_31); + in1 = _mm_add_epi16(stp1_1, stp1_30); + in2 = _mm_add_epi16(stp1_2, stp1_29); + in3 = _mm_add_epi16(stp1_3, stp1_28); + in4 = _mm_add_epi16(stp1_4, stp1_27); + in5 = _mm_add_epi16(stp1_5, stp1_26); + in6 = _mm_add_epi16(stp1_6, stp1_25); + in7 = _mm_add_epi16(stp1_7, stp1_24); + in8 = _mm_add_epi16(stp1_8, stp1_23); + in9 = _mm_add_epi16(stp1_9, stp1_22); + in10 = _mm_add_epi16(stp1_10, stp1_21); + in11 = _mm_add_epi16(stp1_11, stp1_20); + in12 = _mm_add_epi16(stp1_12, stp1_19); + in13 = _mm_add_epi16(stp1_13, stp1_18); + in14 = _mm_add_epi16(stp1_14, stp1_17); + in15 = _mm_add_epi16(stp1_15, stp1_16); + in16 = _mm_sub_epi16(stp1_15, stp1_16); + in17 = _mm_sub_epi16(stp1_14, stp1_17); + in18 = _mm_sub_epi16(stp1_13, stp1_18); + in19 = _mm_sub_epi16(stp1_12, stp1_19); + in20 = _mm_sub_epi16(stp1_11, stp1_20); + in21 = _mm_sub_epi16(stp1_10, stp1_21); + in22 = _mm_sub_epi16(stp1_9, stp1_22); + in23 = _mm_sub_epi16(stp1_8, stp1_23); + in24 = _mm_sub_epi16(stp1_7, stp1_24); + in25 = _mm_sub_epi16(stp1_6, stp1_25); + in26 = _mm_sub_epi16(stp1_5, stp1_26); + in27 = _mm_sub_epi16(stp1_4, stp1_27); + in28 = _mm_sub_epi16(stp1_3, stp1_28); + in29 = _mm_sub_epi16(stp1_2, stp1_29); + in30 = _mm_sub_epi16(stp1_1, stp1_30); + in31 = _mm_sub_epi16(stp1_0, stp1_31); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + in8 = _mm_adds_epi16(in8, final_rounding); + in9 = _mm_adds_epi16(in9, final_rounding); + in10 = _mm_adds_epi16(in10, final_rounding); + in11 = _mm_adds_epi16(in11, final_rounding); + in12 = _mm_adds_epi16(in12, final_rounding); + in13 = _mm_adds_epi16(in13, final_rounding); + in14 = 
_mm_adds_epi16(in14, final_rounding); + in15 = _mm_adds_epi16(in15, final_rounding); + in16 = _mm_adds_epi16(in16, final_rounding); + in17 = _mm_adds_epi16(in17, final_rounding); + in18 = _mm_adds_epi16(in18, final_rounding); + in19 = _mm_adds_epi16(in19, final_rounding); + in20 = _mm_adds_epi16(in20, final_rounding); + in21 = _mm_adds_epi16(in21, final_rounding); + in22 = _mm_adds_epi16(in22, final_rounding); + in23 = _mm_adds_epi16(in23, final_rounding); + in24 = _mm_adds_epi16(in24, final_rounding); + in25 = _mm_adds_epi16(in25, final_rounding); + in26 = _mm_adds_epi16(in26, final_rounding); + in27 = _mm_adds_epi16(in27, final_rounding); + in28 = _mm_adds_epi16(in28, final_rounding); + in29 = _mm_adds_epi16(in29, final_rounding); + in30 = _mm_adds_epi16(in30, final_rounding); + in31 = _mm_adds_epi16(in31, final_rounding); + + in0 = _mm_srai_epi16(in0, 6); + in1 = _mm_srai_epi16(in1, 6); + in2 = _mm_srai_epi16(in2, 6); + in3 = _mm_srai_epi16(in3, 6); + in4 = _mm_srai_epi16(in4, 6); + in5 = _mm_srai_epi16(in5, 6); + in6 = _mm_srai_epi16(in6, 6); + in7 = _mm_srai_epi16(in7, 6); + in8 = _mm_srai_epi16(in8, 6); + in9 = _mm_srai_epi16(in9, 6); + in10 = _mm_srai_epi16(in10, 6); + in11 = _mm_srai_epi16(in11, 6); + in12 = _mm_srai_epi16(in12, 6); + in13 = _mm_srai_epi16(in13, 6); + in14 = _mm_srai_epi16(in14, 6); + in15 = _mm_srai_epi16(in15, 6); + in16 = _mm_srai_epi16(in16, 6); + in17 = _mm_srai_epi16(in17, 6); + in18 = _mm_srai_epi16(in18, 6); + in19 = _mm_srai_epi16(in19, 6); + in20 = _mm_srai_epi16(in20, 6); + in21 = _mm_srai_epi16(in21, 6); + in22 = _mm_srai_epi16(in22, 6); + in23 = _mm_srai_epi16(in23, 6); + in24 = _mm_srai_epi16(in24, 6); + in25 = _mm_srai_epi16(in25, 6); + in26 = _mm_srai_epi16(in26, 6); + in27 = _mm_srai_epi16(in27, 6); + in28 = _mm_srai_epi16(in28, 6); + in29 = _mm_srai_epi16(in29, 6); + in30 = _mm_srai_epi16(in30, 6); + in31 = _mm_srai_epi16(in31, 6); + + RECON_AND_STORE(dest, in0); + RECON_AND_STORE(dest, in1); + RECON_AND_STORE(dest, in2); 
+ RECON_AND_STORE(dest, in3); + RECON_AND_STORE(dest, in4); + RECON_AND_STORE(dest, in5); + RECON_AND_STORE(dest, in6); + RECON_AND_STORE(dest, in7); + RECON_AND_STORE(dest, in8); + RECON_AND_STORE(dest, in9); + RECON_AND_STORE(dest, in10); + RECON_AND_STORE(dest, in11); + RECON_AND_STORE(dest, in12); + RECON_AND_STORE(dest, in13); + RECON_AND_STORE(dest, in14); + RECON_AND_STORE(dest, in15); + RECON_AND_STORE(dest, in16); + RECON_AND_STORE(dest, in17); + RECON_AND_STORE(dest, in18); + RECON_AND_STORE(dest, in19); + RECON_AND_STORE(dest, in20); + RECON_AND_STORE(dest, in21); + RECON_AND_STORE(dest, in22); + RECON_AND_STORE(dest, in23); + RECON_AND_STORE(dest, in24); + RECON_AND_STORE(dest, in25); + RECON_AND_STORE(dest, in26); + RECON_AND_STORE(dest, in27); + RECON_AND_STORE(dest, in28); + RECON_AND_STORE(dest, in29); + RECON_AND_STORE(dest, in30); + RECON_AND_STORE(dest, in31); + + dest += 8 - (stride * 32); + } + } +} diff --git a/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm b/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm new file mode 100644 index 000000000..1af252168 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_iwalsh_mmx.asm @@ -0,0 +1,173 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp9_short_inv_walsh4x4_1_mmx(short *input, short *output) +global sym(vp9_short_inv_walsh4x4_1_mmx) PRIVATE +sym(vp9_short_inv_walsh4x4_1_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) + mov rax, 3 + + mov rdi, arg(1) + add rax, [rsi] ;input[0] + 3 + + movd mm0, eax + + punpcklwd mm0, mm0 ;x x val val + + punpckldq mm0, mm0 ;val val val val + + psraw mm0, 3 ;(input[0] + 3) >> 3 + + movq [rdi + 0], mm0 + movq [rdi + 8], mm0 + movq [rdi + 16], mm0 + movq [rdi + 24], mm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_short_inv_walsh4x4_mmx(short *input, short *output) +global sym(vp9_short_inv_walsh4x4_mmx) PRIVATE +sym(vp9_short_inv_walsh4x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + mov rax, 3 + mov rsi, arg(0) + mov rdi, arg(1) + shl rax, 16 + + movq mm0, [rsi + 0] ;ip[0] + movq mm1, [rsi + 8] ;ip[4] + or rax, 3 ;00030003h + + movq mm2, [rsi + 16] ;ip[8] + movq mm3, [rsi + 24] ;ip[12] + + movq mm7, rax + movq mm4, mm0 + + punpcklwd mm7, mm7 ;0003000300030003h + movq mm5, mm1 + + paddw mm4, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm4 ;temp al + + paddw mm4, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm1, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + + paddw mm0, mm1 ;dl + cl + psubw mm5, mm1 ;dl - cl + + ; 03 02 01 00 + ; 13 12 11 10 + ; 23 22 21 20 + ; 33 32 31 30 + + movq mm3, mm4 ; 03 02 01 00 + punpcklwd mm4, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 + + movq mm1, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm1, mm5 ; 33 23 32 22 + + movq mm0, mm4 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 + + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm4, mm6 ; 31 21 11 01 aka ip[4] + + punpckldq mm2, mm1 ; 32 22 12 02 aka 
ip[8] + punpckhdq mm3, mm1 ; 33 23 13 03 aka ip[12] +;~~~~~~~~~~~~~~~~~~~~~ + movq mm1, mm0 + movq mm5, mm4 + + paddw mm1, mm3 ;ip[0] + ip[12] aka al + paddw mm5, mm2 ;ip[4] + ip[8] aka bl + + movq mm6, mm1 ;temp al + + paddw mm1, mm5 ;al + bl + psubw mm6, mm5 ;al - bl + + psubw mm0, mm3 ;ip[0] - ip[12] aka d1 + psubw mm4, mm2 ;ip[4] - ip[8] aka c1 + + movq mm5, mm0 ;temp dl + + paddw mm0, mm4 ;dl + cl + psubw mm5, mm4 ;dl - cl +;~~~~~~~~~~~~~~~~~~~~~ + movq mm3, mm1 ; 03 02 01 00 + punpcklwd mm1, mm0 ; 11 01 10 00 + punpckhwd mm3, mm0 ; 13 03 12 02 + + movq mm4, mm6 ; 23 22 21 20 + punpcklwd mm6, mm5 ; 31 21 30 20 + punpckhwd mm4, mm5 ; 33 23 32 22 + + movq mm0, mm1 ; 11 01 10 00 + movq mm2, mm3 ; 13 03 12 02 + + punpckldq mm0, mm6 ; 30 20 10 00 aka ip[0] + punpckhdq mm1, mm6 ; 31 21 11 01 aka ip[4] + + punpckldq mm2, mm4 ; 32 22 12 02 aka ip[8] + punpckhdq mm3, mm4 ; 33 23 13 03 aka ip[12] + + paddw mm0, mm7 + paddw mm1, mm7 + paddw mm2, mm7 + paddw mm3, mm7 + + psraw mm0, 3 + psraw mm1, 3 + psraw mm2, 3 + psraw mm3, 3 + + movq [rdi + 0], mm0 + movq [rdi + 8], mm1 + movq [rdi + 16], mm2 + movq [rdi + 24], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + diff --git a/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm b/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm new file mode 100644 index 000000000..84fa2fe2a --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_iwalsh_sse2.asm @@ -0,0 +1,119 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp9_short_inv_walsh4x4_sse2(short *input, short *output) +global sym(vp9_short_inv_walsh4x4_sse2) PRIVATE +sym(vp9_short_inv_walsh4x4_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + SAVE_XMM 6 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) + mov rdi, arg(1) + mov rax, 3 + + movdqa xmm0, [rsi + 0] ;ip[4] ip[0] + movdqa xmm1, [rsi + 16] ;ip[12] ip[8] + + shl rax, 16 + or rax, 3 ;00030003h + + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm0 ;ip[4] ip[0] + + paddw xmm0, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm4, xmm0 + punpcklqdq xmm0, xmm3 ;d1 a1 + punpckhqdq xmm4, xmm3 ;c1 b1 + movd xmm6, eax + + movdqa xmm1, xmm4 ;c1 b1 + paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm0, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] + +;;;temp output +;; movdqu [rdi + 0], xmm4 +;; movdqu [rdi + 16], xmm3 + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm3, xmm4 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm4, xmm0 ; 23 03 22 02 21 01 20 00 + punpckhwd xmm3, xmm0 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm4 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm4, xmm3 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm3 ; 33 23 13 03 32 22 12 02 + ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] + movdqa xmm3, xmm4 ;ip[4] ip[0] + + pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03 + + paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 + psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 + + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm3 ;d1 a1 + punpckhqdq xmm5, xmm3 ;c1 b1 + + movdqa xmm1, xmm5 ;c1 b1 + paddw xmm5, xmm4 ;dl+cl a1+b1 aka op[4] op[0] + psubw xmm4, xmm1 ;d1-c1 a1-b1 aka op[12] op[8] +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + ; 13 12 11 10 03 02 01 00 + ; + ; 33 32 31 30 23 22 21 20 + ; + movdqa xmm0, xmm5 ; 13 12 11 10 03 02 01 00 + punpcklwd xmm5, xmm4 ; 23 
03 22 02 21 01 20 00 + punpckhwd xmm0, xmm4 ; 33 13 32 12 31 11 30 10 + movdqa xmm1, xmm5 ; 23 03 22 02 21 01 20 00 + punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 + punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + paddw xmm5, xmm6 + paddw xmm1, xmm6 + + psraw xmm5, 3 + psraw xmm1, 3 + + movdqa [rdi + 0], xmm5 + movdqa [rdi + 16], xmm1 + + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +x_s1sqr2: + times 4 dw 0x8A8C +align 16 +x_c1sqr2less1: + times 4 dw 0x4E7B +align 16 +fours: + times 4 dw 0x0004 diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c new file mode 100644 index 000000000..50f890ab8 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -0,0 +1,1013 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <emmintrin.h> /* SSE2 */ +#include "vp9/common/vp9_loopfilter.h" +#include "vpx_ports/emmintrin_compat.h" + +prototype_loopfilter(vp9_loop_filter_vertical_edge_sse2); +prototype_loopfilter(vp9_loop_filter_horizontal_edge_sse2); + +extern loop_filter_uvfunction vp9_loop_filter_horizontal_edge_uv_sse2; +extern loop_filter_uvfunction vp9_loop_filter_vertical_edge_uv_sse2; + +void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh) { + DECLARE_ALIGNED(16, unsigned char, flat2_op[7][8]); + DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][8]); + + DECLARE_ALIGNED(16, unsigned char, flat_op[3][8]); + DECLARE_ALIGNED(16, unsigned char, flat_oq[3][8]); + + DECLARE_ALIGNED(16, unsigned char, ap[8][8]); + DECLARE_ALIGNED(16, unsigned char, aq[8][8]); + + + __m128i mask, hev, flat, flat2; + const __m128i zero = _mm_set1_epi16(0); + const __m128i one = _mm_set1_epi8(1); + __m128i p7, p6, p5; + __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; + __m128i q5, q6, q7; + int i = 0; + const unsigned int extended_thresh = _thresh[0] * 0x01010101u; + const unsigned int extended_limit = _limit[0] * 0x01010101u; + const unsigned int extended_blimit = _blimit[0] * 0x01010101u; + const __m128i thresh = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); + const __m128i limit = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); + const __m128i blimit = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); + + p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + q4 = 
_mm_loadl_epi64((__m128i *)(s + 4 * p)); + + _mm_storel_epi64((__m128i *)ap[4], p4); + _mm_storel_epi64((__m128i *)ap[3], p3); + _mm_storel_epi64((__m128i *)ap[2], p2); + _mm_storel_epi64((__m128i *)ap[1], p1); + _mm_storel_epi64((__m128i *)ap[0], p0); + _mm_storel_epi64((__m128i *)aq[4], q4); + _mm_storel_epi64((__m128i *)aq[3], q3); + _mm_storel_epi64((__m128i *)aq[2], q2); + _mm_storel_epi64((__m128i *)aq[1], q1); + _mm_storel_epi64((__m128i *)aq[0], q0); + + + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const 
__m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + __m128i ps1 = _mm_xor_si128(p1, t80); + __m128i ps0 = _mm_xor_si128(p0, t80); + __m128i qs0 = _mm_xor_si128(q0, t80); + __m128i qs1 = _mm_xor_si128(q1, t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = _mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + filt = _mm_andnot_si128(hev, filt); + ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + // loopfilter done + + { + __m128i work; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), + _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), + _mm_subs_epu8(q0, q2))); + flat = 
_mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), + _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), + _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0), + _mm_subs_epu8(p0, p4)), + _mm_or_si128(_mm_subs_epu8(q4, q0), + _mm_subs_epu8(q0, q4))); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + + p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); + q5 = _mm_loadl_epi64((__m128i *)(s + 5 * p)); + flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0), + _mm_subs_epu8(p0, p5)), + _mm_or_si128(_mm_subs_epu8(q5, q0), + _mm_subs_epu8(q0, q5))); + _mm_storel_epi64((__m128i *)ap[5], p5); + _mm_storel_epi64((__m128i *)aq[5], q5); + flat2 = _mm_max_epu8(work, flat2); + p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); + q6 = _mm_loadl_epi64((__m128i *)(s + 6 * p)); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0), + _mm_subs_epu8(p0, p6)), + _mm_or_si128(_mm_subs_epu8(q6, q0), + _mm_subs_epu8(q0, q6))); + _mm_storel_epi64((__m128i *)ap[6], p6); + _mm_storel_epi64((__m128i *)aq[6], q6); + flat2 = _mm_max_epu8(work, flat2); + + p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); + q7 = _mm_loadl_epi64((__m128i *)(s + 7 * p)); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0), + _mm_subs_epu8(p0, p7)), + _mm_or_si128(_mm_subs_epu8(q7, q0), + _mm_subs_epu8(q0, q7))); + _mm_storel_epi64((__m128i *)ap[7], p7); + _mm_storel_epi64((__m128i *)aq[7], q7); + flat2 = _mm_max_epu8(work, flat2); + flat2 = _mm_subs_epu8(flat2, one); + flat2 = _mm_cmpeq_epi8(flat2, zero); + flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask + } + + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // flat and wide flat calculations + { + const __m128i eight = _mm_set1_epi16(8); + const __m128i four = _mm_set1_epi16(4); + { + __m128i workp_shft; + __m128i a, b, c; + + p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i 
*)(ap[7])), zero); + p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6])), zero); + p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5])), zero); + p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4])), zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3])), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2])), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1])), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0])), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0])), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1])), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2])), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3])), zero); + q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4])), zero); + q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5])), zero); + q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6])), zero); + q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7])), zero); + + c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 + c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); + + b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2)); + a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); + a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); + + _mm_storel_epi64((__m128i *)&flat_op[2][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q1, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); + _mm_storel_epi64((__m128i *)&flat_op[1][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + 
_mm_storel_epi64((__m128i *)&flat2_op[5][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q2, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); + _mm_storel_epi64((__m128i *)&flat_op[0][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q3, a); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); + _mm_storel_epi64((__m128i *)&flat_oq[0][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + b = _mm_add_epi16(q3, b); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); + _mm_storel_epi64((__m128i *)&flat_oq[1][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + + c = _mm_add_epi16(q4, c); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + b = _mm_add_epi16(q3, b); + b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); + _mm_storel_epi64((__m128i *)&flat_oq[2][i*8], + _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) + , b)); + a = _mm_add_epi16(q5, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q6, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); + workp_shft = 
_mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + + a = _mm_add_epi16(q7, a); + c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); + workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); + _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], + _mm_packus_epi16(workp_shft, workp_shft)); + } + } + // wide flat + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + work_a = _mm_loadl_epi64((__m128i *)ap[2]); + p2 = 
_mm_loadl_epi64((__m128i *)flat_op[2]); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + _mm_storel_epi64((__m128i *)flat_op[2], p2); + + p1 = _mm_loadl_epi64((__m128i *)flat_op[1]); + work_a = _mm_andnot_si128(flat, ps1); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + _mm_storel_epi64((__m128i *)flat_op[1], p1); + + p0 = _mm_loadl_epi64((__m128i *)flat_op[0]); + work_a = _mm_andnot_si128(flat, ps0); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + _mm_storel_epi64((__m128i *)flat_op[0], p0); + + q0 = _mm_loadl_epi64((__m128i *)flat_oq[0]); + work_a = _mm_andnot_si128(flat, qs0); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + _mm_storel_epi64((__m128i *)flat_oq[0], q0); + + q1 = _mm_loadl_epi64((__m128i *)flat_oq[1]); + work_a = _mm_andnot_si128(flat, qs1); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + _mm_storel_epi64((__m128i *)flat_oq[1], q1); + + work_a = _mm_loadl_epi64((__m128i *)aq[2]); + q2 = _mm_loadl_epi64((__m128i *)flat_oq[2]); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + _mm_storel_epi64((__m128i *)flat_oq[2], q2); + + // write out op6 - op3 + { + unsigned char *dst = (s - 7 * p); + for (i = 6; i > 2; i--) { + __m128i flat2_output; + work_a = _mm_loadl_epi64((__m128i *)ap[i]); + flat2_output = _mm_loadl_epi64((__m128i *)flat2_op[i]); + work_a = _mm_andnot_si128(flat2, work_a); + flat2_output = _mm_and_si128(flat2, flat2_output); + work_a = _mm_or_si128(work_a, flat2_output); + _mm_storel_epi64((__m128i *)dst, work_a); + dst += p; + } + } + + work_a = _mm_loadl_epi64((__m128i *)flat_op[2]); + p2 = _mm_loadl_epi64((__m128i *)flat2_op[2]); + work_a = _mm_andnot_si128(flat2, work_a); + p2 = _mm_and_si128(flat2, p2); + p2 = _mm_or_si128(work_a, p2); + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + + work_a = _mm_loadl_epi64((__m128i *)flat_op[1]); + p1 = 
_mm_loadl_epi64((__m128i *)flat2_op[1]); + work_a = _mm_andnot_si128(flat2, work_a); + p1 = _mm_and_si128(flat2, p1); + p1 = _mm_or_si128(work_a, p1); + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + + work_a = _mm_loadl_epi64((__m128i *)flat_op[0]); + p0 = _mm_loadl_epi64((__m128i *)flat2_op[0]); + work_a = _mm_andnot_si128(flat2, work_a); + p0 = _mm_and_si128(flat2, p0); + p0 = _mm_or_si128(work_a, p0); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + + work_a = _mm_loadl_epi64((__m128i *)flat_oq[0]); + q0 = _mm_loadl_epi64((__m128i *)flat2_oq[0]); + work_a = _mm_andnot_si128(flat2, work_a); + q0 = _mm_and_si128(flat2, q0); + q0 = _mm_or_si128(work_a, q0); + _mm_storel_epi64((__m128i *)(s - 0 * p), q0); + + work_a = _mm_loadl_epi64((__m128i *)flat_oq[1]); + q1 = _mm_loadl_epi64((__m128i *)flat2_oq[1]); + work_a = _mm_andnot_si128(flat2, work_a); + q1 = _mm_and_si128(flat2, q1); + q1 = _mm_or_si128(work_a, q1); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + + work_a = _mm_loadl_epi64((__m128i *)flat_oq[2]); + q2 = _mm_loadl_epi64((__m128i *)flat2_oq[2]); + work_a = _mm_andnot_si128(flat2, work_a); + q2 = _mm_and_si128(flat2, q2); + q2 = _mm_or_si128(work_a, q2); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + + // write out oq3 - oq7 + { + unsigned char *dst = (s + 3 * p); + for (i = 3; i < 7; i++) { + __m128i flat2_output; + work_a = _mm_loadl_epi64((__m128i *)aq[i]); + flat2_output = _mm_loadl_epi64((__m128i *)flat2_oq[i]); + work_a = _mm_andnot_si128(flat2, work_a); + flat2_output = _mm_and_si128(flat2, flat2_output); + work_a = _mm_or_si128(work_a, flat2_output); + _mm_storel_epi64((__m128i *)dst, work_a); + dst += p; + } + } + } +} + +void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, + int p, + const unsigned char *_blimit, + const unsigned char *_limit, + const unsigned char *_thresh, + int count) { + DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); + DECLARE_ALIGNED(16, unsigned char, 
flat_op0[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); + DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); + __m128i mask, hev, flat; + const __m128i zero = _mm_set1_epi16(0); + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + const unsigned int extended_thresh = _thresh[0] * 0x01010101u; + const unsigned int extended_limit = _limit[0] * 0x01010101u; + const unsigned int extended_blimit = _blimit[0] * 0x01010101u; + const __m128i thresh = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); + const __m128i limit = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); + const __m128i blimit = + _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); + + (void)count; + p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); + p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); + p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); + p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); + q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); + q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); + q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); + q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = 
_mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), + _mm_subs_epu8(p0, p2)), + _mm_or_si128(_mm_subs_epu8(q2, q0), + _mm_subs_epu8(q0, q2))); + flat = _mm_max_epu8(work, flat); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0), + _mm_subs_epu8(p0, p3)), + _mm_or_si128(_mm_subs_epu8(q3, q0), + _mm_subs_epu8(q0, q3))); + flat = _mm_max_epu8(work, flat); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, 
four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + } + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = 
_mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                                    t80);
  const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)),
                                    t80);
  const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)),
                                    t80);
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  /* Standard 4-tap filter value: clamp(ps1 - qs1) only where the
   * high-edge-variance mask (hev) is set, then saturating-add
   * 3 * (qs0 - ps0). */
  filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
  work_a = _mm_subs_epi8(qs0, ps0);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  filt = _mm_adds_epi8(filt, work_a);
  /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi8(filt, t4);
  filter2 = _mm_adds_epi8(filt, t3);

  /* Filter1 >> 3 */
  /* SSE2 has no 8-bit arithmetic shift: do a 16-bit logical shift and
   * restore the sign bits of negative bytes via the te0/t1f masks. */
  work_a = _mm_cmpgt_epi8(zero, filter1);
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, te0);
  filter1 = _mm_and_si128(filter1, t1f);
  filter1 = _mm_or_si128(filter1, work_a);

  /* Filter2 >> 3 */
  work_a = _mm_cmpgt_epi8(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, te0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);

  /* filt >> 1 */
  filt = _mm_adds_epi8(filter1, t1);
  work_a = _mm_cmpgt_epi8(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, t80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);

  /* Outer taps (p1/q1) are only adjusted where hev is NOT set. */
  filt = _mm_andnot_si128(hev, filt);

  /* Select per pixel between the 4-tap result and the wide-filter output
   * (flat_o* buffers, computed earlier in this function) using the
   * precomputed flat mask; t80 xor converts back to unsigned. */
  work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
  q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
  work_a = _mm_andnot_si128(flat, work_a);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);

  work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
  q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
  work_a = _mm_andnot_si128(flat, work_a);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);

  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
  work_a = _mm_andnot_si128(flat, work_a);
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);

  work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
  p0 = _mm_loadl_epi64((__m128i *)flat_op0);
  work_a = _mm_andnot_si128(flat, work_a);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);

  work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
  p1 = _mm_loadl_epi64((__m128i *)flat_op1);
  work_a = _mm_andnot_si128(flat, work_a);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);

  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p2 = _mm_loadl_epi64((__m128i *)flat_op2);
  work_a = _mm_andnot_si128(flat, work_a);
  p2 = _mm_and_si128(flat, p2);
  p2 = _mm_or_si128(work_a, p2);

  /* Write back the three filtered pixels on each side of the edge. */
  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
  _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
  _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
  _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
  _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
  }
}

/* Horizontal 8-tap (macroblock) loop filter for the chroma planes.
 * The 8-pixel-wide u and v rows are packed side by side into a 16-byte-wide
 * aligned scratch buffer (stride 16) so both planes are filtered with a
 * single call to vp9_mbloop_filter_horizontal_edge_sse2(), then the six
 * modified rows are unpacked back to u and v. */
void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
                                               int p,
                                               const unsigned char *_blimit,
                                               const unsigned char *_limit,
                                               const unsigned char *_thresh,
                                               unsigned char *v) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, src, 160);

  /* Read source: each register holds u-row (low 8 bytes) | v-row (high 8). */
  const __m128i p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 5 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 5 * p)));
  const __m128i p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 4 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 4 * p)));
  const __m128i p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 3 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 3 * p)));
  const __m128i p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 2 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 2 * p)));
  const __m128i p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u - 1 * p)),
                                        _mm_loadl_epi64((__m128i *)(v - 1 * p)));
  const __m128i q0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u)),
                                        _mm_loadl_epi64((__m128i *)(v)));
  const __m128i q1 =
      _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 1 * p)),
                         _mm_loadl_epi64((__m128i *)(v + 1 * p)));
  const __m128i q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 2 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 2 * p)));
  const __m128i q3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 3 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 3 * p)));
  const __m128i q4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(u + 4 * p)),
                                        _mm_loadl_epi64((__m128i *)(v + 4 * p)));

  /* Pack the ten interleaved u|v rows contiguously (stride 16); the edge
   * row q0 lands at offset 80. */
  _mm_store_si128((__m128i *)(src), p4);
  _mm_store_si128((__m128i *)(src + 16), p3);
  _mm_store_si128((__m128i *)(src + 32), p2);
  _mm_store_si128((__m128i *)(src + 48), p1);
  _mm_store_si128((__m128i *)(src + 64), p0);
  _mm_store_si128((__m128i *)(src + 80), q0);
  _mm_store_si128((__m128i *)(src + 96), q1);
  _mm_store_si128((__m128i *)(src + 112), q2);
  _mm_store_si128((__m128i *)(src + 128), q3);
  _mm_store_si128((__m128i *)(src + 144), q4);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(src + 80, 16, _blimit, _limit,
                                         _thresh, 1);

  /* Store result: only rows -3..+2 relative to the edge are modified. */
  _mm_storel_epi64((__m128i *)(u - 3 * p),
                   _mm_loadl_epi64((__m128i *)(src + 32)));
  _mm_storel_epi64((__m128i *)(u - 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 48)));
  _mm_storel_epi64((__m128i *)(u - p),
                   _mm_loadl_epi64((__m128i *)(src + 64)));
  _mm_storel_epi64((__m128i *)u,
                   _mm_loadl_epi64((__m128i *)(src + 80)));
  _mm_storel_epi64((__m128i *)(u + p),
                   _mm_loadl_epi64((__m128i *)(src + 96)));
  _mm_storel_epi64((__m128i *)(u + 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 112)));

  /* v rows live in the upper 8 bytes of each packed row (offset + 8). */
  _mm_storel_epi64((__m128i *)(v - 3 * p),
                   _mm_loadl_epi64((__m128i *)(src + 40)));
  _mm_storel_epi64((__m128i *)(v - 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 56)));
  _mm_storel_epi64((__m128i *)(v - p),
                   _mm_loadl_epi64((__m128i *)(src + 72)));
  _mm_storel_epi64((__m128i *)v,
                   _mm_loadl_epi64((__m128i *)(src + 88)));
  _mm_storel_epi64((__m128i *)(v + p),
                   _mm_loadl_epi64((__m128i *)(src + 104)));
  _mm_storel_epi64((__m128i *)(v + 2 * p),
                   _mm_loadl_epi64((__m128i *)(src + 120)));
}

/* Transpose two vertically adjacent 8x8 byte tiles (rows of in0 on top of
 * rows of in1, both with stride in_p) into a 16-column-wide destination:
 * output row i (stride out_p) receives byte column i of the 16 input rows.
 * Built from byte -> word -> dword -> qword unpack stages. */
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  /* Read in 16 lines */
  x0 = _mm_loadl_epi64((__m128i *)in0);
  x8 = _mm_loadl_epi64((__m128i *)in1);
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3*in_p));
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3*in_p));
  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4*in_p));
  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4*in_p));
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5*in_p));
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5*in_p));
  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6*in_p));
  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6*in_p));
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7*in_p));
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7*in_p));

  /* Interleave bytes of adjacent rows (stage 1 of the transpose). */
  x0 = _mm_unpacklo_epi8(x0, x1);
  x1 = _mm_unpacklo_epi8(x2, x3);
  x2 = _mm_unpacklo_epi8(x4, x5);
  x3 = _mm_unpacklo_epi8(x6, x7);

  x8 = _mm_unpacklo_epi8(x8, x9);
  x9 = _mm_unpacklo_epi8(x10, x11);
  x10 = _mm_unpacklo_epi8(x12, x13);
  x11 = _mm_unpacklo_epi8(x14, x15);

  /* Stage 2: interleave 16-bit pairs. */
  x4 = _mm_unpacklo_epi16(x0, x1);
  x5 = _mm_unpacklo_epi16(x2, x3);
  x12 = _mm_unpacklo_epi16(x8, x9);
  x13 = _mm_unpacklo_epi16(x10, x11);

  /* Stage 3: interleave 32-bit quads. */
  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store first 4-line result */
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  /* Repeat stages 2-3 on the high halves for columns 4..7. */
  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  /* Store second 4-line result */
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}

/* Transpose num_8x8_to_transpose independent 8x8 byte tiles:
 * src[i] (stride in_p) is transposed into dst[i] (stride out_p). */
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 = _mm_loadl_epi64((__m128i *)(in + 0*in_p));  // 00 01 02 03 04 05 06 07
    x1 = _mm_loadl_epi64((__m128i *)(in + 1*in_p));  // 10 11 12 13 14 15 16 17
    x2 = _mm_loadl_epi64((__m128i *)(in + 2*in_p));  // 20 21 22 23 24 25 26 27
    x3 = _mm_loadl_epi64((__m128i *)(in + 3*in_p));  // 30 31 32 33 34 35 36 37
    x4 = _mm_loadl_epi64((__m128i *)(in + 4*in_p));  // 40 41 42 43 44 45 46 47
    x5 = _mm_loadl_epi64((__m128i *)(in + 5*in_p));  // 50 51 52 53 54 55 56 57
    x6 = _mm_loadl_epi64((__m128i *)(in + 6*in_p));  // 60 61 62 63 64 65 66 67
    x7 = _mm_loadl_epi64((__m128i *)(in + 7*in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);
    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 =
_mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);

    /* Each 128-bit register now holds two output rows; store them with
     * 64-bit low/high stores. */
    _mm_storel_pd((double *)(out + 0*out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1*out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    _mm_storel_pd((double *)(out + 2*out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3*out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 4*out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5*out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    _mm_storel_pd((double *)(out + 6*out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7*out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

/* Vertical 8-tap (macroblock) loop filter. The 16x16 neighbourhood around
 * the vertical edge is transposed into a scratch buffer so the edge becomes
 * horizontal, the horizontal mb filter is run, and the six modified columns
 * (rows 3..10 of the scratch buffer, i.e. s-5..s+2) are transposed back. */
void vp9_mbloop_filter_vertical_edge_sse2(unsigned char *s,
                                          int p,
                                          const unsigned char *blimit,
                                          const unsigned char *limit,
                                          const unsigned char *thresh,
                                          int count) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  (void)count;  /* count is unused by this implementation */
  /* Transpose 16x16 */
  transpose8x16(s - 8, s - 8 + p * 8, p, t_dst, 16);
  transpose8x16(s, s + p * 8, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh, 1);
  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = s - 5;
  dst[1] = s - 5 + p * 8;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}

/* Vertical wide loop filter for an 8-row edge: transpose the 8x16 strip
 * (8 rows, columns s-8..s+7) into a 16x8 scratch image, run the wide
 * horizontal filter at the edge row, and transpose everything back. */
void vp9_mb_lpf_vertical_edge_w_sse2(unsigned char *s,
                                     int p,
                                     const unsigned char *blimit,
                                     const unsigned char *limit,
                                     const unsigned char *thresh) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[4];
  unsigned char *dst[4];

  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 16;

  src[0] = s - 8;
  src[1] = s - 8 + 8;

  /* Transpose 16x16 */
  transpose(src, p, dst, 16, 2);

  /* Loop filtering */
  vp9_mb_lpf_horizontal_edge_w_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                    thresh);

  src[0] = t_dst;
  src[1] = t_dst + 8 * 16;

  dst[0] = s - 8;
  dst[1] = s - 8 + 8;

  transpose(src, 16, dst, p, 2);
}


/* Vertical 8-tap (macroblock) chroma loop filter: u and v 8x16 strips are
 * transposed side by side into one 16x16 scratch image, filtered with a
 * single horizontal mb filter call, and the modified columns transposed
 * back into each plane. */
void vp9_mbloop_filter_vertical_edge_uv_sse2(unsigned char *u,
                                             int p,
                                             const unsigned char *blimit,
                                             const unsigned char *limit,
                                             const unsigned char *thresh,
                                             unsigned char *v) {
  DECLARE_ALIGNED_ARRAY(16, unsigned char, t_dst, 256);
  unsigned char *src[2];
  unsigned char *dst[2];

  /* Transpose 16x16 */
  transpose8x16(u - 8, v - 8, p, t_dst, 16);
  transpose8x16(u, v, p, t_dst + 16 * 8, 16);

  /* Loop filtering */
  vp9_mbloop_filter_horizontal_edge_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh, 1);

  src[0] = t_dst + 3 * 16;
  src[1] = t_dst + 3 * 16 + 8;

  dst[0] = u - 5;
  dst[1] = v - 5;

  /* Transpose 16x8 */
  transpose(src, 16, dst, p, 2);
}
diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm b/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
new file mode 100644
index 000000000..4ebb51b77
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -0,0 +1,626 @@
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree.
; An additional intellectual property rights grant can be found
; in the file PATENTS.  All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"


;void vp9_loop_filter_horizontal_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
; 4-tap loop filter across a horizontal edge, 8 pixels per iteration of
; .next8_h (count iterations).  Builds the filter mask from the limit /
; blimit thresholds, the high-edge-variance (hev) mask from thresh, then
; applies the standard +4/+3 filter taps to p1,p0,q0,q1.
global sym(vp9_loop_filter_horizontal_edge_mmx) PRIVATE
sym(vp9_loop_filter_horizontal_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[8];

    mov         rsi, arg(0) ;src_ptr
    movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

    movsxd      rcx, dword ptr arg(5) ;count
.next8_h:
    mov         rdx, arg(3) ;limit
    movq        mm7, [rdx]
    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing
    add         rdi, rax

    ; calculate breakout conditions
    movq        mm2, [rdi+2*rax]                ; q3
    movq        mm1, [rsi+2*rax]                ; q2
    movq        mm6, mm1                        ; q2
    psubusb     mm1, mm2                        ; q2-=q3
    psubusb     mm2, mm6                        ; q3-=q2
    por         mm1, mm2                        ; abs(q3-q2)
    psubusb     mm1, mm7                        ;


    movq        mm4, [rsi+rax]                  ; q1
    movq        mm3, mm4                        ; q1
    psubusb     mm4, mm6                        ; q1-=q2
    psubusb     mm6, mm3                        ; q2-=q1
    por         mm4, mm6                        ; abs(q2-q1)

    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm4, [rsi]                      ; q0
    movq        mm0, mm4                        ; q0
    psubusb     mm4, mm3                        ; q0-=q1
    psubusb     mm3, mm0                        ; q1-=q0
    por         mm4, mm3                        ; abs(q0-q1)
    movq        t0, mm4                         ; save to t0
    psubusb     mm4, mm7
    por         mm1, mm4


    neg         rax                             ; negate pitch to deal with above border

    movq        mm2, [rsi+4*rax]                ; p3
    movq        mm4, [rdi+4*rax]                ; p2
    movq        mm5, mm4                        ; p2
    psubusb     mm4, mm2                        ; p2-=p3
    psubusb     mm2, mm5                        ; p3-=p2
    por         mm4, mm2                        ; abs(p3 - p2)
    psubusb     mm4, mm7
    por         mm1, mm4


    movq        mm4, [rsi+2*rax]                ; p1
    movq        mm3, mm4                        ; p1
    psubusb     mm4, mm5                        ; p1-=p2
    psubusb     mm5, mm3                        ; p2-=p1
    por         mm4, mm5                        ; abs(p2 - p1)
    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm2, mm3                        ; p1

    movq        mm4, [rsi+rax]                  ; p0
    movq        mm5, mm4                        ; p0
    psubusb     mm4, mm3                        ; p0-=p1
    psubusb     mm3, mm5                        ; p1-=p0
    por         mm4, mm3                        ; abs(p1 - p0)
    movq        t1, mm4                         ; save to t1
    psubusb     mm4, mm7
    por         mm1, mm4

    movq        mm3, [rdi]                      ; q1
    movq        mm4, mm3                        ; q1
    psubusb     mm3, mm2                        ; q1-=p1
    psubusb     mm2, mm4                        ; p1-=q1
    por         mm2, mm3                        ; abs(p1-q1)
    pand        mm2, [GLOBAL(tfe)]              ; set lsb of each byte to zero
    psrlw       mm2, 1                          ; abs(p1-q1)/2

    movq        mm6, mm5                        ; p0
    movq        mm3, [rsi]                      ; q0
    psubusb     mm5, mm3                        ; p0-=q0
    psubusb     mm3, mm6                        ; q0-=p0
    por         mm5, mm3                        ; abs(p0 - q0)
    paddusb     mm5, mm5                        ; abs(p0-q0)*2
    paddusb     mm5, mm2                        ; abs (p0 - q0) *2 + abs(p1-q1)/2

    mov         rdx, arg(2) ;blimit             ; get blimit
    movq        mm7, [rdx]                      ; blimit

    psubusb     mm5, mm7                        ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
    por         mm1, mm5
    pxor        mm5, mm5
    pcmpeqb     mm1, mm5                        ; mask mm1

    ; calculate high edge variance
    mov         rdx, arg(4) ;thresh             ; get thresh
    movq        mm7, [rdx]                      ;
    movq        mm4, t0                         ; get abs (q1 - q0)
    psubusb     mm4, mm7
    movq        mm3, t1                         ; get abs (p1 - p0)
    psubusb     mm3, mm7
    paddb       mm4, mm3                        ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

    pcmpeqb     mm4, mm5

    pcmpeqb     mm5, mm5
    pxor        mm4, mm5


    ; start work on filters
    movq        mm2, [rsi+2*rax]                ; p1
    movq        mm7, [rdi]                      ; q1
    pxor        mm2, [GLOBAL(t80)]              ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]              ; q1 offset to convert to signed values
    psubsb      mm2, mm7                        ; p1 - q1
    pand        mm2, mm4                        ; high var mask (hvm)(p1 - q1)
    pxor        mm6, [GLOBAL(t80)]              ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]              ; offset to convert to signed values
    movq        mm3, mm0                        ; q0
    psubsb      mm0, mm6                        ; q0 - p0
    paddsb      mm2, mm0                        ; 1 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 2 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                        ; 3 * (q0 - p0) + hvm(p1 - q1)
    pand        mm1, mm2                        ; mask filter values we don't care about
    movq        mm2, mm1
    paddsb      mm1, [GLOBAL(t4)]               ; 3* (q0 - p0) + hvm(p1 - q1) + 4
    paddsb      mm2, [GLOBAL(t3)]               ; 3* (q0 - p0) + hvm(p1 - q1) + 3

    ; >>3 on signed bytes done by widening to words and psraw by 11
    pxor        mm0, mm0                        ;
    pxor        mm5, mm5
    punpcklbw   mm0, mm2                        ;
    punpckhbw   mm5, mm2                        ;
    psraw       mm0, 11                         ;
    psraw       mm5, 11
    packsswb    mm0, mm5
    movq        mm2, mm0                        ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

    pxor        mm0, mm0                        ; 0
    movq        mm5, mm1                        ; abcdefgh
    punpcklbw   mm0, mm1                        ; e0f0g0h0
    psraw       mm0, 11                         ; sign extended shift right by 3
    pxor        mm1, mm1                        ; 0
    punpckhbw   mm1, mm5                        ; a0b0c0d0
    psraw       mm1, 11                         ; sign extended shift right by 3
    movq        mm5, mm0                        ; save results

    packsswb    mm0, mm1                        ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
    paddsw      mm5, [GLOBAL(ones)]
    paddsw      mm1, [GLOBAL(ones)]
    psraw       mm5, 1                          ; partial shifted one more time for 2nd tap
    psraw       mm1, 1                          ; partial shifted one more time for 2nd tap
    packsswb    mm5, mm1                        ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
    pandn       mm4, mm5                        ; high edge variance additive

    paddsb      mm6, mm2                        ; p0+= p0 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset
    movq        [rsi+rax], mm6                  ; write back

    movq        mm6, [rsi+2*rax]                ; p1
    pxor        mm6, [GLOBAL(t80)]              ; reoffset
    paddsb      mm6, mm4                        ; p1+= p1 add
    pxor        mm6, [GLOBAL(t80)]              ; unoffset
    movq        [rsi+2*rax], mm6                ; write back

    psubsb      mm3, mm0                        ; q0-= q0 add
    pxor        mm3, [GLOBAL(t80)]              ; unoffset
    movq        [rsi], mm3                      ; write back

    psubsb      mm7, mm4                        ; q1-= q1 add
    pxor        mm7, [GLOBAL(t80)]              ; unoffset
    movq        [rdi], mm7                      ; write back

    add         rsi,8
    neg         rax
    dec         rcx
    jnz         .next8_h

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp9_loop_filter_vertical_edge_mmx
;(
;    unsigned char *src_ptr,
;    int  src_pixel_step,
;    const char *blimit,
;    const char *limit,
;    const char *thresh,
;    int count
;)
; Same 4-tap filter across a VERTICAL edge: each .next8_v iteration
; transposes an 8x8 block in registers (saving p1,p0,q0,q1 to the srct
; scratch area), filters, then transposes the four modified columns back.
global sym(vp9_loop_filter_vertical_edge_mmx) PRIVATE
sym(vp9_loop_filter_vertical_edge_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub          rsp, 64                        ; reserve 64 bytes
    %define t0   [rsp + 0]    ;__declspec(align(16)) char t0[8];
    %define t1   [rsp + 16]   ;__declspec(align(16)) char t1[8];
    %define srct [rsp + 32]   ;__declspec(align(16)) char srct[32];

    mov         rsi, arg(0) ;src_ptr
    movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?

    lea         rsi, [rsi + rax*4 - 4]

    movsxd      rcx, dword ptr arg(5) ;count
.next8_v:
    mov         rdi, rsi                        ; rdi points to row +1 for indirect addressing
    add         rdi, rax


    ;transpose
    movq        mm6, [rsi+2*rax]                ; 67 66 65 64 63 62 61 60
    movq        mm7, mm6                        ; 77 76 75 74 73 72 71 70

    punpckhbw   mm7, [rdi+2*rax]                ; 77 67 76 66 75 65 74 64
    punpcklbw   mm6, [rdi+2*rax]                ; 73 63 72 62 71 61 70 60

    movq        mm4, [rsi]                      ; 47 46 45 44 43 42 41 40
    movq        mm5, mm4                        ; 47 46 45 44 43 42 41 40

    punpckhbw   mm5, [rsi+rax]                  ; 57 47 56 46 55 45 54 44
    punpcklbw   mm4, [rsi+rax]                  ; 53 43 52 42 51 41 50 40

    movq        mm3, mm5                        ; 57 47 56 46 55 45 54 44
    punpckhwd   mm5, mm7                        ; 77 67 57 47 76 66 56 46

    punpcklwd   mm3, mm7                        ; 75 65 55 45 74 64 54 44
    movq        mm2, mm4                        ; 53 43 52 42 51 41 50 40

    punpckhwd   mm4, mm6                        ; 73 63 53 43 72 62 52 42
    punpcklwd   mm2, mm6                        ; 71 61 51 41 70 60 50 40

    neg         rax
    movq        mm6, [rsi+rax*2]                ; 27 26 25 24 23 22 21 20

    movq        mm1, mm6                        ; 27 26 25 24 23 22 21 20
    punpckhbw   mm6, [rsi+rax]                  ; 37 27 36 36 35 25 34 24

    punpcklbw   mm1, [rsi+rax]                  ; 33 23 32 22 31 21 30 20
    movq        mm7, [rsi+rax*4];               ; 07 06 05 04 03 02 01 00

    punpckhbw   mm7, [rdi+rax*4]                ; 17 07 16 06 15 05 14 04
    movq        mm0, mm7                        ; 17 07 16 06 15 05 14 04

    punpckhwd   mm7, mm6                        ; 37 27 17 07 36 26 16 06
    punpcklwd   mm0, mm6                        ; 35 25 15 05 34 24 14 04

    movq        mm6, mm7                        ; 37 27 17 07 36 26 16 06
    punpckhdq   mm7, mm5                        ; 77 67 57 47 37 27 17 07 = q3

    punpckldq   mm6, mm5                        ; 76 66 56 46 36 26 16 06 = q2

    movq        mm5, mm6                        ; 76 66 56 46 36 26 16 06
    psubusb     mm5, mm7                        ; q2-q3

    psubusb     mm7, mm6                        ; q3-q2
    por         mm7, mm5;                       ; mm7=abs (q3-q2)

    movq        mm5, mm0                        ; 35 25 15 05 34 24 14 04
    punpckhdq   mm5, mm3                        ; 75 65 55 45 35 25 15 05 = q1

    punpckldq   mm0, mm3                        ; 74 64 54 44 34 24 15 04 = q0
    movq        mm3, mm5                        ; 75 65 55 45 35 25 15 05 = q1

    psubusb     mm3, mm6                        ; q1-q2
    psubusb     mm6, mm5                        ; q2-q1

    por         mm6, mm3                        ; mm6=abs(q2-q1)
    lea         rdx, srct

    movq        [rdx+24], mm5                   ; save q1
    movq        [rdx+16], mm0                   ; save q0

    movq        mm3, [rsi+rax*4]                ; 07 06 05 04 03 02 01 00
    punpcklbw   mm3, [rdi+rax*4]                ; 13 03 12 02 11 01 10 00

    movq        mm0, mm3                        ; 13 03 12 02 11 01 10 00
    punpcklwd   mm0, mm1                        ; 31 21 11 01 30 20 10 00

    punpckhwd   mm3, mm1                        ; 33 23 13 03 32 22 12 02
    movq        mm1, mm0                        ; 31 21 11 01 30 20 10 00

    punpckldq   mm0, mm2                        ; 70 60 50 40 30 20 10 00  =p3
    punpckhdq   mm1, mm2                        ; 71 61 51 41 31 21 11 01  =p2

    movq        mm2, mm1                        ; 71 61 51 41 31 21 11 01  =p2
    psubusb     mm2, mm0                        ; p2-p3

    psubusb     mm0, mm1                        ; p3-p2
    por         mm0, mm2                        ; mm0=abs(p3-p2)

    movq        mm2, mm3                        ; 33 23 13 03 32 22 12 02
    punpckldq   mm2, mm4                        ; 72 62 52 42 32 22 12 02 = p1

    punpckhdq   mm3, mm4                        ; 73 63 53 43 33 23 13 03 = p0
    movq        [rdx+8], mm3                    ; save p0

    movq        [rdx], mm2                      ; save p1
    movq        mm5, mm2                        ; mm5 = p1

    psubusb     mm2, mm1                        ; p1-p2
    psubusb     mm1, mm5                        ; p2-p1

    por         mm1, mm2                        ; mm1=abs(p2-p1)
    mov         rdx, arg(3) ;limit

    movq        mm4, [rdx]                      ; mm4 = limit
    psubusb     mm7, mm4

    psubusb     mm0, mm4
    psubusb     mm1, mm4

    psubusb     mm6, mm4
    por         mm7, mm6

    por         mm0, mm1
    por         mm0, mm7                        ;   abs(q3-q2) > limit || abs(p3-p2) > limit ||abs(p2-p1) > limit || abs(q2-q1) > limit

    movq        mm1, mm5                        ; p1

    movq        mm7, mm3                        ; mm3=mm7=p0
    psubusb     mm7, mm5                        ; p0 - p1

    psubusb     mm5, mm3                        ; p1 - p0
    por         mm5, mm7                        ; abs(p1-p0)

    movq        t0, mm5                         ; save abs(p1-p0)
    lea         rdx, srct

    psubusb     mm5, mm4
    por         mm0, mm5                        ; mm0=mask

    movq        mm5, [rdx+16]                   ; mm5=q0
    movq        mm7, [rdx+24]                   ; mm7=q1

    movq        mm6, mm5                        ; mm6=q0
    movq        mm2, mm7                        ; q1
    psubusb     mm5, mm7                        ; q0-q1

    psubusb     mm7, mm6                        ; q1-q0
    por         mm7, mm5                        ; abs(q1-q0)

    movq        t1, mm7                         ; save abs(q1-q0)
    psubusb     mm7, mm4

    por         mm0, mm7                        ; mask

    movq        mm5, mm2                       ; q1
    psubusb     mm5, mm1                       ; q1-=p1
    psubusb     mm1, mm2                       ; p1-=q1
    por         mm5, mm1                       ; abs(p1-q1)
    pand        mm5, [GLOBAL(tfe)]             ; set lsb of each byte to zero
    psrlw       mm5, 1                         ; abs(p1-q1)/2

    mov         rdx, arg(2) ;blimit            ;

    movq        mm4, [rdx]                     ;blimit
    movq        mm1, mm3                       ; mm1=mm3=p0

    movq        mm7, mm6                       ; mm7=mm6=q0
    psubusb     mm1, mm7                       ; p0-q0

    psubusb     mm7, mm3                       ; q0-p0
    por         mm1, mm7                       ; abs(q0-p0)
    paddusb     mm1, mm1                       ; abs(q0-p0)*2
    paddusb     mm1, mm5                       ; abs (p0 - q0) *2 + abs(p1-q1)/2

    psubusb     mm1, mm4                       ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
    por         mm1, mm0;                      ; mask

    pxor        mm0, mm0
    pcmpeqb     mm1, mm0

    ; calculate high edge variance
    mov         rdx, arg(4) ;thresh            ; get thresh
    movq        mm7, [rdx]
    ;
    movq        mm4, t0                        ; get abs (q1 - q0)
    psubusb     mm4, mm7

    movq        mm3, t1                        ; get abs (p1 - p0)
    psubusb     mm3, mm7

    por         mm4, mm3                       ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
    pcmpeqb     mm4, mm0

    pcmpeqb     mm0, mm0
    pxor        mm4, mm0



    ; start work on filters
    lea         rdx, srct

    movq        mm2, [rdx]                     ; p1
    movq        mm7, [rdx+24]                  ; q1

    movq        mm6, [rdx+8]                   ; p0
    movq        mm0, [rdx+16]                  ; q0

    pxor        mm2, [GLOBAL(t80)]             ; p1 offset to convert to signed values
    pxor        mm7, [GLOBAL(t80)]             ; q1 offset to convert to signed values

    psubsb      mm2, mm7                       ; p1 - q1
    pand        mm2, mm4                       ; high var mask (hvm)(p1 - q1)

    pxor        mm6, [GLOBAL(t80)]             ; offset to convert to signed values
    pxor        mm0, [GLOBAL(t80)]             ; offset to convert to signed values

    movq        mm3, mm0                       ; q0
    psubsb      mm0, mm6                       ; q0 - p0

    paddsb      mm2, mm0                       ; 1 * (q0 - p0) + hvm(p1 - q1)
    paddsb      mm2, mm0                       ; 2 * (q0 - p0) + hvm(p1 - q1)

    paddsb      mm2, mm0                       ; 3 * (q0 - p0) + hvm(p1 - q1)
    pand        mm1, mm2                       ; mask filter values we don't care about

    movq        mm2, mm1
    paddsb      mm1, [GLOBAL(t4)]              ; 3* (q0 - p0) + hvm(p1 - q1) + 4

    paddsb      mm2, [GLOBAL(t3)]              ; 3* (q0 - p0) + hvm(p1 - q1) + 3
    pxor        mm0, mm0                       ;

    pxor        mm5, mm5
    punpcklbw   mm0, mm2                       ;

    punpckhbw   mm5, mm2                       ;
    psraw       mm0, 11                        ;

    psraw       mm5, 11
    packsswb    mm0, mm5

    movq        mm2, mm0                       ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;

    pxor        mm0, mm0                       ; 0
    movq        mm5, mm1                       ; abcdefgh

    punpcklbw   mm0, mm1                       ; e0f0g0h0
    psraw       mm0, 11                        ; sign extended shift right by 3

    pxor        mm1, mm1                       ; 0
    punpckhbw   mm1, mm5                       ; a0b0c0d0

    psraw       mm1, 11                        ; sign extended shift right by 3
    movq        mm5, mm0                       ; save results

    packsswb    mm0, mm1                       ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
    paddsw      mm5, [GLOBAL(ones)]

    paddsw      mm1, [GLOBAL(ones)]
    psraw       mm5, 1                         ; partial shifted one more time for 2nd tap

    psraw       mm1, 1                         ; partial shifted one more time for 2nd tap
    packsswb    mm5, mm1                       ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

    pandn       mm4, mm5                       ; high edge variance additive

    paddsb      mm6, mm2                       ; p0+= p0 add
    pxor        mm6, [GLOBAL(t80)]             ; unoffset

    ; mm6=p0                                   ;
    movq        mm1, [rdx]                     ; p1
    pxor        mm1, [GLOBAL(t80)]             ; reoffset

    paddsb      mm1, mm4                       ; p1+= p1 add
    pxor        mm1, [GLOBAL(t80)]             ; unoffset
    ; mm6 = p0 mm1 = p1

    psubsb      mm3, mm0                       ; q0-= q0 add
    pxor        mm3, [GLOBAL(t80)]             ; unoffset

    ; mm3 = q0
    psubsb      mm7, mm4                       ; q1-= q1 add
    pxor        mm7, [GLOBAL(t80)]             ; unoffset
    ; mm7 = q1

    ; tranpose and write back
    ; mm1 =    72 62 52 42 32 22 12 02
    ; mm6 =    73 63 53 43 33 23 13 03
    ; mm3 =    74 64 54 44 34 24 14 04
    ; mm7 =    75 65 55 45 35 25 15 05

    movq        mm2, mm1                       ; 72 62 52 42 32 22 12 02
    punpcklbw   mm2, mm6                       ; 33 32 23 22 13 12 03 02

    movq        mm4, mm3                       ; 74 64 54 44 34 24 14 04
    punpckhbw   mm1, mm6                       ; 73 72 63 62 53 52 43 42

    punpcklbw   mm4, mm7                       ; 35 34 25 24 15 14 05 04
    punpckhbw   mm3, mm7                       ; 75 74 65 64 55 54 45 44

    movq        mm6, mm2                       ; 33 32 23 22 13 12 03 02
    punpcklwd   mm2, mm4                       ; 15 14 13 12 05 04 03 02

    punpckhwd   mm6, mm4                       ; 35 34 33 32 25 24 23 22
    movq        mm5, mm1                       ; 73 72 63 62 53 52 43 42

    punpcklwd   mm1, mm3                       ; 55 54 53 52 45 44 43 42
    punpckhwd   mm5, mm3                       ; 75 74 73 72 65 64 63 62


    ; mm2 = 15 14 13 12 05 04 03 02
    ; mm6 = 35 34 33 32 25 24 23 22
    ; mm5 = 55 54 53 52 45 44 43 42
    ; mm1 = 75 74 73 72 65 64 63 62



    movd        [rsi+rax*4+2], mm2
    psrlq       mm2, 32

    movd        [rdi+rax*4+2], mm2
    movd        [rsi+rax*2+2], mm6

    psrlq       mm6, 32
    movd        [rsi+rax+2],mm6

    movd        [rsi+2], mm1
    psrlq       mm1, 32

    movd        [rdi+2], mm1
    neg         rax

    movd        [rdi+rax+2],mm5
    psrlq       mm5, 32

    movd        [rdi+rax*2+2], mm5

    lea         rsi, [rsi+rax*8]
    dec         rcx
    jnz         .next8_v

    add rsp, 64
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; Read-only constants used by both filters above.
SECTION_RODATA
align 16
tfe:
    times 8 db 0xfe
align 16
t80:
    times 8 db 0x80
align 16
t1s:
    times 8 db 0x01
align 16
t3:
    times 8 db 0x03
align 16
t4:
    times 8 db 0x04
align 16
ones:
    times 4 dw 0x0001
align 16
s27:
    times 4 dw 0x1b00
align 16
s18:
    times 4 dw 0x1200
align 16
s9:
    times 4 dw 0x0900
align 16
s63:
    times 4 dw 0x003f
diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm b/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm
new file mode 100644
index 000000000..74236cfbb
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_loopfilter_sse2.asm
@@ -0,0 +1,872 @@
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8

; Builds the loop-filter mask (xmm1) and high-edge-variance mask (xmm4)
; for 16 pixels.  %1 == 1: luma layout (rows addressed from rsi/rdi with
; pitch rax).  %1 == 0: chroma layout (u rows in rsi, v rows in rdi,
; intermediate rows saved to the q2/q1/p2/p1 stack slots).
%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
        movdqa      xmm2,       [rdi+2*rax]       ; q3
        movdqa      xmm1,       [rsi+2*rax]       ; q2
        movdqa      xmm4,       [rsi+rax]         ; q1
        movdqa      xmm5,       [rsi]             ; q0
        neg         rax                           ; negate pitch to deal with above border
%else
        movlps      xmm2,       [rsi + rcx*2]     ; q3
        movlps      xmm1,       [rsi + rcx]       ; q2
        movlps      xmm4,       [rsi]             ; q1
        movlps      xmm5,       [rsi + rax]       ; q0

        movhps      xmm2,       [rdi + rcx*2]
        movhps      xmm1,       [rdi + rcx]
        movhps      xmm4,       [rdi]
        movhps      xmm5,       [rdi + rax]

        lea         rsi,        [rsi + rax*4]
        lea         rdi,        [rdi + rax*4]

        movdqa      XMMWORD PTR [rsp],      xmm1  ; store q2
        movdqa      XMMWORD PTR [rsp + 16], xmm4  ; store q1
%endif

        movdqa      xmm6,       xmm1              ; q2
        movdqa      xmm3,       xmm4              ; q1

        psubusb     xmm1,       xmm2              ; q2-=q3
        psubusb     xmm2,       xmm6              ; q3-=q2

        psubusb     xmm4,       xmm6              ; q1-=q2
        psubusb     xmm6,       xmm3              ; q2-=q1

        por         xmm4,       xmm6              ; abs(q2-q1)
        por         xmm1,       xmm2              ; abs(q3-q2)

        movdqa      xmm0,       xmm5              ; q0
        pmaxub      xmm1,       xmm4

        psubusb     xmm5,       xmm3              ; q0-=q1
        psubusb     xmm3,       xmm0              ; q1-=q0

        por         xmm5,       xmm3              ; abs(q0-q1)
        movdqa      t0,         xmm5              ; save to t0

        pmaxub      xmm1,       xmm5

%if %1
        movdqa      xmm2,       [rsi+4*rax]       ; p3
        movdqa      xmm4,       [rdi+4*rax]       ; p2
        movdqa      xmm6,       [rsi+2*rax]       ; p1
%else
        movlps      xmm2,       [rsi + rax]       ; p3
        movlps      xmm4,       [rsi]             ; p2
        movlps      xmm6,       [rsi + rcx]       ; p1

        movhps      xmm2,       [rdi + rax]
        movhps      xmm4,       [rdi]
        movhps      xmm6,       [rdi + rcx]

        movdqa      XMMWORD PTR [rsp + 32], xmm4  ; store p2
        movdqa      XMMWORD PTR [rsp + 48], xmm6  ; store p1
%endif

        movdqa      xmm5,       xmm4              ; p2
        movdqa      xmm3,       xmm6              ; p1

        psubusb     xmm4,       xmm2              ; p2-=p3
        psubusb     xmm2,       xmm5              ; p3-=p2

        psubusb     xmm3,       xmm5              ; p1-=p2
        pmaxub      xmm1,       xmm4              ; abs(p3 - p2)

        psubusb     xmm5,       xmm6              ; p2-=p1
        pmaxub      xmm1,       xmm2              ; abs(p3 - p2)

        pmaxub      xmm1,       xmm5              ; abs(p2 - p1)
        movdqa      xmm2,       xmm6              ; p1

        pmaxub      xmm1,       xmm3              ; abs(p2 - p1)
%if %1
        movdqa      xmm4,       [rsi+rax]         ; p0
        movdqa      xmm3,       [rdi]             ; q1
%else
        movlps      xmm4,       [rsi + rcx*2]     ; p0
        movhps      xmm4,       [rdi + rcx*2]
        movdqa      xmm3,       q1                ; q1
%endif

        movdqa      xmm5,       xmm4              ; p0
        psubusb     xmm4,       xmm6              ; p0-=p1

        psubusb     xmm6,       xmm5              ; p1-=p0

        por         xmm6,       xmm4              ; abs(p1 - p0)
        mov         rdx,        arg(2)            ; get blimit

        movdqa      t1,         xmm6              ; save to t1

        movdqa      xmm4,       xmm3              ; q1
        pmaxub      xmm1,       xmm6

        psubusb     xmm3,       xmm2              ; q1-=p1
        psubusb     xmm2,       xmm4              ; p1-=q1

        psubusb     xmm1,       xmm7
        por         xmm2,       xmm3              ; abs(p1-q1)

        movdqa      xmm7,       XMMWORD PTR [rdx] ; blimit

        movdqa      xmm3,       xmm0              ; q0
        pand        xmm2,       [GLOBAL(tfe)]     ; set lsb of each byte to zero

        mov         rdx,        arg(4)            ; hev get thresh

        movdqa      xmm6,       xmm5              ; p0
        psrlw       xmm2,       1                 ; abs(p1-q1)/2

        psubusb     xmm5,       xmm3              ; p0-=q0

        psubusb     xmm3,       xmm6              ; q0-=p0
        por         xmm5,       xmm3              ; abs(p0 - q0)

        paddusb     xmm5,       xmm5              ; abs(p0-q0)*2

        movdqa      xmm4,       t0                ; hev get abs (q1 - q0)

        movdqa      xmm3,       t1                ; get abs (p1 - p0)

        paddusb     xmm5,       xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

        movdqa      xmm2,       XMMWORD PTR [rdx] ; hev

        psubusb     xmm5,       xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2  > blimit
        psubusb     xmm4,       xmm2              ; hev

        psubusb     xmm3,       xmm2              ; hev
        por         xmm1,       xmm5

        pxor        xmm7,       xmm7
        paddb       xmm4,       xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

        pcmpeqb     xmm4,       xmm5              ; hev
        pcmpeqb     xmm3,       xmm3              ; hev

        pcmpeqb     xmm1,       xmm7              ; mask xmm1
        pxor        xmm4,       xmm3              ; hev
%endmacro

; Applies the standard 4-tap filter to 16 pixels using the masks built by
; LFH_FILTER_AND_HEV_MASK.  %1 selects where p1/q1 are reloaded from and
; where the results are written: 0 = chroma stack layout, 1 = luma rows,
; 2 = srct scratch area (vertical-edge path).
%macro B_FILTER 1
%if %1 == 0
        movdqa      xmm2,       p1                ; p1
        movdqa      xmm7,       q1                ; q1
%elif %1 == 1
        movdqa      xmm2,       [rsi+2*rax]       ; p1
        movdqa      xmm7,       [rdi]             ; q1
%elif %1 == 2
        lea         rdx,        srct

        movdqa      xmm2,       [rdx]             ; p1
        movdqa      xmm7,       [rdx+48]          ; q1
        movdqa      xmm6,       [rdx+16]          ; p0
        movdqa      xmm0,       [rdx+32]          ; q0
%endif

        pxor        xmm2,       [GLOBAL(t80)]     ; p1 offset to convert to signed values
        pxor        xmm7,       [GLOBAL(t80)]     ; q1 offset to convert to signed values

        psubsb      xmm2,       xmm7              ; p1 - q1
        pxor        xmm6,       [GLOBAL(t80)]     ; offset to convert to signed values

        pand        xmm2,       xmm4              ; high var mask (hvm)(p1 - q1)
        pxor        xmm0,       [GLOBAL(t80)]     ; offset to convert to signed values

        movdqa      xmm3,       xmm0              ; q0
        psubsb      xmm0,       xmm6              ; q0 - p0

        paddsb      xmm2,       xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,       xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)

        paddsb      xmm2,       xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)

        pand        xmm1,       xmm2              ; mask filter values we don't care about

        movdqa      xmm2,       xmm1

        paddsb      xmm1,       [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
        paddsb      xmm2,       [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

        punpckhbw   xmm5,       xmm2              ; axbxcxdx
        punpcklbw   xmm2,       xmm2              ; exfxgxhx

        punpcklbw   xmm0,       xmm1              ; exfxgxhx
        psraw       xmm5,       11                ; sign extended shift right by 3

        punpckhbw   xmm1,       xmm1              ; axbxcxdx
        psraw       xmm2,       11                ; sign extended shift right by 3

        packsswb    xmm2,       xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
        psraw       xmm0,       11                ; sign extended shift right by 3

        psraw       xmm1,       11                ; sign extended shift right by 3
        movdqa      xmm5,       xmm0              ; save results

        packsswb    xmm0,       xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
        paddsw      xmm5,       [GLOBAL(ones)]

        paddsw      xmm1,       [GLOBAL(ones)]
        psraw       xmm5,       1                 ; partial shifted one more time for 2nd tap

        psraw       xmm1,       1                 ; partial shifted one more time for 2nd tap

        paddsb      xmm6,       xmm2              ; p0+= p0 add
        packsswb    xmm5,       xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4

%if %1 == 0
        movdqa      xmm1,       p1                ; p1
%elif %1 == 1
        movdqa      xmm1,       [rsi+2*rax]       ; p1
%elif %1 == 2
        movdqa      xmm1,       [rdx]             ; p1
%endif
        pandn       xmm4,       xmm5              ; high edge variance additive
        pxor        xmm6,       [GLOBAL(t80)]     ; unoffset

        pxor        xmm1,       [GLOBAL(t80)]     ; reoffset
        psubsb      xmm3,       xmm0              ; q0-= q0 add

        paddsb      xmm1,       xmm4              ; p1+= p1 add
        pxor        xmm3,       [GLOBAL(t80)]     ; unoffset

        pxor        xmm1,       [GLOBAL(t80)]     ; unoffset
        psubsb      xmm7,       xmm4              ; q1-= q1 add

        pxor        xmm7,       [GLOBAL(t80)]     ; unoffset
%if %1 == 0
        lea         rsi,        [rsi + rcx*2]
        lea         rdi,        [rdi + rcx*2]
        movq        MMWORD PTR [rsi],       xmm6  ; p0
        movhps      MMWORD PTR [rdi],       xmm6
        movq        MMWORD PTR [rsi + rax], xmm1  ; p1
        movhps      MMWORD PTR [rdi + rax], xmm1
        movq        MMWORD PTR [rsi + rcx], xmm3  ; q0
        movhps      MMWORD PTR [rdi + rcx], xmm3
        movq        MMWORD PTR [rsi + rcx*2],xmm7 ; q1
        movhps      MMWORD PTR [rdi + rcx*2],xmm7
%elif %1 == 1
        movdqa      [rsi+rax],  xmm6              ; write back
        movdqa      [rsi+2*rax], xmm1             ; write back
        movdqa      [rsi],      xmm3              ; write back
        movdqa      [rdi],      xmm7              ; write back
%endif

%endmacro


;void vp9_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
; 16-pixel-wide 4-tap horizontal loop filter (luma): one mask pass + one
; filter pass via the macros above.
global sym(vp9_loop_filter_horizontal_edge_sse2) PRIVATE
sym(vp9_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                         ; reserve 32 bytes
    %define t0  [rsp + 0]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]   ;__declspec(align(16)) char t1[16];

    mov         rsi,                 arg(0)     ;src_ptr
    movsxd      rax,                 dword ptr arg(1) ;src_pixel_step

    mov         rdx,                 arg(3)     ;limit
    movdqa      xmm7,                XMMWORD PTR [rdx]

    lea         rdi,                 [rsi+rax]  ; rdi points to row +1 for indirect addressing

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 1
    ; filter and write back the result
    B_FILTER 1

    add rsp, 32
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp9_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    int            count
;)
; Chroma variant: u rows (low 8 bytes) and v rows (high 8 bytes) are
; processed together; intermediate rows are kept in the q2/q1/p2/p1
; stack slots defined below.
global sym(vp9_loop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp9_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 96                         ; reserve 96 bytes
    %define q2  [rsp + 0]     ;__declspec(align(16)) char q2[16];
    %define q1  [rsp + 16]    ;__declspec(align(16)) char q1[16];
    %define p2  [rsp + 32]    ;__declspec(align(16)) char p2[16];
    %define p1  [rsp + 48]    ;__declspec(align(16)) char p1[16];
    %define t0  [rsp + 64]    ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 80]    ;__declspec(align(16)) char t1[16];

    mov         rsi,                 arg(0)             ; u
    mov         rdi,                 arg(5)             ; v
    movsxd      rax,                 dword ptr arg(1)   ; src_pixel_step
    mov         rcx,                 rax
    neg         rax                                     ; negate pitch to deal with above border

    mov         rdx,                 arg(3)             ;limit
    movdqa      xmm7,                XMMWORD PTR [rdx]

    lea         rsi,                 [rsi + rcx]
    lea         rdi,                 [rdi + rcx]

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 0
    ; filter and write back the result
    B_FILTER 0

    add rsp, 96
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


; Transposes 16 rows x 8 columns gathered from rsi/rdi into registers.
; (Macro continues past this view.)
%macro TRANSPOSE_16X8 2
    movq        xmm4,               QWORD PTR [rsi]        ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
    movq        xmm1,               QWORD PTR [rdi]        ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
    movq        xmm0,               QWORD PTR [rsi+2*rax]  ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
    movq        xmm7,               QWORD PTR [rdi+2*rax]  ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
    movq        xmm5,               QWORD PTR [rsi+4*rax]  ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
    movq        xmm2,               QWORD PTR [rdi+4*rax]  ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

    punpcklbw   xmm4,               xmm1            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

    movq        xmm1,               QWORD PTR [rdi+2*rcx]  ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

    movdqa      xmm3,               xmm4            ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
    punpcklbw   xmm0,               xmm7            ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20

    movq        xmm7,               QWORD PTR [rsi+2*rcx]  ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

    punpcklbw   xmm5,               xmm2            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
    lea         rsi,                [rsi+rax*8]
%else
    mov         rsi,                arg(5)          ; v_ptr
%endif

    movdqa      xmm6,               xmm5            ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
    punpcklbw   xmm7,               xmm1            ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60

    punpcklwd   xmm5,               xmm7            ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

    punpckhwd   xmm6,               xmm7            ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
%if %1
+ lea rdi, [rdi+rax*8] +%else + lea rsi, [rsi - 4] +%endif + + punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 +%if %1 + lea rdx, srct +%else + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing +%endif + + movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 + punpckhwd xmm4, xmm0 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + + movdqa xmm7, xmm4 ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04 + punpckhdq xmm3, xmm5 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + + punpckhdq xmm7, xmm6 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpckldq xmm4, xmm6 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + + punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 + + movdqa t0, xmm2 ; save to free XMM2 + movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80 + movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90 + movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0 + movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0 + movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0 + + punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80 + + movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0 + + punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0 + + movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0 + + punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0 + + movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0 + + punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0 + + movdqa xmm6, xmm1 ; + punpckhwd xmm6, xmm5 ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4 + + punpcklwd xmm1, xmm5 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 + movdqa xmm5, xmm2 ; 97 87 96 86 95 85 94 84 93 83 92 
82 91 81 90 80 + + punpcklwd xmm5, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 + + punpckhwd xmm2, xmm0 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + movdqa xmm0, xmm5 + punpckldq xmm0, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 + + punpckhdq xmm5, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 + movdqa xmm1, xmm2 ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84 + + punpckldq xmm1, xmm6 ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84 + + punpckhdq xmm2, xmm6 ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86 + movdqa xmm6, xmm7 ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06 + + punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06 + + punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07 +%if %2 + movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + + punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + + movdqa [rdx], xmm2 ; save 2 + + movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + + movdqa [rdx+16], xmm3 ; save 3 + + punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + + movdqa [rdx+32], xmm4 ; save 4 + movdqa [rdx+48], xmm5 ; save 5 + movdqa xmm1, t0 ; get + + movdqa xmm2, xmm1 ; + punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + + punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 +%else + movdqa [rdx+112], xmm7 ; save 7 + + movdqa [rdx+96], xmm6 ; save 6 + + movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 + punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + + punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + + movdqa [rdx+32], xmm2 ; save 2 + + movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04 + punpcklqdq xmm4, xmm1 ; f4 e4 d4 
c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + + movdqa [rdx+48], xmm3 ; save 3 + + punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + + movdqa [rdx+64], xmm4 ; save 4 + movdqa [rdx+80], xmm5 ; save 5 + movdqa xmm1, t0 ; get + + movdqa xmm2, xmm1 + punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 + + punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 + + movdqa [rdx+16], xmm1 + + movdqa [rdx], xmm2 +%endif +%endmacro + +%macro LFV_FILTER_MASK_HEV_MASK 1 + movdqa xmm0, xmm6 ; q2 + psubusb xmm0, xmm7 ; q2-q3 + + psubusb xmm7, xmm6 ; q3-q2 + movdqa xmm4, xmm5 ; q1 + + por xmm7, xmm0 ; abs (q3-q2) + psubusb xmm4, xmm6 ; q1-q2 + + movdqa xmm0, xmm1 + psubusb xmm6, xmm5 ; q2-q1 + + por xmm6, xmm4 ; abs (q2-q1) + psubusb xmm0, xmm2 ; p2 - p3; + + psubusb xmm2, xmm1 ; p3 - p2; + por xmm0, xmm2 ; abs(p2-p3) +%if %1 + movdqa xmm2, [rdx] ; p1 +%else + movdqa xmm2, [rdx+32] ; p1 +%endif + movdqa xmm5, xmm2 ; p1 + pmaxub xmm0, xmm7 + + psubusb xmm5, xmm1 ; p1-p2 + psubusb xmm1, xmm2 ; p2-p1 + + movdqa xmm7, xmm3 ; p0 + psubusb xmm7, xmm2 ; p0-p1 + + por xmm1, xmm5 ; abs(p2-p1) + pmaxub xmm0, xmm6 + + pmaxub xmm0, xmm1 + movdqa xmm1, xmm2 ; p1 + + psubusb xmm2, xmm3 ; p1-p0 + lea rdx, srct + + por xmm2, xmm7 ; abs(p1-p0) + + movdqa t0, xmm2 ; save abs(p1-p0) + + pmaxub xmm0, xmm2 + +%if %1 + movdqa xmm5, [rdx+32] ; q0 + movdqa xmm7, [rdx+48] ; q1 +%else + movdqa xmm5, [rdx+64] ; q0 + movdqa xmm7, [rdx+80] ; q1 +%endif + mov rdx, arg(3) ; limit + + movdqa xmm6, xmm5 ; q0 + movdqa xmm2, xmm7 ; q1 + + psubusb xmm5, xmm7 ; q0-q1 + psubusb xmm7, xmm6 ; q1-q0 + + por xmm7, xmm5 ; abs(q1-q0) + + movdqa t1, xmm7 ; save abs(q1-q0) + + movdqa xmm4, XMMWORD PTR [rdx]; limit + + pmaxub xmm0, xmm7 + mov rdx, arg(2) ; blimit + + psubusb xmm0, xmm4 + movdqa xmm5, xmm2 ; q1 + + psubusb xmm5, xmm1 ; q1-=p1 + psubusb xmm1, xmm2 ; p1-=q1 + + por xmm5, xmm1 ; abs(p1-q1) + movdqa xmm1, xmm3 ; p0 + + pand xmm5, [GLOBAL(tfe)] ; set lsb of 
each byte to zero + psubusb xmm1, xmm6 ; p0-q0 + + psrlw xmm5, 1 ; abs(p1-q1)/2 + psubusb xmm6, xmm3 ; q0-p0 + + movdqa xmm4, XMMWORD PTR [rdx]; blimit + + mov rdx, arg(4) ; get thresh + + por xmm1, xmm6 ; abs(q0-p0) + + movdqa xmm6, t0 ; get abs (q1 - q0) + + paddusb xmm1, xmm1 ; abs(q0-p0)*2 + + movdqa xmm3, t1 ; get abs (p1 - p0) + + movdqa xmm7, XMMWORD PTR [rdx] + + paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2 + psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh + + psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh + + psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit + por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh + + por xmm1, xmm0 ; mask + pcmpeqb xmm6, xmm0 + + pxor xmm0, xmm0 + pcmpeqb xmm4, xmm4 + + pcmpeqb xmm1, xmm0 + pxor xmm4, xmm6 +%endmacro + +%macro BV_TRANSPOSE 0 + ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 + ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05 + movdqa xmm2, xmm1 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 + punpcklbw xmm2, xmm6 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + + movdqa xmm4, xmm3 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04 + punpckhbw xmm1, xmm6 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklbw xmm4, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04 + + punpckhbw xmm3, xmm7 ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84 + + movdqa xmm6, xmm2 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 + punpcklwd xmm2, xmm4 ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + + punpckhwd xmm6, xmm4 ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + movdqa xmm5, xmm1 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 + + punpcklwd xmm1, xmm3 ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + + punpckhwd xmm5, xmm3 ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 + ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02 + ; xmm6 = 
75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42 + ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82 + ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2 +%endmacro + +%macro BV_WRITEBACK 2 + movd [rsi+2], %1 + psrldq %1, 4 + + movd [rdi+2], %1 + psrldq %1, 4 + + movd [rsi+2*rax+2], %1 + psrldq %1, 4 + + movd [rdi+2*rax+2], %1 + + movd [rsi+4*rax+2], %2 + psrldq %2, 4 + + movd [rdi+4*rax+2], %2 + psrldq %2, 4 + + movd [rsi+2*rcx+2], %2 + psrldq %2, 4 + + movd [rdi+2*rcx+2], %2 +%endmacro + + +;void vp9_loop_filter_vertical_edge_sse2 +;( +; unsigned char *src_ptr, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; int count +;) +global sym(vp9_loop_filter_vertical_edge_sse2) PRIVATE +sym(vp9_loop_filter_vertical_edge_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 96 ; reserve 96 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; + + mov rsi, arg(0) ; src_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax*2+rax] + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
+ TRANSPOSE_16X8 1, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK 1 + + ; start work on filters + B_FILTER 2 + + ; tranpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + ; store 16-line result + + lea rdx, [rax] + neg rdx + + BV_WRITEBACK xmm1, xmm5 + + lea rsi, [rsi+rdx*8] + lea rdi, [rdi+rdx*8] + BV_WRITEBACK xmm2, xmm6 + + add rsp, 96 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_loop_filter_vertical_edge_uv_sse2 +;( +; unsigned char *u, +; int src_pixel_step, +; const char *blimit, +; const char *limit, +; const char *thresh, +; unsigned char *v +;) +global sym(vp9_loop_filter_vertical_edge_uv_sse2) PRIVATE +sym(vp9_loop_filter_vertical_edge_uv_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 96 ; reserve 96 bytes + %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; + %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; + %define srct [rsp + 32] ;__declspec(align(16)) char srct[64]; + + mov rsi, arg(0) ; u_ptr + movsxd rax, dword ptr arg(1) ; src_pixel_step + + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + lea rcx, [rax+2*rax] + + lea rdx, srct + + ;transpose 16x8 to 8x16, and store the 8-line result on stack. 
+ TRANSPOSE_16X8 0, 1 + + ; calculate filter mask and high edge variance + LFV_FILTER_MASK_HEV_MASK 1 + + ; start work on filters + B_FILTER 2 + + ; tranpose and write back - only work on q1, q0, p0, p1 + BV_TRANSPOSE + + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + + ; store 16-line result + BV_WRITEBACK xmm1, xmm5 + + mov rsi, arg(0) ; u_ptr + lea rsi, [rsi - 4] + lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing + BV_WRITEBACK xmm2, xmm6 + + add rsp, 96 + pop rsp + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +tfe: + times 16 db 0xfe +align 16 +t80: + times 16 db 0x80 +align 16 +t1s: + times 16 db 0x01 +align 16 +t3: + times 16 db 0x03 +align 16 +t4: + times 16 db 0x04 +align 16 +ones: + times 8 dw 0x0001 +align 16 +s9: + times 8 dw 0x0900 +align 16 +s63: + times 8 dw 0x003f diff --git a/libvpx/vp9/common/x86/vp9_loopfilter_x86.h b/libvpx/vp9/common/x86/vp9_loopfilter_x86.h new file mode 100644 index 000000000..fb5af05f7 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_loopfilter_x86.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_ +#define VP9_COMMON_X86_VP9_LOOPFILTER_X86_H_ + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ + +#if HAVE_MMX +extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx); +extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx); +extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx); +extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx); +#endif + +#if HAVE_SSE2 +extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2); +extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2); +extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2); +extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2); +#endif + +#endif // LOOPFILTER_X86_H diff --git a/libvpx/vp9/common/x86/vp9_mask_sse3.asm b/libvpx/vp9/common/x86/vp9_mask_sse3.asm new file mode 100644 index 000000000..fe46823d0 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_mask_sse3.asm @@ -0,0 +1,484 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void int vp8_makemask_sse3( +; unsigned char *y, +; unsigned char *u, +; unsigned char *v, +; unsigned char *ym, +; unsigned char *uvm, +; int yp, +; int uvp, +; int ys, +; int us, +; int vs, +; int yt, +; int ut, +; int vt) +global sym(vp8_makemask_sse3) PRIVATE +sym(vp8_makemask_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 14 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;y + mov rdi, arg(1) ;u + mov rcx, arg(2) ;v + mov rax, arg(3) ;ym + movsxd rbx, dword arg(4) ;yp + movsxd rdx, dword arg(5) ;uvp + + pxor xmm0,xmm0 + + ;make 16 copies of the center y value + movd xmm1, arg(6) + pshufb xmm1, xmm0 + + ; make 16 copies of the center u value + movd xmm2, arg(7) + pshufb xmm2, xmm0 + + ; make 16 copies of the center v value + movd xmm3, arg(8) + pshufb xmm3, xmm0 + unpcklpd xmm2, xmm3 + + ;make 16 copies of the y tolerance + movd xmm3, arg(9) + pshufb xmm3, xmm0 + + ;make 16 copies of the u tolerance + movd xmm4, arg(10) + pshufb xmm4, xmm0 + + ;make 16 copies of the v tolerance + movd xmm5, arg(11) + pshufb xmm5, xmm0 + unpckhpd xmm4, xmm5 + + mov r8,8 + +NextPairOfRows: + + ;grab the y source values + movdqu xmm0, [rsi] + + ;compute abs difference between source and y target + movdqa xmm6, xmm1 + movdqa xmm7, xmm0 + psubusb xmm0, xmm1 + psubusb xmm6, xmm7 + por xmm0, xmm6 + + ;compute abs difference between + movdqa xmm6, xmm3 + pcmpgtb xmm6, xmm0 + + ;grab the y source values + add rsi, rbx + movdqu xmm0, [rsi] + + ;compute abs difference between source and y target + movdqa xmm11, xmm1 + movdqa xmm7, xmm0 + psubusb xmm0, xmm1 + psubusb xmm11, xmm7 + por xmm0, xmm11 + + ;compute abs difference between + movdqa xmm11, xmm3 + pcmpgtb xmm11, xmm0 + + + ;grab the u and v source values + movdqu xmm7, [rdi] + movdqu xmm8, [rcx] + unpcklpd xmm7, xmm8 + + ;compute abs difference between source and uv targets + movdqa xmm9, xmm2 + movdqa xmm10, xmm7 + psubusb xmm7, xmm2 + psubusb xmm9, xmm10 + por 
xmm7, xmm9 + + ;check whether the number is < tolerance + movdqa xmm0, xmm4 + pcmpgtb xmm0, xmm7 + + ;double u and v masks + movdqa xmm8, xmm0 + punpckhbw xmm0, xmm0 + punpcklbw xmm8, xmm8 + + ;mask row 0 and output + pand xmm6, xmm8 + pand xmm6, xmm0 + movdqa [rax],xmm6 + + ;mask row 1 and output + pand xmm11, xmm8 + pand xmm11, xmm0 + movdqa [rax+16],xmm11 + + + ; to the next row or set of rows + add rsi, rbx + add rdi, rdx + add rcx, rdx + add rax,32 + dec r8 + jnz NextPairOfRows + + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;GROW_HORIZ (register for result, source register or mem local) +; takes source and shifts left and ors with source +; then shifts right and ors with source +%macro GROW_HORIZ 2 + movdqa %1, %2 + movdqa xmm14, %1 + movdqa xmm15, %1 + pslldq xmm14, 1 + psrldq xmm15, 1 + por %1,xmm14 + por %1,xmm15 +%endmacro +;GROW_VERT (result, center row, above row, below row) +%macro GROW_VERT 4 + movdqa %1,%2 + por %1,%3 + por %1,%4 +%endmacro + +;GROW_NEXTLINE (new line to grow, new source, line to write) +%macro GROW_NEXTLINE 3 + GROW_HORIZ %1, %2 + GROW_VERT xmm3, xmm0, xmm1, xmm2 + movdqa %3,xmm3 +%endmacro + + +;void int vp8_growmaskmb_sse3( +; unsigned char *om, +; unsigned char *nm, +global sym(vp8_growmaskmb_sse3) PRIVATE +sym(vp8_growmaskmb_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 2 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src + mov rdi, arg(1) ;rst + + GROW_HORIZ xmm0, [rsi] + GROW_HORIZ xmm1, [rsi+16] + GROW_HORIZ xmm2, [rsi+32] + + GROW_VERT xmm3, xmm0, xmm1, xmm2 + por xmm0,xmm1 + movdqa [rdi], xmm0 + movdqa [rdi+16],xmm3 + + GROW_NEXTLINE xmm0,[rsi+48],[rdi+32] + GROW_NEXTLINE xmm1,[rsi+64],[rdi+48] + GROW_NEXTLINE xmm2,[rsi+80],[rdi+64] + GROW_NEXTLINE xmm0,[rsi+96],[rdi+80] + GROW_NEXTLINE xmm1,[rsi+112],[rdi+96] + GROW_NEXTLINE xmm2,[rsi+128],[rdi+112] + GROW_NEXTLINE xmm0,[rsi+144],[rdi+128] + GROW_NEXTLINE xmm1,[rsi+160],[rdi+144] + GROW_NEXTLINE xmm2,[rsi+176],[rdi+160] + 
GROW_NEXTLINE xmm0,[rsi+192],[rdi+176] + GROW_NEXTLINE xmm1,[rsi+208],[rdi+192] + GROW_NEXTLINE xmm2,[rsi+224],[rdi+208] + GROW_NEXTLINE xmm0,[rsi+240],[rdi+224] + + por xmm0,xmm2 + movdqa [rdi+240], xmm0 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + +;unsigned int vp8_sad16x16_masked_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; unsigned char *mask) +global sym(vp8_sad16x16_masked_wmt) PRIVATE +sym(vp8_sad16x16_masked_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rbx, arg(4) ;mask + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + mov rcx, 16 + + pxor xmm3, xmm3 + +NextSadRow: + movdqu xmm0, [rsi] + movdqu xmm1, [rdi] + movdqu xmm2, [rbx] + pand xmm0, xmm2 + pand xmm1, xmm2 + + psadbw xmm0, xmm1 + paddw xmm3, xmm0 + + add rsi, rax + add rdi, rdx + add rbx, 16 + + dec rcx + jnz NextSadRow + + movdqa xmm4 , xmm3 + psrldq xmm4, 8 + paddw xmm3, xmm4 + movq rax, xmm3 + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_sad16x16_unmasked_wmt( +; unsigned char *src_ptr, +; int src_stride, +; unsigned char *ref_ptr, +; int ref_stride, +; unsigned char *mask) +global sym(vp8_sad16x16_unmasked_wmt) PRIVATE +sym(vp8_sad16x16_unmasked_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + mov rbx, arg(4) ;mask + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + mov rcx, 16 + + pxor xmm3, xmm3 + +next_vp8_sad16x16_unmasked_wmt: + movdqu xmm0, [rsi] + movdqu xmm1, [rdi] + movdqu xmm2, [rbx] + por xmm0, xmm2 + por xmm1, xmm2 + + psadbw xmm0, xmm1 + paddw xmm3, xmm0 + + add rsi, rax + add rdi, rdx + add rbx, 16 + + dec rcx + jnz next_vp8_sad16x16_unmasked_wmt + + movdqa xmm4 , 
xmm3 + psrldq xmm4, 8 + paddw xmm3, xmm4 + movq rax, xmm3 + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;unsigned int vp8_masked_predictor_wmt( +; unsigned char *masked, +; unsigned char *unmasked, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; unsigned char *mask) +global sym(vp8_masked_predictor_wmt) PRIVATE +sym(vp8_masked_predictor_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;ref_ptr + + mov rbx, arg(5) ;mask + movsxd rax, dword ptr arg(2) ;src_stride + mov r11, arg(3) ; destination + movsxd rdx, dword ptr arg(4) ;dst_stride + + mov rcx, 16 + + pxor xmm3, xmm3 + +next_vp8_masked_predictor_wmt: + movdqu xmm0, [rsi] + movdqu xmm1, [rdi] + movdqu xmm2, [rbx] + + pand xmm0, xmm2 + pandn xmm2, xmm1 + por xmm0, xmm2 + movdqu [r11], xmm0 + + add r11, rdx + add rsi, rax + add rdi, rdx + add rbx, 16 + + dec rcx + jnz next_vp8_masked_predictor_wmt + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;unsigned int vp8_masked_predictor_uv_wmt( +; unsigned char *masked, +; unsigned char *unmasked, +; int src_stride, +; unsigned char *dst_ptr, +; int dst_stride, +; unsigned char *mask) +global sym(vp8_masked_predictor_uv_wmt) PRIVATE +sym(vp8_masked_predictor_uv_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;ref_ptr + + mov rbx, arg(5) ;mask + movsxd rax, dword ptr arg(2) ;src_stride + mov r11, arg(3) ; destination + movsxd rdx, dword ptr arg(4) ;dst_stride + + mov rcx, 8 + + pxor xmm3, xmm3 + +next_vp8_masked_predictor_uv_wmt: + movq xmm0, [rsi] + movq xmm1, [rdi] + movq xmm2, [rbx] + + pand xmm0, xmm2 + pandn xmm2, xmm1 + por xmm0, xmm2 + movq [r11], xmm0 + + add r11, rdx + add rsi, rax + add rdi, rax + add rbx, 8 + + dec rcx + jnz next_vp8_masked_predictor_uv_wmt + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + 
pop rbp + ret + + +;unsigned int vp8_uv_from_y_mask( +; unsigned char *ymask, +; unsigned char *uvmask) +global sym(vp8_uv_from_y_mask) PRIVATE +sym(vp8_uv_from_y_mask): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + push rsi + push rdi + ; end prolog + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + + mov rcx, 8 + + pxor xmm3, xmm3 + +next_p8_uv_from_y_mask: + movdqu xmm0, [rsi] + pshufb xmm0, [shuf1b] ;[GLOBAL(shuf1b)] + movq [rdi],xmm0 + add rdi, 8 + add rsi,32 + + dec rcx + jnz next_p8_uv_from_y_mask + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +align 16 +shuf1b: + db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 + diff --git a/libvpx/vp9/common/x86/vp9_postproc_mmx.asm b/libvpx/vp9/common/x86/vp9_postproc_mmx.asm new file mode 100644 index 000000000..c2118dbb7 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_postproc_mmx.asm @@ -0,0 +1,534 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +%define VP9_FILTER_WEIGHT 128 +%define VP9_FILTER_SHIFT 7 + +;void vp9_post_proc_down_and_across_mmx +;( +; unsigned char *src_ptr, +; unsigned char *dst_ptr, +; int src_pixels_per_line, +; int dst_pixels_per_line, +; int rows, +; int cols, +; int flimit +;) +global sym(vp9_post_proc_down_and_across_mmx) PRIVATE +sym(vp9_post_proc_down_and_across_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + ; move the global rd onto the stack, since we don't have enough registers + ; to do PIC addressing + movq mm0, [GLOBAL(rd)] + sub rsp, 8 + movq [rsp], mm0 +%define RD [rsp] +%else +%define RD [GLOBAL(rd)] +%endif + + push rbx + lea rbx, [GLOBAL(Blur)] + movd mm2, dword ptr arg(6) ;flimit + punpcklwd mm2, mm2 + punpckldq mm2, mm2 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + movsxd rcx, DWORD PTR arg(4) ;rows + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
+ pxor mm0, mm0 ; mm0 = 00000000 + +.nextrow: + + xor rdx, rdx ; clear out rdx for use as loop counter +.nextcol: + + pxor mm7, mm7 ; mm7 = 00000000 + movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps + movq mm3, [rsi] ; mm4 = r0 p0..p7 + punpcklbw mm3, mm0 ; mm3 = p0..p3 + movq mm1, mm3 ; mm1 = p0..p3 + pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers + + movq mm6, [rbx + 48] ; mm6 = kernel 3 taps + movq mm5, [rsi + rax] ; mm4 = r1 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r1 p0..p3 + pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers + paddusw mm3, mm6 ; mm3 += mm6 + + ; thresholding + movq mm7, mm1 ; mm7 = r0 p0..p3 + psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3 + psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3 + paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) + pcmpgtw mm7, mm2 + + movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers + movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r2 p0..p3 + pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3 + psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3 + paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + neg rax + movq mm6, [rbx ] ; kernel 0 taps + movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3 + pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + movq mm6, [rbx + 16] ; kernel 1 taps + movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7 + punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3 + pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. 
+ paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3 + paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + paddusw mm3, RD ; mm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 + + pand mm1, mm7 ; mm1 select vals > thresh from source + pandn mm7, mm3 ; mm7 select vals < thresh from blurred result + paddusw mm1, mm7 ; combination + + packuswb mm1, mm0 ; pack to bytes + + movd [rdi], mm1 ; + neg rax ; pitch is positive + + + add rsi, 4 + add rdi, 4 + add rdx, 4 + + cmp edx, dword ptr arg(5) ;cols + jl .nextcol + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + + push rax + xor rdx, rdx + mov rax, [rdi-4]; + +.acrossnextcol: + pxor mm7, mm7 ; mm7 = 00000000 + movq mm6, [rbx + 32 ] ; + movq mm4, [rdi+rdx] ; mm4 = p0..p7 + movq mm3, mm4 ; mm3 = p0..p7 + punpcklbw mm3, mm0 ; mm3 = p0..p3 + movq mm1, mm3 ; mm1 = p0..p3 + pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers + + movq mm6, [rbx + 48] + psrlq mm4, 8 ; mm4 = p1..p7 + movq mm5, mm4 ; mm5 = p1..p7 + punpcklbw mm5, mm0 ; mm5 = p1..p4 + pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers + paddusw mm3, mm6 ; mm3 += mm6 + + ; thresholding + movq mm7, mm1 ; mm7 = p0..p3 + psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4) + pcmpgtw mm7, mm2 + + movq mm6, [rbx + 64 ] + psrlq mm4, 8 ; mm4 = p2..p7 + movq mm5, mm4 ; mm5 = p2..p7 + punpcklbw mm5, mm0 ; mm5 = p2..p5 + pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + movq mm6, [rbx ] + 
movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5 + movq mm5, mm4 ; mm5 = p-2..p5 + punpcklbw mm5, mm0 ; mm5 = p-2..p1 + pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + movq mm6, [rbx + 16] + psrlq mm4, 8 ; mm4 = p-1..p5 + punpcklbw mm4, mm0 ; mm4 = p-1..p2 + pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4 + psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + paddusw mm3, RD ; mm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 + + pand mm1, mm7 ; mm1 select vals > thresh from source + pandn mm7, mm3 ; mm7 select vals < thresh from blurred result + paddusw mm1, mm7 ; combination + + packuswb mm1, mm0 ; pack to bytes + mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes + movd eax, mm1 + + add rdx, 4 + cmp edx, dword ptr arg(5) ;cols + jl .acrossnextcol; + + mov DWORD PTR [rdi+rdx-4], eax + pop rax + + ; done with this rwo + add rsi,rax ; next line + movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch? + add rdi,rax ; next destination + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch? 
+ + dec rcx ; decrement count + jnz .nextrow ; next row + pop rbx + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%undef RD + + +;void vp9_mbpost_proc_down_mmx(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp9_rv) +global sym(vp9_mbpost_proc_down_mmx) PRIVATE +sym(vp9_mbpost_proc_down_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 136 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax +%define flimit2 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [GLOBAL(sym(vp9_rv))] +%endif + + ;rows +=8; + add dword ptr arg(2), 8 + + ;for(c=0; c<cols; c+=4) +.loop_col: + mov rsi, arg(0) ;s + pxor mm0, mm0 ; + + movsxd rax, dword ptr arg(1) ;pitch ; + neg rax ; rax = -pitch + + lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] + neg rax + + + pxor mm5, mm5 + pxor mm6, mm6 ; + + pxor mm7, mm7 ; + mov rdi, rsi + + mov rcx, 15 ; + +.loop_initvar: + movd mm1, DWORD PTR [rdi]; + punpcklbw mm1, mm0 ; + + paddw mm5, mm1 ; + pmullw mm1, mm1 ; + + movq mm2, mm1 ; + punpcklwd mm1, mm0 ; + + punpckhwd mm2, mm0 ; + paddd mm6, mm1 ; + + paddd mm7, mm2 ; + lea rdi, [rdi+rax] ; + + dec rcx + jne .loop_initvar + ;save the var and sum + xor rdx, rdx +.loop_row: + movd mm1, DWORD PTR [rsi] ; [s-pitch*8] + movd mm2, DWORD PTR [rdi] ; [s+pitch*7] + + punpcklbw mm1, mm0 + punpcklbw mm2, mm0 + + paddw mm5, mm2 + psubw mm5, mm1 + + pmullw mm2, mm2 + movq mm4, mm2 + + punpcklwd mm2, mm0 + punpckhwd mm4, mm0 + + paddd mm6, mm2 + paddd mm7, mm4 + + pmullw mm1, mm1 + movq mm2, mm1 + + punpcklwd mm1, mm0 + psubd mm6, mm1 + + punpckhwd mm2, mm0 + psubd mm7, mm2 + + + movq mm3, mm6 + pslld mm3, 4 + + psubd mm3, mm6 + movq mm1, mm5 + + movq mm4, mm5 + pmullw mm1, mm1 + + pmulhw mm4, mm4 + movq mm2, mm1 + + punpcklwd mm1, mm4 + punpckhwd mm2, mm4 + + movq mm4, 
mm7 + pslld mm4, 4 + + psubd mm4, mm7 + + psubd mm3, mm1 + psubd mm4, mm2 + + psubd mm3, flimit2 + psubd mm4, flimit2 + + psrad mm3, 31 + psrad mm4, 31 + + packssdw mm3, mm4 + packsswb mm3, mm0 + + movd mm1, DWORD PTR [rsi+rax*8] + + movq mm2, mm1 + punpcklbw mm1, mm0 + + paddw mm1, mm5 + mov rcx, rdx + + and rcx, 127 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + push rax + lea rax, [GLOBAL(sym(vp9_rv))] + movq mm4, [rax + rcx*2] ;vp9_rv[rcx*2] + pop rax +%elif ABI_IS_32BIT=0 + movq mm4, [r8 + rcx*2] ;vp9_rv[rcx*2] +%else + movq mm4, [sym(vp9_rv) + rcx*2] +%endif + paddw mm1, mm4 + ;paddw xmm1, eight8s + psraw mm1, 4 + + packuswb mm1, mm0 + pand mm1, mm3 + + pandn mm3, mm2 + por mm1, mm3 + + and rcx, 15 + movd DWORD PTR [rsp+rcx*4], mm1 ;d[rcx*4] + + mov rcx, rdx + sub rcx, 8 + + and rcx, 15 + movd mm1, DWORD PTR [rsp+rcx*4] ;d[rcx*4] + + movd [rsi], mm1 + lea rsi, [rsi+rax] + + lea rdi, [rdi+rax] + add rdx, 1 + + cmp edx, dword arg(2) ;rows + jl .loop_row + + + add dword arg(0), 4 ; s += 4 + sub dword arg(3), 4 ; cols -= 4 + cmp dword arg(3), 0 + jg .loop_col + + add rsp, 136 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%undef flimit2 + + +;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise, +; unsigned char blackclamp[16], +; unsigned char whiteclamp[16], +; unsigned char bothclamp[16], +; unsigned int width, unsigned int height, int pitch) +extern sym(rand) +global sym(vp9_plane_add_noise_mmx) PRIVATE +sym(vp9_plane_add_noise_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +.addnoise_loop: + call sym(rand) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + ; we rely on the fact that the clamping vectors are stored contiguously + ; in black/white/both order. 
Note that we have to reload this here because + ; rdx could be trashed by rand() + mov rdx, arg(2) ; blackclamp + + + mov rdi, rcx + movsxd rcx, dword arg(5) ;[Width] + mov rsi, arg(0) ;Pos + xor rax,rax + +.addnoise_nextset: + movq mm1,[rsi+rax] ; get the source + + psubusb mm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise + paddusb mm1, [rdx+32] ;bothclamp + psubusb mm1, [rdx+16] ;whiteclamp + + movq mm2,[rdi+rax] ; get the noise for this line + paddb mm1,mm2 ; add it in + movq [rsi+rax],mm1 ; store the result + + add rax,8 ; move to the next line + + cmp rax, rcx + jl .addnoise_nextset + + movsxd rax, dword arg(7) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(6), 1 ; Height -= 1 + jg .addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +Blur: + times 16 dw 16 + times 8 dw 64 + times 16 dw 16 + times 8 dw 0 + +rd: + times 4 dw 0x40 diff --git a/libvpx/vp9/common/x86/vp9_postproc_sse2.asm b/libvpx/vp9/common/x86/vp9_postproc_sse2.asm new file mode 100644 index 000000000..858fc99b6 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_postproc_sse2.asm @@ -0,0 +1,695 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;void vp9_post_proc_down_and_across_xmm +;( +; unsigned char *src_ptr, +; unsigned char *dst_ptr, +; int src_pixels_per_line, +; int dst_pixels_per_line, +; int rows, +; int cols, +; int flimit +;) +global sym(vp9_post_proc_down_and_across_xmm) PRIVATE +sym(vp9_post_proc_down_and_across_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + ALIGN_STACK 16, rax + ; move the global rd onto the stack, since we don't have enough registers + ; to do PIC addressing + movdqa xmm0, [GLOBAL(rd42)] + sub rsp, 16 + movdqa [rsp], xmm0 +%define RD42 [rsp] +%else +%define RD42 [GLOBAL(rd42)] +%endif + + + movd xmm2, dword ptr arg(6) ;flimit + punpcklwd xmm2, xmm2 + punpckldq xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + movsxd rcx, DWORD PTR arg(4) ;rows + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? 
+ pxor xmm0, xmm0 ; mm0 = 00000000 + +.nextrow: + + xor rdx, rdx ; clear out rdx for use as loop counter +.nextcol: + movq xmm3, QWORD PTR [rsi] ; mm4 = r0 p0..p7 + punpcklbw xmm3, xmm0 ; mm3 = p0..p3 + movdqa xmm1, xmm3 ; mm1 = p0..p3 + psllw xmm3, 2 ; + + movq xmm5, QWORD PTR [rsi + rax] ; mm4 = r1 p0..p7 + punpcklbw xmm5, xmm0 ; mm5 = r1 p0..p3 + paddusw xmm3, xmm5 ; mm3 += mm6 + + ; thresholding + movdqa xmm7, xmm1 ; mm7 = r0 p0..p3 + psubusw xmm7, xmm5 ; mm7 = r0 p0..p3 - r1 p0..p3 + psubusw xmm5, xmm1 ; mm5 = r1 p0..p3 - r0 p0..p3 + paddusw xmm7, xmm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) + pcmpgtw xmm7, xmm2 + + movq xmm5, QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7 + punpcklbw xmm5, xmm0 ; mm5 = r2 p0..p3 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 + psubusw xmm6, xmm5 ; mm6 = r0 p0..p3 - r2 p0..p3 + psubusw xmm5, xmm1 ; mm5 = r2 p0..p3 - r2 p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + + neg rax + movq xmm5, QWORD PTR [rsi+2*rax] ; mm4 = r-2 p0..p7 + punpcklbw xmm5, xmm0 ; mm5 = r-2 p0..p3 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 + psubusw xmm6, xmm5 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw xmm5, xmm1 ; mm5 = r-2 p0..p3 - p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + movq xmm4, QWORD PTR [rsi+rax] ; mm4 = r-1 p0..p7 + punpcklbw xmm4, xmm0 ; mm4 = r-1 p0..p3 + paddusw xmm3, xmm4 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = r0 p0..p3 + psubusw xmm6, xmm4 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw xmm4, xmm1 ; mm5 = r-1 p0..p3 - p0..p3 + paddusw xmm6, xmm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + + paddusw xmm3, RD42 ; mm3 += round value + psraw xmm3, 3 ; mm3 /= 8 + + pand xmm1, xmm7 ; mm1 select vals > thresh from source + pandn xmm7, 
xmm3 ; mm7 select vals < thresh from blurred result + paddusw xmm1, xmm7 ; combination + + packuswb xmm1, xmm0 ; pack to bytes + movq QWORD PTR [rdi], xmm1 ; + + neg rax ; pitch is positive + add rsi, 8 + add rdi, 8 + + add rdx, 8 + cmp edx, dword arg(5) ;cols + + jl .nextcol + + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + xor rdx, rdx + movq mm0, QWORD PTR [rdi-8]; + +.acrossnextcol: + movq xmm7, QWORD PTR [rdi +rdx -2] + movd xmm4, DWORD PTR [rdi +rdx +6] + + pslldq xmm4, 8 + por xmm4, xmm7 + + movdqa xmm3, xmm4 + psrldq xmm3, 2 + punpcklbw xmm3, xmm0 ; mm3 = p0..p3 + movdqa xmm1, xmm3 ; mm1 = p0..p3 + psllw xmm3, 2 + + + movdqa xmm5, xmm4 + psrldq xmm5, 3 + punpcklbw xmm5, xmm0 ; mm5 = p1..p4 + paddusw xmm3, xmm5 ; mm3 += mm6 + + ; thresholding + movdqa xmm7, xmm1 ; mm7 = p0..p3 + psubusw xmm7, xmm5 ; mm7 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm7, xmm5 ; mm7 = abs(p0..p3 - p1..p4) + pcmpgtw xmm7, xmm2 + + movdqa xmm5, xmm4 + psrldq xmm5, 4 + punpcklbw xmm5, xmm0 ; mm5 = p2..p5 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + + movdqa xmm5, xmm4 ; mm5 = p-2..p5 + punpcklbw xmm5, xmm0 ; mm5 = p-2..p1 + paddusw xmm3, xmm5 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm5 ; mm6 = p0..p3 - p1..p4 + psubusw xmm5, xmm1 ; mm5 = p1..p4 - p0..p3 + paddusw xmm6, xmm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + psrldq xmm4, 1 ; mm4 = p-1..p5 + punpcklbw xmm4, xmm0 ; mm4 = p-1..p2 + paddusw xmm3, xmm4 ; mm3 += mm5 + + ; thresholding + movdqa xmm6, xmm1 ; mm6 = p0..p3 + psubusw xmm6, xmm4 ; mm6 = p0..p3 - p1..p4 + psubusw xmm4, xmm1 ; mm5 = p1..p4 - p0..p3 + 
paddusw xmm6, xmm4 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw xmm6, xmm2 + por xmm7, xmm6 ; accumulate thresholds + + paddusw xmm3, RD42 ; mm3 += round value + psraw xmm3, 3 ; mm3 /= 8 + + pand xmm1, xmm7 ; mm1 select vals > thresh from source + pandn xmm7, xmm3 ; mm7 select vals < thresh from blurred result + paddusw xmm1, xmm7 ; combination + + packuswb xmm1, xmm0 ; pack to bytes + movq QWORD PTR [rdi+rdx-8], mm0 ; store previous four bytes + movdq2q mm0, xmm1 + + add rdx, 8 + cmp edx, dword arg(5) ;cols + jl .acrossnextcol; + + ; last 8 pixels + movq QWORD PTR [rdi+rdx-8], mm0 + + ; done with this rwo + add rsi,rax ; next line + mov eax, dword arg(3) ;dst_pixels_per_line ; destination pitch? + add rdi,rax ; next destination + mov eax, dword arg(2) ;src_pixels_per_line ; destination pitch? + + dec rcx ; decrement count + jnz .nextrow ; next row + +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + add rsp,16 + pop rsp +%endif + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef RD42 + + +;void vp9_mbpost_proc_down_xmm(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp9_rv) +global sym(vp9_mbpost_proc_down_xmm) PRIVATE +sym(vp9_mbpost_proc_down_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 128+16 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax + mov [rsp+128+8], eax + mov [rsp+128+12], eax +%define flimit4 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [GLOBAL(sym(vp9_rv))] +%endif + + ;rows +=8; + add dword arg(2), 8 + + ;for(c=0; c<cols; c+=8) +.loop_col: + mov rsi, arg(0) ; s + pxor xmm0, xmm0 ; + + movsxd rax, dword ptr arg(1) ;pitch ; + neg rax ; rax = -pitch + + lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8] + neg rax + + + pxor xmm5, xmm5 + pxor xmm6, xmm6 ; + + pxor xmm7, xmm7 ; + mov rdi, rsi 
+ + mov rcx, 15 ; + +.loop_initvar: + movq xmm1, QWORD PTR [rdi]; + punpcklbw xmm1, xmm0 ; + + paddw xmm5, xmm1 ; + pmullw xmm1, xmm1 ; + + movdqa xmm2, xmm1 ; + punpcklwd xmm1, xmm0 ; + + punpckhwd xmm2, xmm0 ; + paddd xmm6, xmm1 ; + + paddd xmm7, xmm2 ; + lea rdi, [rdi+rax] ; + + dec rcx + jne .loop_initvar + ;save the var and sum + xor rdx, rdx +.loop_row: + movq xmm1, QWORD PTR [rsi] ; [s-pitch*8] + movq xmm2, QWORD PTR [rdi] ; [s+pitch*7] + + punpcklbw xmm1, xmm0 + punpcklbw xmm2, xmm0 + + paddw xmm5, xmm2 + psubw xmm5, xmm1 + + pmullw xmm2, xmm2 + movdqa xmm4, xmm2 + + punpcklwd xmm2, xmm0 + punpckhwd xmm4, xmm0 + + paddd xmm6, xmm2 + paddd xmm7, xmm4 + + pmullw xmm1, xmm1 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm0 + psubd xmm6, xmm1 + + punpckhwd xmm2, xmm0 + psubd xmm7, xmm2 + + + movdqa xmm3, xmm6 + pslld xmm3, 4 + + psubd xmm3, xmm6 + movdqa xmm1, xmm5 + + movdqa xmm4, xmm5 + pmullw xmm1, xmm1 + + pmulhw xmm4, xmm4 + movdqa xmm2, xmm1 + + punpcklwd xmm1, xmm4 + punpckhwd xmm2, xmm4 + + movdqa xmm4, xmm7 + pslld xmm4, 4 + + psubd xmm4, xmm7 + + psubd xmm3, xmm1 + psubd xmm4, xmm2 + + psubd xmm3, flimit4 + psubd xmm4, flimit4 + + psrad xmm3, 31 + psrad xmm4, 31 + + packssdw xmm3, xmm4 + packsswb xmm3, xmm0 + + movq xmm1, QWORD PTR [rsi+rax*8] + + movq xmm2, xmm1 + punpcklbw xmm1, xmm0 + + paddw xmm1, xmm5 + mov rcx, rdx + + and rcx, 127 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + push rax + lea rax, [GLOBAL(sym(vp9_rv))] + movdqu xmm4, [rax + rcx*2] ;vp9_rv[rcx*2] + pop rax +%elif ABI_IS_32BIT=0 + movdqu xmm4, [r8 + rcx*2] ;vp9_rv[rcx*2] +%else + movdqu xmm4, [sym(vp9_rv) + rcx*2] +%endif + + paddw xmm1, xmm4 + ;paddw xmm1, eight8s + psraw xmm1, 4 + + packuswb xmm1, xmm0 + pand xmm1, xmm3 + + pandn xmm3, xmm2 + por xmm1, xmm3 + + and rcx, 15 + movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8] + + mov rcx, rdx + sub rcx, 8 + + and rcx, 15 + movq mm0, [rsp + rcx*8] ;d[rcx*8] + + movq [rsi], mm0 + lea rsi, [rsi+rax] + + lea rdi, [rdi+rax] + add rdx, 1 + + cmp edx, 
dword arg(2) ;rows + jl .loop_row + + add dword arg(0), 8 ; s += 8 + sub dword arg(3), 8 ; cols -= 8 + cmp dword arg(3), 0 + jg .loop_col + + add rsp, 128+16 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit4 + + +;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src, +; int pitch, int rows, int cols,int flimit) +global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE +sym(vp9_mbpost_proc_across_ip_xmm): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16 + + ; create flimit4 at [rsp] + mov eax, dword ptr arg(4) ;flimit + mov [rsp], eax + mov [rsp+4], eax + mov [rsp+8], eax + mov [rsp+12], eax +%define flimit4 [rsp] + + + ;for(r=0;r<rows;r++) +.ip_row_loop: + + xor rdx, rdx ;sumsq=0; + xor rcx, rcx ;sum=0; + mov rsi, arg(0); s + mov rdi, -8 +.ip_var_loop: + ;for(i=-8;i<=6;i++) + ;{ + ; sumsq += s[i]*s[i]; + ; sum += s[i]; + ;} + movzx eax, byte [rsi+rdi] + add ecx, eax + mul al + add edx, eax + add rdi, 1 + cmp rdi, 6 + jle .ip_var_loop + + + ;mov rax, sumsq + ;movd xmm7, rax + movd xmm7, edx + + ;mov rax, sum + ;movd xmm6, rax + movd xmm6, ecx + + mov rsi, arg(0) ;s + xor rcx, rcx + + movsxd rdx, dword arg(3) ;cols + add rdx, 8 + pxor mm0, mm0 + pxor mm1, mm1 + + pxor xmm0, xmm0 +.nextcol4: + + movd xmm1, DWORD PTR [rsi+rcx-8] ; -8 -7 -6 -5 + movd xmm2, DWORD PTR [rsi+rcx+7] ; +7 +8 +9 +10 + + punpcklbw xmm1, xmm0 ; expanding + punpcklbw xmm2, xmm0 ; expanding + + punpcklwd xmm1, xmm0 ; expanding to dwords + punpcklwd xmm2, xmm0 ; expanding to dwords + + psubd xmm2, xmm1 ; 7--8 8--7 9--6 10--5 + paddd xmm1, xmm1 ; -8*2 -7*2 -6*2 -5*2 + + paddd xmm1, xmm2 ; 7+-8 8+-7 9+-6 10+-5 + pmaddwd xmm1, xmm2 ; squared of 7+-8 8+-7 9+-6 10+-5 + + paddd xmm6, xmm2 + paddd xmm7, xmm1 + + pshufd xmm6, xmm6, 0 ; duplicate the last ones + pshufd xmm7, xmm7, 0 ; duplicate the last ones + + psrldq xmm1, 4 ; 8--7 
9--6 10--5 0000 + psrldq xmm2, 4 ; 8--7 9--6 10--5 0000 + + pshufd xmm3, xmm1, 3 ; 0000 8--7 8--7 8--7 squared + pshufd xmm4, xmm2, 3 ; 0000 8--7 8--7 8--7 squared + + paddd xmm6, xmm4 + paddd xmm7, xmm3 + + pshufd xmm3, xmm1, 01011111b ; 0000 0000 9--6 9--6 squared + pshufd xmm4, xmm2, 01011111b ; 0000 0000 9--6 9--6 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + pshufd xmm3, xmm1, 10111111b ; 0000 0000 8--7 8--7 squared + pshufd xmm4, xmm2, 10111111b ; 0000 0000 8--7 8--7 squared + + paddd xmm7, xmm3 + paddd xmm6, xmm4 + + movdqa xmm3, xmm6 + pmaddwd xmm3, xmm3 + + movdqa xmm5, xmm7 + pslld xmm5, 4 + + psubd xmm5, xmm7 + psubd xmm5, xmm3 + + psubd xmm5, flimit4 + psrad xmm5, 31 + + packssdw xmm5, xmm0 + packsswb xmm5, xmm0 + + movd xmm1, DWORD PTR [rsi+rcx] + movq xmm2, xmm1 + + punpcklbw xmm1, xmm0 + punpcklwd xmm1, xmm0 + + paddd xmm1, xmm6 + paddd xmm1, [GLOBAL(four8s)] + + psrad xmm1, 4 + packssdw xmm1, xmm0 + + packuswb xmm1, xmm0 + pand xmm1, xmm5 + + pandn xmm5, xmm2 + por xmm5, xmm1 + + movd [rsi+rcx-8], mm0 + movq mm0, mm1 + + movdq2q mm1, xmm5 + psrldq xmm7, 12 + + psrldq xmm6, 12 + add rcx, 4 + + cmp rcx, rdx + jl .nextcol4 + + ;s+=pitch; + movsxd rax, dword arg(1) + add arg(0), rax + + sub dword arg(2), 1 ;rows-=1 + cmp dword arg(2), 0 + jg .ip_row_loop + + add rsp, 16 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +%undef flimit4 + + +;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise, +; unsigned char blackclamp[16], +; unsigned char whiteclamp[16], +; unsigned char bothclamp[16], +; unsigned int width, unsigned int height, int pitch) +extern sym(rand) +global sym(vp9_plane_add_noise_wmt) PRIVATE +sym(vp9_plane_add_noise_wmt): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 8 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +.addnoise_loop: + call sym(rand) WRT_PLT + mov rcx, arg(1) ;noise + and rax, 0xff + add rcx, rax + + ; we rely on the fact that the 
clamping vectors are stored contiguously + ; in black/white/both order. Note that we have to reload this here because + ; rdx could be trashed by rand() + mov rdx, arg(2) ; blackclamp + + + mov rdi, rcx + movsxd rcx, dword arg(5) ;[Width] + mov rsi, arg(0) ;Pos + xor rax,rax + +.addnoise_nextset: + movdqu xmm1,[rsi+rax] ; get the source + + psubusb xmm1, [rdx] ;blackclamp ; clamp both sides so we don't outrange adding noise + paddusb xmm1, [rdx+32] ;bothclamp + psubusb xmm1, [rdx+16] ;whiteclamp + + movdqu xmm2,[rdi+rax] ; get the noise for this line + paddb xmm1,xmm2 ; add it in + movdqu [rsi+rax],xmm1 ; store the result + + add rax,16 ; move to the next line + + cmp rax, rcx + jl .addnoise_nextset + + movsxd rax, dword arg(7) ; Pitch + add arg(0), rax ; Start += Pitch + sub dword arg(6), 1 ; Height -= 1 + jg .addnoise_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + + +SECTION_RODATA +align 16 +rd42: + times 8 dw 0x04 +four8s: + times 4 dd 8 diff --git a/libvpx/vp9/common/x86/vp9_postproc_x86.h b/libvpx/vp9/common/x86/vp9_postproc_x86.h new file mode 100644 index 000000000..b0e8b181f --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_postproc_x86.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + + +#ifndef VP9_COMMON_X86_VP9_POSTPROC_X86_H_ +#define VP9_COMMON_X86_VP9_POSTPROC_X86_H_ + +/* Note: + * + * This platform is commonly built for runtime CPU detection. 
If you modify + * any of the function mappings present in this file, be sure to also update + * them in the function pointer initialization code + */ + +#if HAVE_MMX +extern prototype_postproc_inplace(vp9_mbpost_proc_down_mmx); +extern prototype_postproc(vp9_post_proc_down_and_across_mmx); +extern prototype_postproc_addnoise(vp9_plane_add_noise_mmx); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp9_postproc_down +#define vp9_postproc_down vp9_mbpost_proc_down_mmx + +#undef vp9_postproc_downacross +#define vp9_postproc_downacross vp9_post_proc_down_and_across_mmx + +#undef vp9_postproc_addnoise +#define vp9_postproc_addnoise vp9_plane_add_noise_mmx + +#endif +#endif + + +#if HAVE_SSE2 +extern prototype_postproc_inplace(vp9_mbpost_proc_down_xmm); +extern prototype_postproc_inplace(vp9_mbpost_proc_across_ip_xmm); +extern prototype_postproc(vp9_post_proc_down_and_across_xmm); +extern prototype_postproc_addnoise(vp9_plane_add_noise_wmt); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp9_postproc_down +#define vp9_postproc_down vp9_mbpost_proc_down_xmm + +#undef vp9_postproc_across +#define vp9_postproc_across vp9_mbpost_proc_across_ip_xmm + +#undef vp9_postproc_downacross +#define vp9_postproc_downacross vp9_post_proc_down_and_across_xmm + +#undef vp9_postproc_addnoise +#define vp9_postproc_addnoise vp9_plane_add_noise_wmt + + +#endif +#endif + +#endif diff --git a/libvpx/vp9/common/x86/vp9_recon_mmx.asm b/libvpx/vp9/common/x86/vp9_recon_mmx.asm new file mode 100644 index 000000000..6fbbe48cb --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_recon_mmx.asm @@ -0,0 +1,272 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" +;void copy_mem8x8_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp9_copy_mem8x8_mmx) PRIVATE +sym(vp9_copy_mem8x8_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + add rsi, rax + + movq [rdi+rcx], mm1 + movq [rdi+rcx*2], mm2 + + + lea rdi, [rdi+rcx*2] + movq mm3, [rsi] + + add rdi, rcx + movq mm4, [rsi+rax] + + movq mm5, [rsi+rax*2] + movq [rdi], mm3 + + lea rsi, [rsi+rax*2] + movq [rdi+rcx], mm4 + + movq [rdi+rcx*2], mm5 + lea rdi, [rdi+rcx*2] + + movq mm0, [rsi+rax] + movq mm1, [rsi+rax*2] + + movq [rdi+rcx], mm0 + movq [rdi+rcx*2],mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem8x4_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp9_copy_mem8x4_mmx) PRIVATE +sym(vp9_copy_mem8x4_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movq mm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movq mm1, [rsi+rax] + movq mm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movq [rdi], mm0 + movq [rdi+rcx], mm1 + + movq [rdi+rcx*2], mm2 + lea rdi, [rdi+rcx*2] + + movq mm3, [rsi+rax] + movq [rdi+rcx], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void copy_mem16x16_mmx( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp9_copy_mem16x16_mmx) PRIVATE +sym(vp9_copy_mem16x16_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; 
end prolog + + mov rsi, arg(0) ;src; + movsxd rax, dword ptr arg(1) ;src_stride; + + mov rdi, arg(2) ;dst; + movsxd rcx, dword ptr arg(3) ;dst_stride + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + movq mm1, [rsi+rax] + movq mm4, [rsi+rax+8] + + movq mm2, [rsi+rax*2] + movq mm5, [rsi+rax*2+8] + + lea rsi, [rsi+rax*2] + add rsi, rax + + movq [rdi], mm0 + movq [rdi+8], mm3 + + movq [rdi+rcx], mm1 + movq [rdi+rcx+8], mm4 + + movq [rdi+rcx*2], mm2 + movq [rdi+rcx*2+8], mm5 + + lea rdi, [rdi+rcx*2] + add rdi, rcx + + movq mm0, [rsi] + movq mm3, [rsi+8]; + + 
movq [rdi], mm0 + movq [rdi+8], mm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/libvpx/vp9/common/x86/vp9_recon_sse2.asm b/libvpx/vp9/common/x86/vp9_recon_sse2.asm new file mode 100644 index 000000000..9ee30432a --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_recon_sse2.asm @@ -0,0 +1,572 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" +;void copy_mem16x16_sse2( +; unsigned char *src, +; int src_stride, +; unsigned char *dst, +; int dst_stride +; ) +global sym(vp9_copy_mem16x16_sse2) PRIVATE +sym(vp9_copy_mem16x16_sse2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src; + movdqu xmm0, [rsi] + + movsxd rax, dword ptr arg(1) ;src_stride; + mov rdi, arg(2) ;dst; + + movdqu xmm1, [rsi+rax] + movdqu xmm2, [rsi+rax*2] + + movsxd rcx, dword ptr arg(3) ;dst_stride + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + lea rdi, [rdi+rcx*2] + movdqu xmm3, [rsi] + + add rdi, rcx + movdqu xmm4, [rsi+rax] + + movdqu xmm5, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm3 + add rsi, rax + + movdqa [rdi+rcx], xmm4 + movdqa [rdi+rcx*2],xmm5 + + lea rdi, [rdi+rcx*2] + movdqu xmm0, [rsi] + + add rdi, rcx + movdqu xmm1, [rsi+rax] + + movdqu xmm2, [rsi+rax*2] + lea rsi, [rsi+rax*2] + + movdqa [rdi], xmm0 + add rsi, rax + + movdqa [rdi+rcx], xmm1 + + movdqa [rdi+rcx*2], xmm2 + movdqu xmm3, [rsi] + + movdqu xmm4, [rsi+rax] + lea rdi, [rdi+rcx*2] + + add rdi, rcx + movdqu xmm5, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + 
movdqa [rdi], xmm3 + + add rsi, rax + movdqa [rdi+rcx], xmm4 + + movdqa [rdi+rcx*2],xmm5 + movdqu xmm0, [rsi] + + lea rdi, [rdi+rcx*2] + movdqu xmm1, [rsi+rax] + + add rdi, rcx + movdqu xmm2, [rsi+rax*2] + + lea rsi, [rsi+rax*2] + movdqa [rdi], xmm0 + + movdqa [rdi+rcx], xmm1 + movdqa [rdi+rcx*2],xmm2 + + movdqu xmm3, [rsi+rax] + lea rdi, [rdi+rcx*2] + + movdqa [rdi+rcx], xmm3 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp9_intra_pred_uv_dc_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +global sym(vp9_intra_pred_uv_dc_mmx2) PRIVATE +sym(vp9_intra_pred_uv_dc_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + ; from top + mov rsi, arg(2) ;src; + movsxd rax, dword ptr arg(3) ;src_stride; + sub rsi, rax + pxor mm0, mm0 + movq mm1, [rsi] + psadbw mm1, mm0 + + ; from left + dec rsi + lea rdi, [rax*3] + movzx ecx, byte [rsi+rax] + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + movzx edx, byte [rsi+rax*4] + add ecx, edx + + ; add up + pextrw edx, mm1, 0x0 + lea edx, [edx+ecx+8] + sar edx, 4 + movd mm1, edx + pshufw mm1, mm1, 0x0 + packuswb mm1, mm1 + + ; write out + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + lea rdi, [rdi+rcx*4] + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_intra_pred_uv_dctop_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +global sym(vp9_intra_pred_uv_dctop_mmx2) PRIVATE 
+sym(vp9_intra_pred_uv_dctop_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; from top + mov rsi, arg(2) ;src; + movsxd rax, dword ptr arg(3) ;src_stride; + sub rsi, rax + pxor mm0, mm0 + movq mm1, [rsi] + psadbw mm1, mm0 + + ; add up + paddw mm1, [GLOBAL(dc_4)] + psraw mm1, 3 + pshufw mm1, mm1, 0x0 + packuswb mm1, mm1 + + ; write out + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + lea rdi, [rdi+rcx*4] + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_intra_pred_uv_dcleft_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +global sym(vp9_intra_pred_uv_dcleft_mmx2) PRIVATE +sym(vp9_intra_pred_uv_dcleft_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + ; from left + mov rsi, arg(2) ;src; + movsxd rax, dword ptr arg(3) ;src_stride; + dec rsi + lea rdi, [rax*3] + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + lea edx, [ecx+edx+4] + + ; add up + shr edx, 3 + movd mm1, edx + pshufw mm1, mm1, 0x0 + packuswb mm1, mm1 + + ; write out + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + lea rdi, [rdi+rcx*4] + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + 
pop rbp + ret + +;void vp9_intra_pred_uv_dc128_mmx( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +global sym(vp9_intra_pred_uv_dc128_mmx) PRIVATE +sym(vp9_intra_pred_uv_dc128_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + ; end prolog + + ; write out + movq mm1, [GLOBAL(dc_128)] + mov rax, arg(0) ;dst; + movsxd rdx, dword ptr arg(1) ;dst_stride + lea rcx, [rdx*3] + + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + lea rax, [rax+rdx*4] + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + + ; begin epilog + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_intra_pred_uv_tm_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +%macro vp9_intra_pred_uv_tm 1 +global sym(vp9_intra_pred_uv_tm_%1) PRIVATE +sym(vp9_intra_pred_uv_tm_%1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ; read top row + mov edx, 4 + mov rsi, arg(2) ;src; + movsxd rax, dword ptr arg(3) ;src_stride; + sub rsi, rax + pxor xmm0, xmm0 +%ifidn %1, ssse3 + movdqa xmm2, [GLOBAL(dc_1024)] +%endif + movq xmm1, [rsi] + punpcklbw xmm1, xmm0 + + ; set up left ptrs ans subtract topleft + movd xmm3, [rsi-1] + lea rsi, [rsi+rax-1] +%ifidn %1, sse2 + punpcklbw xmm3, xmm0 + pshuflw xmm3, xmm3, 0x0 + punpcklqdq xmm3, xmm3 +%else + pshufb xmm3, xmm2 +%endif + psubw xmm1, xmm3 + + ; set up dest ptrs + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + +.vp9_intra_pred_uv_tm_%1_loop: + movd xmm3, [rsi] + movd xmm5, [rsi+rax] +%ifidn %1, sse2 + punpcklbw xmm3, xmm0 + punpcklbw xmm5, xmm0 + pshuflw xmm3, xmm3, 0x0 + pshuflw xmm5, xmm5, 0x0 + punpcklqdq xmm3, xmm3 + punpcklqdq xmm5, xmm5 +%else + pshufb xmm3, xmm2 + pshufb xmm5, xmm2 +%endif + paddw xmm3, xmm1 + paddw xmm5, xmm1 + packuswb xmm3, xmm5 + movq [rdi ], xmm3 + movhps[rdi+rcx], xmm3 + lea 
rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz .vp9_intra_pred_uv_tm_%1_loop + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%endmacro + +vp9_intra_pred_uv_tm sse2 +vp9_intra_pred_uv_tm ssse3 + +;void vp9_intra_pred_uv_ve_mmx( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +global sym(vp9_intra_pred_uv_ve_mmx) PRIVATE +sym(vp9_intra_pred_uv_ve_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + ; end prolog + + ; read from top + mov rax, arg(2) ;src; + movsxd rdx, dword ptr arg(3) ;src_stride; + sub rax, rdx + movq mm1, [rax] + + ; write out + mov rax, arg(0) ;dst; + movsxd rdx, dword ptr arg(1) ;dst_stride + lea rcx, [rdx*3] + + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + lea rax, [rax+rdx*4] + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_intra_pred_uv_ho_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +%macro vp9_intra_pred_uv_ho 1 +global sym(vp9_intra_pred_uv_ho_%1) PRIVATE +sym(vp9_intra_pred_uv_ho_%1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi +%ifidn %1, ssse3 +%ifndef GET_GOT_SAVE_ARG + push rbx +%endif + GET_GOT rbx +%endif + ; end prolog + + ; read from left and write out +%ifidn %1, mmx2 + mov edx, 4 +%endif + mov rsi, arg(2) ;src; + movsxd rax, dword ptr arg(3) ;src_stride; + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride +%ifidn %1, ssse3 + lea rdx, [rcx*3] + movdqa xmm2, [GLOBAL(dc_00001111)] + lea rbx, [rax*3] +%endif + dec rsi +%ifidn %1, mmx2 +.vp9_intra_pred_uv_ho_%1_loop: + movd mm0, [rsi] + movd mm1, [rsi+rax] + punpcklbw mm0, mm0 + punpcklbw mm1, mm1 + pshufw mm0, mm0, 0x0 + pshufw mm1, mm1, 0x0 + movq [rdi ], mm0 + movq [rdi+rcx], mm1 + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz 
.vp9_intra_pred_uv_ho_%1_loop +%else + movd xmm0, [rsi] + movd xmm3, [rsi+rax] + movd xmm1, [rsi+rax*2] + movd xmm4, [rsi+rbx] + punpcklbw xmm0, xmm3 + punpcklbw xmm1, xmm4 + pshufb xmm0, xmm2 + pshufb xmm1, xmm2 + movq [rdi ], xmm0 + movhps [rdi+rcx], xmm0 + movq [rdi+rcx*2], xmm1 + movhps [rdi+rdx], xmm1 + lea rsi, [rsi+rax*4] + lea rdi, [rdi+rcx*4] + movd xmm0, [rsi] + movd xmm3, [rsi+rax] + movd xmm1, [rsi+rax*2] + movd xmm4, [rsi+rbx] + punpcklbw xmm0, xmm3 + punpcklbw xmm1, xmm4 + pshufb xmm0, xmm2 + pshufb xmm1, xmm2 + movq [rdi ], xmm0 + movhps [rdi+rcx], xmm0 + movq [rdi+rcx*2], xmm1 + movhps [rdi+rdx], xmm1 +%endif + + ; begin epilog +%ifidn %1, ssse3 + RESTORE_GOT +%ifndef GET_GOT_SAVE_ARG + pop rbx +%endif +%endif + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret +%endmacro + +vp9_intra_pred_uv_ho mmx2 +vp9_intra_pred_uv_ho ssse3 + +SECTION_RODATA +dc_128: + times 8 db 128 +dc_4: + times 4 dw 4 +align 16 +dc_1024: + times 8 dw 0x400 +align 16 +dc_00001111: + times 8 db 0 + times 8 db 1 diff --git a/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c b/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c new file mode 100644 index 000000000..97148fbb8 --- /dev/null +++ b/libvpx/vp9/common/x86/vp9_recon_wrapper_sse2.c @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp9/common/vp9_blockd.h"
+
+/* Common signature shared by the assembly 8x8 chroma intra predictors:
+ * write the prediction into dst, reading reconstructed border pixels
+ * located relative to src (the row above and/or the column to the left). */
+#define build_intra_predictors_mbuv_prototype(sym) \
+  void sym(unsigned char *dst, int dst_stride, \
+           const unsigned char *src, int src_stride)
+typedef build_intra_predictors_mbuv_prototype((*build_intra_pred_mbuv_fn_t));
+
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dctop_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dcleft_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_dc128_mmx);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_mmx2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ho_ssse3);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_ve_mmx);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_sse2);
+extern build_intra_predictors_mbuv_prototype(vp9_intra_pred_uv_tm_ssse3);
+
+/* Select the assembly predictor matching the block's uv_mode (and, for
+ * DC_PRED, the availability of the above/left neighbors), then run it
+ * once for the U plane and once for the V plane.
+ * tm_fn / ho_fn let the caller pick the SSE2 or SSSE3 variant of the
+ * TM_PRED and H_PRED kernels; every other mode has a single variant. */
+static void build_intra_predictors_mbuv_x86(MACROBLOCKD *xd,
+                                            unsigned char *dst_u,
+                                            unsigned char *dst_v,
+                                            int dst_stride,
+                                            build_intra_pred_mbuv_fn_t tm_fn,
+                                            build_intra_pred_mbuv_fn_t ho_fn) {
+  int mode = xd->mode_info_context->mbmi.uv_mode;
+  build_intra_pred_mbuv_fn_t fn;
+  /* NOTE(review): the U-plane stride is used for the V plane too --
+   * assumes both chroma planes share one stride; confirm buffer layout. */
+  int src_stride = xd->plane[1].dst.stride;
+
+  switch (mode) {
+    case V_PRED:
+      fn = vp9_intra_pred_uv_ve_mmx;
+      break;
+    case H_PRED:
+      fn = ho_fn;
+      break;
+    case TM_PRED:
+      fn = tm_fn;
+      break;
+    case DC_PRED:
+      /* DC prediction averages whichever neighbors exist; fall back to
+       * the constant-128 kernel when neither border is available. */
+      if (xd->up_available) {
+        if (xd->left_available) {
+          fn = vp9_intra_pred_uv_dc_mmx2;
+          break;
+        } else {
+          fn = vp9_intra_pred_uv_dctop_mmx2;
+          break;
+        }
+      } else if (xd->left_available) {
+        fn = vp9_intra_pred_uv_dcleft_mmx2;
+        break;
+      } else {
+        fn = vp9_intra_pred_uv_dc128_mmx;
+        break;
+      }
+      break;  /* unreachable: every path above already breaks */
+    default:
+      /* Remaining uv_modes have no x86 kernel here; leave dst untouched. */
+      return;
+  }
+
+  /* Border pixels are read from the reconstructed dst buffers of the
+   * U and V planes, so prediction can run in place. */
+  fn(dst_u, dst_stride, xd->plane[1].dst.buf, src_stride);
+  fn(dst_v, dst_stride, xd->plane[2].dst.buf, src_stride);
+}
+
+/* SSE2 entry point: TM uses the SSE2 kernel, H uses the MMX2 kernel. */
+void vp9_build_intra_predictors_mbuv_sse2(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
+                                  vp9_intra_pred_uv_tm_sse2,
+                                  vp9_intra_pred_uv_ho_mmx2);
+}
+
+/* SSSE3 entry point: TM and H both use their SSSE3 kernels. */
+void vp9_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
+                                  vp9_intra_pred_uv_tm_ssse3,
+                                  vp9_intra_pred_uv_ho_ssse3);
+}
+
+/* NOTE(review): the _s_ variants below are currently identical to the two
+ * functions above -- historically they targeted a separate destination
+ * buffer; confirm the duplication is still intentional. */
+void vp9_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
+                                  vp9_intra_pred_uv_tm_sse2,
+                                  vp9_intra_pred_uv_ho_mmx2);
+}
+
+void vp9_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *xd) {
+  build_intra_predictors_mbuv_x86(xd, xd->plane[1].dst.buf,
+                                  xd->plane[2].dst.buf, xd->plane[1].dst.stride,
+                                  vp9_intra_pred_uv_tm_ssse3,
+                                  vp9_intra_pred_uv_ho_ssse3);
+}
diff --git a/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c b/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c
new file mode 100644
index 000000000..ed873a5ba
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_sadmxn_sse2.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <emmintrin.h> /* SSE2 */
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/emmintrin_compat.h"
+
+/* Sum of absolute differences over a 16-pixel-wide, 3-row block.
+ * Neither pointer is assumed aligned (unaligned loads throughout). */
+unsigned int vp9_sad16x3_sse2(
+  const unsigned char *src_ptr,
+  int src_stride,
+  const unsigned char *ref_ptr,
+  int ref_stride) {
+  __m128i s0, s1, s2;
+  __m128i r0, r1, r2;
+  __m128i sad;
+
+  s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
+  s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
+  s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
+
+  r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride));
+  r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride));
+  r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
+
+  /* psadbw leaves one partial sum per 64-bit half of the register. */
+  sad = _mm_sad_epu8(s0, r0);
+  sad = _mm_add_epi16(sad, _mm_sad_epu8(s1, r1));
+  sad = _mm_add_epi16(sad, _mm_sad_epu8(s2, r2));
+  /* Fold the high 64-bit partial sum into the low one. */
+  sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
+
+  return _mm_cvtsi128_si32(sad);
+}
+
+/* Sum of absolute differences over a 3-pixel-wide, 16-row block.
+ * Rows are fetched four at a time with 32-bit loads; the surplus fourth
+ * byte of each row is masked/shifted away before psadbw. */
+unsigned int vp9_sad3x16_sse2(
+  const unsigned char *src_ptr,
+  int src_stride,
+  const unsigned char *ref_ptr,
+  int ref_stride) {
+  int r;
+  __m128i s0, s1, s2, s3;
+  __m128i r0, r1, r2, r3;
+  __m128i sad = _mm_setzero_si128();
+  __m128i mask;
+  /* Low two address bits of src; only 0 and 1 are expected here --
+   * see the note below. */
+  const int offset = (uintptr_t)src_ptr & 3;
+
+  /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off.
+   * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd
+   * takes much less time. 
+   */
+  if (offset == 1)
+    src_ptr -= 1;
+
+  /* mask = 0xffffffffffff0000ffffffffffff0000 */
+  mask = _mm_cmpeq_epi32(sad, sad);  /* all-ones */
+  mask = _mm_slli_epi64(mask, 16);   /* clear low 16 bits of each qword */
+
+  for (r = 0; r < 16; r += 4) {
+    s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
+    s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
+    s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
+    s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
+    r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
+    r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
+    r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
+    r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));
+
+    /* Interleave the four rows of each side into a single register. */
+    s0 = _mm_unpacklo_epi8(s0, s1);
+    r0 = _mm_unpacklo_epi8(r0, r1);
+    s2 = _mm_unpacklo_epi8(s2, s3);
+    r2 = _mm_unpacklo_epi8(r2, r3);
+    s0 = _mm_unpacklo_epi64(s0, s2);
+    r0 = _mm_unpacklo_epi64(r0, r2);
+
+    // throw out extra byte
+    if (offset == 1)
+      s0 = _mm_and_si128(s0, mask);   /* src was re-aligned one byte early:
+                                       * zero the leading column instead */
+    else
+      s0 = _mm_slli_epi64(s0, 16);
+    /* ref is shifted on both paths so its surviving byte lanes line up
+     * with src's for psadbw. */
+    r0 = _mm_slli_epi64(r0, 16);
+
+    sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));
+
+    src_ptr += src_stride*4;
+    ref_ptr += ref_stride*4;
+  }
+
+  /* Fold the high 64-bit partial sum into the low one. */
+  sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
+  return _mm_cvtsi128_si32(sad);
+}
diff --git a/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
new file mode 100644
index 000000000..bbf9888ca
--- /dev/null
+++ b/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -0,0 +1,1011 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree. 
+; + + +%include "vpx_ports/x86_abi_support.asm" + +;/************************************************************************************ +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The +; input pixel array has output_height rows. This routine assumes that output_height is an +; even number. This function handles 8 pixels in horizontal direction, calculating ONE +; rows each iteration to take advantage of the 128 bits operations. +; +; This is an implementation of some of the SSE optimizations first seen in ffvp8 +; +;*************************************************************************************/ + + +%macro VERTx4 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.loop: + movd xmm0, [rsi] ;A + movd xmm1, [rsi + rdx] ;B + movd xmm2, [rsi + rdx * 2] ;C + movd xmm3, [rax + rdx * 2] ;D + movd xmm4, [rsi + rdx * 4] ;E + movd xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movd xmm6, [rsi + rbx] ;G + movd xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm6 + paddsw xmm0, xmm2 + paddsw xmm0, xmm4 + 
paddsw xmm0, krd + + psraw xmm0, 7 + packuswb xmm0, xmm0 + + add rsi, rdx + add rax, rdx +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movd [rdi], xmm0 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .loop +%endm + +%macro VERTx8 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movq xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.loop: + movq xmm0, [rsi] ;A + movq xmm1, [rsi + rdx] ;B + movq xmm2, [rsi + rdx * 2] ;C + movq xmm3, [rax + rdx * 2] ;D + movq xmm4, [rsi + rdx * 4] ;E + movq xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movq xmm6, [rsi + rbx] ;G + movq xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm6 + paddsw xmm0, xmm2 + paddsw xmm0, xmm4 + paddsw xmm0, krd + + psraw xmm0, 7 + packuswb xmm0, xmm0 + + add rsi, rdx + add rax, rdx +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .loop +%endm + + +%macro VERTx16 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) 
;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movq xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rdx, DWORD PTR arg(1) ;pixels_per_line + +%if ABI_IS_32BIT=0 + movsxd r8, DWORD PTR arg(3) ;out_pitch +%endif + mov rax, rsi + movsxd rcx, DWORD PTR arg(4) ;output_height + add rax, rdx + + lea rbx, [rdx + rdx*4] + add rbx, rdx ;pitch * 6 + +.loop: + movq xmm0, [rsi] ;A + movq xmm1, [rsi + rdx] ;B + movq xmm2, [rsi + rdx * 2] ;C + movq xmm3, [rax + rdx * 2] ;D + movq xmm4, [rsi + rdx * 4] ;E + movq xmm5, [rax + rdx * 4] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + movq xmm6, [rsi + rbx] ;G + movq xmm7, [rax + rbx] ;H + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + punpcklbw xmm6, xmm7 ;G H + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm6 + paddsw xmm0, xmm2 + paddsw xmm0, xmm4 + paddsw xmm0, krd + + psraw xmm0, 7 + packuswb xmm0, xmm0 +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + movq [rdi], xmm0 + + movq xmm0, [rsi + 8] ;A + movq xmm1, [rsi + rdx + 8] ;B + movq xmm2, [rsi + rdx * 2 + 8] ;C + movq xmm3, [rax + rdx * 2 + 8] ;D + movq xmm4, [rsi + rdx * 4 + 8] ;E + movq xmm5, [rax + rdx * 4 + 8] ;F + + punpcklbw xmm0, xmm1 ;A B + punpcklbw xmm2, xmm3 ;C D + punpcklbw xmm4, xmm5 ;E F + + + movq xmm6, [rsi + rbx + 8] ;G + movq xmm7, [rax + rbx + 8] ;H + punpcklbw xmm6, xmm7 ;G H + + + pmaddubsw xmm0, k0k1 + pmaddubsw xmm2, k2k3 + pmaddubsw xmm4, k4k5 + pmaddubsw xmm6, k6k7 + + paddsw xmm0, xmm6 + paddsw xmm0, xmm2 + paddsw xmm0, xmm4 + paddsw xmm0, krd + + psraw xmm0, 7 + packuswb xmm0, xmm0 + + add rsi, rdx + 
add rax, rdx +%if %1 + movq xmm1, [rdi+8] + pavgb xmm0, xmm1 +%endif + + movq [rdi+8], xmm0 + +%if ABI_IS_32BIT + add rdi, DWORD PTR arg(3) ;out_pitch +%else + add rdi, r8 +%endif + dec rcx + jnz .loop +%endm + +;void vp9_filter_block1d8_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE +sym(vp9_filter_block1d4_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx4 0 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE +sym(vp9_filter_block1d8_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx8 0 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE +sym(vp9_filter_block1d16_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 
+ push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx16 0 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx4 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx8 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx16 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + 
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +%macro HORIZx4 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movq xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height + +.loop: + movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + + movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 + punpcklqdq xmm0, xmm3 + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm0, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + paddsw xmm0, krd + psraw xmm0, 7 + packuswb xmm0, xmm0 +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + lea rsi, [rsi + rax] + movd [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .loop +%endm + +%macro HORIZx8 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movd xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa 
k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height + +.loop: + movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + + movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 + punpcklqdq xmm0, xmm3 + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm0, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + paddsw xmm0, krd + psraw xmm0, 7 + packuswb xmm0, xmm0 +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + lea rsi, [rsi + rax] + movq [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .loop +%endm + +%macro HORIZx16 1 + mov rdx, arg(5) ;filter ptr + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;output_ptr + mov rcx, 0x0400040 + + movdqa xmm4, [rdx] ;load filters + movq xmm5, rcx + packsswb xmm4, xmm4 + pshuflw xmm0, xmm4, 0b ;k0_k1 + pshuflw xmm1, xmm4, 01010101b ;k2_k3 + pshuflw xmm2, xmm4, 10101010b ;k4_k5 + pshuflw xmm3, xmm4, 11111111b ;k6_k7 + + punpcklqdq xmm0, xmm0 + punpcklqdq xmm1, xmm1 + punpcklqdq xmm2, xmm2 + punpcklqdq xmm3, xmm3 + + movdqa k0k1, xmm0 + movdqa k2k3, xmm1 + pshufd xmm5, xmm5, 0 + movdqa k4k5, xmm2 + movdqa k6k7, xmm3 + movdqa krd, xmm5 + + movsxd rax, dword ptr arg(1) ;src_pixels_per_line + movsxd rdx, dword ptr arg(3) ;output_pitch + movsxd rcx, dword ptr arg(4) ;output_height + +.loop: + movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 + + movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 + punpcklqdq xmm0, xmm3 + + movdqa xmm1, xmm0 + pshufb xmm0, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm0, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, 
[GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm0, xmm1 + paddsw xmm0, xmm4 + paddsw xmm0, xmm2 + paddsw xmm0, krd + psraw xmm0, 7 + packuswb xmm0, xmm0 + + + movq xmm3, [rsi + 5] + movq xmm7, [rsi + 13] + punpcklqdq xmm3, xmm7 + + movdqa xmm1, xmm3 + pshufb xmm3, [GLOBAL(shuf_t0t1)] + pmaddubsw xmm3, k0k1 + + movdqa xmm2, xmm1 + pshufb xmm1, [GLOBAL(shuf_t2t3)] + pmaddubsw xmm1, k2k3 + + movdqa xmm4, xmm2 + pshufb xmm2, [GLOBAL(shuf_t4t5)] + pmaddubsw xmm2, k4k5 + + pshufb xmm4, [GLOBAL(shuf_t6t7)] + pmaddubsw xmm4, k6k7 + + paddsw xmm3, xmm1 + paddsw xmm3, xmm4 + paddsw xmm3, xmm2 + paddsw xmm3, krd + psraw xmm3, 7 + packuswb xmm3, xmm3 + punpcklqdq xmm0, xmm3 +%if %1 + movdqa xmm1, [rdi] + pavgb xmm0, xmm1 +%endif + + lea rsi, [rsi + rax] + movdqa [rdi], xmm0 + + lea rdi, [rdi + rdx] + dec rcx + jnz .loop +%endm + +;void vp9_filter_block1d4_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE +sym(vp9_filter_block1d4_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx4 0 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d8_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE +sym(vp9_filter_block1d8_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + 
ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx8 0 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE +sym(vp9_filter_block1d16_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx16 0 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx4 1 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx8 1 + + 
add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx16 1 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret +SECTION_RODATA +align 16 +shuf_t0t1: + db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 +align 16 +shuf_t2t3: + db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 +align 16 +shuf_t4t5: + db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 +align 16 +shuf_t6t7: + db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 |